rebase: update K8s packages to v0.32.1

Update K8s packages in go.mod to v0.32.1

Signed-off-by: Praveen M <m.praveen@ibm.com>
Author: Praveen M
Date: 2025-01-16 09:41:46 +05:30
Committed by: mergify[bot]
Parent: 5aef21ea4e
Commit: 7eb99fc6c9
2442 changed files with 273386 additions and 47788 deletions


@@ -0,0 +1,123 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"errors"
"sync"
"k8s.io/apimachinery/pkg/util/sets"
)
var (
// ErrNotFound is the not found error message.
ErrNotFound = errors.New("not found")
)
// StateData is a generic type for arbitrary data stored in CycleState.
type StateData interface {
// Clone is an interface to make a copy of StateData. For performance reasons,
// clone should make shallow copies for members (e.g., slices or maps) that are not
// impacted by PreFilter's optional AddPod/RemovePod methods.
Clone() StateData
}
// StateKey is the type of keys stored in CycleState.
type StateKey string
// CycleState provides a mechanism for plugins to store and retrieve arbitrary data.
// StateData stored by one plugin can be read, altered, or deleted by another plugin.
// CycleState does not provide any data protection, as all plugins are assumed to be
// trusted.
// Note: CycleState uses a sync.Map to back the storage because it is thread safe and optimized for the "write once, read many times" scenario.
// That is the recommended pattern used by all in-tree plugins - plugin-specific state is written once in PreFilter/PreScore and afterwards read many times in Filter/Score.
type CycleState struct {
// storage is keyed with StateKey, and valued with StateData.
storage sync.Map
// if recordPluginMetrics is true, metrics.PluginExecutionDuration will be recorded for this cycle.
recordPluginMetrics bool
// SkipFilterPlugins are plugins that will be skipped in the Filter extension point.
SkipFilterPlugins sets.Set[string]
// SkipScorePlugins are plugins that will be skipped in the Score extension point.
SkipScorePlugins sets.Set[string]
}
// NewCycleState initializes a new CycleState and returns its pointer.
func NewCycleState() *CycleState {
return &CycleState{}
}
// ShouldRecordPluginMetrics returns whether metrics.PluginExecutionDuration metrics should be recorded.
func (c *CycleState) ShouldRecordPluginMetrics() bool {
if c == nil {
return false
}
return c.recordPluginMetrics
}
// SetRecordPluginMetrics sets recordPluginMetrics to the given value.
func (c *CycleState) SetRecordPluginMetrics(flag bool) {
if c == nil {
return
}
c.recordPluginMetrics = flag
}
// Clone creates a copy of CycleState and returns its pointer. Clone returns
// nil if the CycleState being cloned is nil.
func (c *CycleState) Clone() *CycleState {
if c == nil {
return nil
}
copy := NewCycleState()
// Safe copy storage in case of overwriting.
c.storage.Range(func(k, v interface{}) bool {
copy.storage.Store(k, v.(StateData).Clone())
return true
})
// The below are not mutated, so we don't have to safe copy.
copy.recordPluginMetrics = c.recordPluginMetrics
copy.SkipFilterPlugins = c.SkipFilterPlugins
copy.SkipScorePlugins = c.SkipScorePlugins
return copy
}
// Read retrieves data with the given "key" from CycleState. If the key is not
// present, ErrNotFound is returned.
//
// See CycleState for notes on concurrency.
func (c *CycleState) Read(key StateKey) (StateData, error) {
if v, ok := c.storage.Load(key); ok {
return v.(StateData), nil
}
return nil, ErrNotFound
}
// Write stores the given "val" in CycleState with the given "key".
//
// See CycleState for notes on concurrency.
func (c *CycleState) Write(key StateKey, val StateData) {
c.storage.Store(key, val)
}
// Delete deletes data with the given key from CycleState.
//
// See CycleState for notes on concurrency.
func (c *CycleState) Delete(key StateKey) {
c.storage.Delete(key)
}
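
The write-once/read-many pattern described above looks roughly like the following sketch, assuming it lives in the same framework package; the examplePluginState type and exampleStateKey below are hypothetical, not part of the file.

// examplePluginState is a hypothetical piece of plugin-specific state.
type examplePluginState struct {
	matchingNodes sets.Set[string]
}

// Clone implements StateData. Nothing in this sketch mutates the set via
// AddPod/RemovePod, so returning the same pointer (a shallow copy) is enough.
func (s *examplePluginState) Clone() StateData { return s }

const exampleStateKey StateKey = "example.com/matching-nodes"

// writeThenRead stores state once (as PreFilter typically would) and reads it
// back (as Filter would). Read returns ErrNotFound if the key was never written.
func writeThenRead(cs *CycleState) (*examplePluginState, error) {
	cs.Write(exampleStateKey, &examplePluginState{matchingNodes: sets.New("node-a")})
	v, err := cs.Read(exampleStateKey)
	if err != nil {
		return nil, err
	}
	return v.(*examplePluginState), nil
}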


@@ -0,0 +1,229 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/resource"
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/kubernetes/pkg/features"
)
// Special event labels.
const (
// ScheduleAttemptFailure is the event when a schedule attempt fails.
ScheduleAttemptFailure = "ScheduleAttemptFailure"
// BackoffComplete is the event when a pod finishes backoff.
BackoffComplete = "BackoffComplete"
// ForceActivate is the event when a pod is moved from unschedulablePods/backoffQ
// to activeQ. Usually it's triggered by plugin implementations.
ForceActivate = "ForceActivate"
// UnschedulableTimeout is the event when a pod is moved from unschedulablePods
// due to the timeout specified at pod-max-in-unschedulable-pods-duration.
UnschedulableTimeout = "UnschedulableTimeout"
)
var (
// EventAssignedPodAdd is the event when an assigned pod is added.
EventAssignedPodAdd = ClusterEvent{Resource: assignedPod, ActionType: Add}
// EventAssignedPodUpdate is the event when an assigned pod is updated.
EventAssignedPodUpdate = ClusterEvent{Resource: assignedPod, ActionType: Update}
// EventAssignedPodDelete is the event when an assigned pod is deleted.
EventAssignedPodDelete = ClusterEvent{Resource: assignedPod, ActionType: Delete}
// EventUnscheduledPodAdd is the event when an unscheduled pod is added.
EventUnscheduledPodAdd = ClusterEvent{Resource: unschedulablePod, ActionType: Add}
// EventUnscheduledPodUpdate is the event when an unscheduled pod is updated.
EventUnscheduledPodUpdate = ClusterEvent{Resource: unschedulablePod, ActionType: Update}
// EventUnscheduledPodDelete is the event when an unscheduled pod is deleted.
EventUnscheduledPodDelete = ClusterEvent{Resource: unschedulablePod, ActionType: Delete}
// EventUnschedulableTimeout is the event when a pod stays in unschedulable for longer than timeout.
EventUnschedulableTimeout = ClusterEvent{Resource: WildCard, ActionType: All, label: UnschedulableTimeout}
// EventForceActivate is the event when a pod is moved from unschedulablePods/backoffQ to activeQ.
EventForceActivate = ClusterEvent{Resource: WildCard, ActionType: All, label: ForceActivate}
)
// PodSchedulingPropertiesChange interprets the update of a pod and returns corresponding UpdatePodXYZ event(s).
// Once we have other pod update events, this function should be updated as well.
func PodSchedulingPropertiesChange(newPod *v1.Pod, oldPod *v1.Pod) (events []ClusterEvent) {
r := assignedPod
if newPod.Spec.NodeName == "" {
r = unschedulablePod
}
podChangeExtracters := []podChangeExtractor{
extractPodLabelsChange,
extractPodScaleDown,
extractPodSchedulingGateEliminatedChange,
extractPodTolerationChange,
}
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
podChangeExtracters = append(podChangeExtracters, extractPodGeneratedResourceClaimChange)
}
for _, fn := range podChangeExtracters {
if event := fn(newPod, oldPod); event != none {
events = append(events, ClusterEvent{Resource: r, ActionType: event})
}
}
if len(events) == 0 {
// When no specific event is found, we use AssignedPodOtherUpdate,
// which should only trigger plugins registering a general Pod/Update event.
events = append(events, ClusterEvent{Resource: r, ActionType: updatePodOther})
}
return
}
type podChangeExtractor func(newPod *v1.Pod, oldPod *v1.Pod) ActionType
// extractPodScaleDown interprets the update of a pod and returns an UpdatePodScaleDown event if any of the pod's resource requests is scaled down.
func extractPodScaleDown(newPod, oldPod *v1.Pod) ActionType {
opt := resource.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
}
newPodRequests := resource.PodRequests(newPod, opt)
oldPodRequests := resource.PodRequests(oldPod, opt)
for rName, oldReq := range oldPodRequests {
newReq, ok := newPodRequests[rName]
if !ok {
// The resource request of rName is removed.
return UpdatePodScaleDown
}
if oldReq.MilliValue() > newReq.MilliValue() {
// The resource request of rName is scaled down.
return UpdatePodScaleDown
}
}
return none
}
func extractPodLabelsChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
if isLabelChanged(newPod.GetLabels(), oldPod.GetLabels()) {
return UpdatePodLabel
}
return none
}
func extractPodTolerationChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
if len(newPod.Spec.Tolerations) != len(oldPod.Spec.Tolerations) {
// A Pod got a new toleration.
// Due to API validation, the user can add, but cannot modify or remove tolerations.
// So, it's enough to just check the length of tolerations to notice the update.
// And, any updates in tolerations could make Pod schedulable.
return UpdatePodTolerations
}
return none
}
func extractPodSchedulingGateEliminatedChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
if len(newPod.Spec.SchedulingGates) == 0 && len(oldPod.Spec.SchedulingGates) != 0 {
// A scheduling gate on the pod is completely removed.
return UpdatePodSchedulingGatesEliminated
}
return none
}
func extractPodGeneratedResourceClaimChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
if !resourceclaim.PodStatusEqual(newPod.Status.ResourceClaimStatuses, oldPod.Status.ResourceClaimStatuses) {
return UpdatePodGeneratedResourceClaim
}
return none
}
// NodeSchedulingPropertiesChange interprets the update of a node and returns corresponding UpdateNodeXYZ event(s).
func NodeSchedulingPropertiesChange(newNode *v1.Node, oldNode *v1.Node) (events []ClusterEvent) {
nodeChangeExtracters := []nodeChangeExtractor{
extractNodeSpecUnschedulableChange,
extractNodeAllocatableChange,
extractNodeLabelsChange,
extractNodeTaintsChange,
extractNodeConditionsChange,
extractNodeAnnotationsChange,
}
for _, fn := range nodeChangeExtracters {
if event := fn(newNode, oldNode); event != none {
events = append(events, ClusterEvent{Resource: Node, ActionType: event})
}
}
return
}
type nodeChangeExtractor func(newNode *v1.Node, oldNode *v1.Node) ActionType
func extractNodeAllocatableChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if !equality.Semantic.DeepEqual(oldNode.Status.Allocatable, newNode.Status.Allocatable) {
return UpdateNodeAllocatable
}
return none
}
func extractNodeLabelsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if isLabelChanged(newNode.GetLabels(), oldNode.GetLabels()) {
return UpdateNodeLabel
}
return none
}
func isLabelChanged(newLabels map[string]string, oldLabels map[string]string) bool {
return !equality.Semantic.DeepEqual(newLabels, oldLabels)
}
func extractNodeTaintsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if !equality.Semantic.DeepEqual(newNode.Spec.Taints, oldNode.Spec.Taints) {
return UpdateNodeTaint
}
return none
}
func extractNodeConditionsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
strip := func(conditions []v1.NodeCondition) map[v1.NodeConditionType]v1.ConditionStatus {
conditionStatuses := make(map[v1.NodeConditionType]v1.ConditionStatus, len(conditions))
for i := range conditions {
conditionStatuses[conditions[i].Type] = conditions[i].Status
}
return conditionStatuses
}
if !equality.Semantic.DeepEqual(strip(oldNode.Status.Conditions), strip(newNode.Status.Conditions)) {
return UpdateNodeCondition
}
return none
}
func extractNodeSpecUnschedulableChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if newNode.Spec.Unschedulable != oldNode.Spec.Unschedulable && !newNode.Spec.Unschedulable {
// TODO: create UpdateNodeSpecUnschedulable ActionType
return UpdateNodeTaint
}
return none
}
func extractNodeAnnotationsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if !equality.Semantic.DeepEqual(oldNode.GetAnnotations(), newNode.GetAnnotations()) {
return UpdateNodeAnnotation
}
return none
}
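
As a quick illustration of the extractor pattern above, the hypothetical helper below (assumed to sit in the same framework package) shows how a label-only node update is interpreted: NodeSchedulingPropertiesChange is expected to return a single event with Resource: Node and ActionType: UpdateNodeLabel.

// exampleLabelChangeEvents is a hypothetical helper, not part of this file.
func exampleLabelChangeEvents() []ClusterEvent {
	oldNode := &v1.Node{}
	oldNode.Name = "n1"
	oldNode.Labels = map[string]string{"topology.kubernetes.io/zone": "a"}

	newNode := oldNode.DeepCopy()
	newNode.Labels["topology.kubernetes.io/zone"] = "b"

	// Only extractNodeLabelsChange fires, so exactly one UpdateNodeLabel event is expected.
	return NodeSchedulingPropertiesChange(newNode, oldNode)
}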


@@ -0,0 +1,79 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
v1 "k8s.io/api/core/v1"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
)
// Extender is an interface for external processes to influence scheduling
// decisions made by Kubernetes. This is typically needed for resources not directly
// managed by Kubernetes.
type Extender interface {
// Name returns a unique name that identifies the extender.
Name() string
// Filter based on extender-implemented predicate functions. The filtered list is
// expected to be a subset of the supplied list.
// The failedNodesMap and failedAndUnresolvable results optionally contain the list
// of failed nodes and failure reasons, with the distinction that nodes in the latter
// are unresolvable.
Filter(pod *v1.Pod, nodes []*NodeInfo) (filteredNodes []*NodeInfo, failedNodesMap extenderv1.FailedNodesMap, failedAndUnresolvable extenderv1.FailedNodesMap, err error)
// Prioritize based on extender-implemented priority functions. The returned scores & weight
// are used to compute the weighted score for an extender. The weighted scores are added to
// the scores computed by Kubernetes scheduler. The total scores are used to do the host selection.
Prioritize(pod *v1.Pod, nodes []*NodeInfo) (hostPriorities *extenderv1.HostPriorityList, weight int64, err error)
// Bind delegates the action of binding a pod to a node to the extender.
Bind(binding *v1.Binding) error
// IsBinder returns whether this extender is configured for the Bind method.
IsBinder() bool
// IsInterested returns true if at least one extended resource requested by
// this pod is managed by this extender.
IsInterested(pod *v1.Pod) bool
// IsPrioritizer returns whether this extender is configured for the Prioritize method.
IsPrioritizer() bool
// IsFilter returns whether this extender is configured for the Filter method.
IsFilter() bool
// ProcessPreemption returns nodes with their victim pods processed by extender based on
// given:
// 1. Pod to schedule
// 2. Candidate nodes and victim pods (nodeNameToVictims) generated by previous scheduling process.
// The possible changes made by the extender may include:
// 1. A subset of the given candidate nodes after the extender's preemption phase.
// 2. A different set of victim pods for each given candidate node after the extender's preemption phase.
ProcessPreemption(
pod *v1.Pod,
nodeNameToVictims map[string]*extenderv1.Victims,
nodeInfos NodeInfoLister,
) (map[string]*extenderv1.Victims, error)
// SupportsPreemption returns whether the scheduler extender supports preemption.
SupportsPreemption() bool
// IsIgnorable returns true to indicate that scheduling should not fail when this extender
// is unavailable. This gives the scheduler the ability to fail fast and tolerate non-critical extenders.
// Both Filter and Bind actions are supported.
IsIgnorable() bool
}
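
For reference, a minimal sketch of an implementation of this interface could look like the following, assuming it lives in the same framework package. The noopExtender type is hypothetical: it passes every node through, scores nothing, declines to bind, and asks to be ignored when unavailable.

// noopExtender is a hypothetical, do-nothing Extender.
type noopExtender struct{}

var _ Extender = &noopExtender{}

func (e *noopExtender) Name() string { return "noop-extender" }

// Filter passes every node through and reports no failures.
func (e *noopExtender) Filter(pod *v1.Pod, nodes []*NodeInfo) ([]*NodeInfo, extenderv1.FailedNodesMap, extenderv1.FailedNodesMap, error) {
	return nodes, extenderv1.FailedNodesMap{}, extenderv1.FailedNodesMap{}, nil
}

// Prioritize returns an empty priority list with zero weight, so it never
// influences the final score.
func (e *noopExtender) Prioritize(pod *v1.Pod, nodes []*NodeInfo) (*extenderv1.HostPriorityList, int64, error) {
	return &extenderv1.HostPriorityList{}, 0, nil
}

func (e *noopExtender) Bind(binding *v1.Binding) error { return nil }
func (e *noopExtender) IsBinder() bool                 { return false }
func (e *noopExtender) IsInterested(pod *v1.Pod) bool  { return false }
func (e *noopExtender) IsPrioritizer() bool            { return false }
func (e *noopExtender) IsFilter() bool                 { return false }

// ProcessPreemption leaves the candidate/victim map untouched.
func (e *noopExtender) ProcessPreemption(pod *v1.Pod, nodeNameToVictims map[string]*extenderv1.Victims, nodeInfos NodeInfoLister) (map[string]*extenderv1.Victims, error) {
	return nodeNameToVictims, nil
}

func (e *noopExtender) SupportsPreemption() bool { return false }
func (e *noopExtender) IsIgnorable() bool        { return true }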


@@ -0,0 +1,954 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// This file defines the scheduling framework plugin interfaces.
package framework
import (
"context"
"errors"
"math"
"strings"
"sync"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/events"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
)
// NodeScoreList declares a list of nodes and their scores.
type NodeScoreList []NodeScore
// NodeScore is a struct with node name and score.
type NodeScore struct {
Name string
Score int64
}
// NodeToStatusReader is a read-only interface of NodeToStatus passed to each PostFilter plugin.
type NodeToStatusReader interface {
// Get returns the status for given nodeName.
// If the node is not in the map, the AbsentNodesStatus is returned.
Get(nodeName string) *Status
// NodesForStatusCode returns a list of NodeInfos for the nodes that have a given status code.
// It returns the NodeInfos for all matching nodes denoted by AbsentNodesStatus as well.
NodesForStatusCode(nodeLister NodeInfoLister, code Code) ([]*NodeInfo, error)
}
// NodeToStatusMap is an alias for NodeToStatusReader to keep partial backwards compatibility.
// NodeToStatusReader should be used if possible.
type NodeToStatusMap = NodeToStatusReader
// NodeToStatus contains the statuses of the Nodes where the incoming Pod was not schedulable.
type NodeToStatus struct {
// nodeToStatus contains specific statuses of the nodes.
nodeToStatus map[string]*Status
// absentNodesStatus defines a status for all nodes that are absent in nodeToStatus map.
// By default, all absent nodes are UnschedulableAndUnresolvable.
absentNodesStatus *Status
}
// NewDefaultNodeToStatus creates NodeToStatus without any node in the map.
// The absentNodesStatus is set by default to UnschedulableAndUnresolvable.
func NewDefaultNodeToStatus() *NodeToStatus {
return NewNodeToStatus(make(map[string]*Status), NewStatus(UnschedulableAndUnresolvable))
}
// NewNodeToStatus creates NodeToStatus initialized with given nodeToStatus and absentNodesStatus.
func NewNodeToStatus(nodeToStatus map[string]*Status, absentNodesStatus *Status) *NodeToStatus {
return &NodeToStatus{
nodeToStatus: nodeToStatus,
absentNodesStatus: absentNodesStatus,
}
}
// Get returns the status for given nodeName. If the node is not in the map, the absentNodesStatus is returned.
func (m *NodeToStatus) Get(nodeName string) *Status {
if status, ok := m.nodeToStatus[nodeName]; ok {
return status
}
return m.absentNodesStatus
}
// Set sets status for given nodeName.
func (m *NodeToStatus) Set(nodeName string, status *Status) {
m.nodeToStatus[nodeName] = status
}
// Len returns the length of the nodeToStatus map. It is not aware of the number of absent nodes.
func (m *NodeToStatus) Len() int {
return len(m.nodeToStatus)
}
// AbsentNodesStatus returns absentNodesStatus value.
func (m *NodeToStatus) AbsentNodesStatus() *Status {
return m.absentNodesStatus
}
// SetAbsentNodesStatus sets absentNodesStatus value.
func (m *NodeToStatus) SetAbsentNodesStatus(status *Status) {
m.absentNodesStatus = status
}
// ForEachExplicitNode runs fn for each node whose status is explicitly set.
// Important note: it runs fn only for nodes with an explicitly registered status,
// and hence may not run fn for all existing nodes.
// For example, if PreFilter rejects all Nodes, the scheduler does NOT set a failure status on every Node,
// but sets a single failure status as the AbsentNodesStatus.
// Callers should therefore also consult AbsentNodesStatus() and treat every node not visited by fn as rejected with that status.
func (m *NodeToStatus) ForEachExplicitNode(fn func(nodeName string, status *Status)) {
for nodeName, status := range m.nodeToStatus {
fn(nodeName, status)
}
}
// NodesForStatusCode returns a list of NodeInfos for the nodes that match a given status code.
// If the absentNodesStatus matches the code, all existing nodes are fetched using nodeLister
// and filtered using NodeToStatus.Get.
// If the absentNodesStatus doesn't match the code, nodeToStatus map is used to create a list of nodes
// and nodeLister.Get is used to obtain NodeInfo for each.
func (m *NodeToStatus) NodesForStatusCode(nodeLister NodeInfoLister, code Code) ([]*NodeInfo, error) {
var resultNodes []*NodeInfo
if m.AbsentNodesStatus().Code() == code {
allNodes, err := nodeLister.List()
if err != nil {
return nil, err
}
if m.Len() == 0 {
// All nodes are absent and the status code matches, so we can return all nodes.
return allNodes, nil
}
// Otherwise, walk allNodes and keep those whose status (explicit or absent) matches the code.
for _, node := range allNodes {
nodeName := node.Node().Name
if status := m.Get(nodeName); status.Code() == code {
resultNodes = append(resultNodes, node)
}
}
return resultNodes, nil
}
m.ForEachExplicitNode(func(nodeName string, status *Status) {
if status.Code() == code {
if nodeInfo, err := nodeLister.Get(nodeName); err == nil {
resultNodes = append(resultNodes, nodeInfo)
}
}
})
return resultNodes, nil
}
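// The following is a minimal sketch (not part of the original file) of the
// fallback behavior documented above: nodes that were never Set() explicitly all
// resolve to the absent-nodes status.
func exampleNodeToStatus() (explicit, absent Code) {
	m := NewDefaultNodeToStatus()
	m.Set("node-a", NewStatus(Unschedulable, "too many pods"))

	explicit = m.Get("node-a").Code() // Unschedulable
	absent = m.Get("node-b").Code()   // UnschedulableAndUnresolvable (the default absentNodesStatus)
	return explicit, absent
}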
// NodePluginScores is a struct with node name and scores for that node.
type NodePluginScores struct {
// Name is node name.
Name string
// Scores is scores from plugins and extenders.
Scores []PluginScore
// TotalScore is the total score in Scores.
TotalScore int64
}
// PluginScore is a struct with plugin/extender name and score.
type PluginScore struct {
// Name is the name of plugin or extender.
Name string
Score int64
}
// Code is the Status code/type which is returned from plugins.
type Code int
// These are predefined codes used in a Status.
// Note: when you add a new status, you have to add it in `codes` slice below.
const (
// Success means that plugin ran correctly and found pod schedulable.
// NOTE: A nil status is also considered as "Success".
Success Code = iota
// Error is one of the failures, used for internal plugin errors, unexpected input, etc.
// A plugin shouldn't return this code for expected failures, like Unschedulable.
// Since it's an unexpected failure, the scheduling queue registers the pod without any unschedulable plugins,
// meaning the Pod will be requeued to activeQ/backoffQ soon.
Error
// Unschedulable is one of the failures, used when a plugin finds a pod unschedulable.
// If it's returned from PreFilter or Filter, the scheduler might attempt to
// run other postFilter plugins like preemption to get this pod scheduled.
// Use UnschedulableAndUnresolvable to make the scheduler skip other postFilter plugins.
// The accompanying status message should explain why the pod is unschedulable.
//
// We regard the backoff as a penalty for wasting the scheduling cycle.
// When the scheduling queue requeues a Pod that was rejected with Unschedulable in the last scheduling cycle,
// the Pod goes through backoff.
Unschedulable
// UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and
// other postFilter plugins like preemption would not change anything.
// See the comment on PostFilter interface for more details about how PostFilter should handle this status.
// Plugins should return Unschedulable if it is possible that the pod can get scheduled
// after running other postFilter plugins.
// The accompanying status message should explain why the pod is unschedulable.
//
// We regard the backoff as a penalty for wasting the scheduling cycle.
// When the scheduling queue requeues a Pod that was rejected with UnschedulableAndUnresolvable in the last scheduling cycle,
// the Pod goes through backoff.
UnschedulableAndUnresolvable
// Wait is used when a Permit plugin finds a pod scheduling should wait.
Wait
// Skip is used in the following scenarios:
// - when a Bind plugin chooses to skip binding.
// - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped.
// - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped.
Skip
// Pending means that the scheduling process is finished successfully,
// but the plugin wants to stop the scheduling cycle/binding cycle here.
//
// For example, the DRA plugin sometimes needs to wait for the external device driver
// to provision the resource for the Pod.
// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
// because in this case, the scheduler decides where the Pod can go successfully,
// but we need to wait for the external component to do something based on that scheduling result.
//
// We regard the backoff as a penalty for wasting the scheduling cycle.
// In the case of returning Pending, we cannot say the scheduling cycle is wasted
// because the scheduling result is used to move the Pod's scheduling forward,
// even though that particular scheduling cycle fails.
// So, Pods rejected for such reasons don't need to suffer a penalty (backoff).
// When the scheduling queue requeues a Pod that was rejected with Pending in the last scheduling cycle,
// the Pod goes to activeQ directly, ignoring backoff.
Pending
)
// This list should be exactly the same as the codes iota defined above in the same order.
var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip", "Pending"}
func (c Code) String() string {
return codes[c]
}
const (
// MaxNodeScore is the maximum score a Score plugin is expected to return.
MaxNodeScore int64 = 100
// MinNodeScore is the minimum score a Score plugin is expected to return.
MinNodeScore int64 = 0
// MaxTotalScore is the maximum total score.
MaxTotalScore int64 = math.MaxInt64
)
// PodsToActivateKey is a reserved state key for stashing pods.
// If the stashed pods are present in unschedulablePods or backoffQ, they will be
// activated (i.e., moved to activeQ) in two phases:
// - end of a scheduling cycle if it succeeds (will be cleared from `PodsToActivate` if activated)
// - end of a binding cycle if it succeeds
var PodsToActivateKey StateKey = "kubernetes.io/pods-to-activate"
// PodsToActivate stores pods to be activated.
type PodsToActivate struct {
sync.Mutex
// Map is keyed with namespaced pod name, and valued with the pod.
Map map[string]*v1.Pod
}
// Clone just returns the same state.
func (s *PodsToActivate) Clone() StateData {
return s
}
// NewPodsToActivate instantiates a PodsToActivate object.
func NewPodsToActivate() *PodsToActivate {
return &PodsToActivate{Map: make(map[string]*v1.Pod)}
}
// Status indicates the result of running a plugin. It consists of a code, a
// message, (optionally) an error, and the name of the plugin that produced the failure.
// When the status code is not Success, the reasons should explain why.
// When the code is Success, all the other fields should be empty.
// NOTE: A nil Status is also considered as Success.
type Status struct {
code Code
reasons []string
err error
// plugin is an optional field that records the name of the plugin that caused this status.
// It's set by the framework when the code is Unschedulable, UnschedulableAndUnresolvable or Pending.
plugin string
}
// WithError sets the given error to s.err and returns the Status for chaining.
func (s *Status) WithError(err error) *Status {
s.err = err
return s
}
// Code returns code of the Status.
func (s *Status) Code() Code {
if s == nil {
return Success
}
return s.code
}
// Message returns a concatenated message on reasons of the Status.
func (s *Status) Message() string {
if s == nil {
return ""
}
return strings.Join(s.Reasons(), ", ")
}
// SetPlugin sets the given plugin name to s.plugin.
func (s *Status) SetPlugin(plugin string) {
s.plugin = plugin
}
// WithPlugin sets the given plugin name to s.plugin,
// and returns the given status object.
func (s *Status) WithPlugin(plugin string) *Status {
s.SetPlugin(plugin)
return s
}
// Plugin returns the plugin name which caused this status.
func (s *Status) Plugin() string {
return s.plugin
}
// Reasons returns reasons of the Status.
func (s *Status) Reasons() []string {
if s.err != nil {
return append([]string{s.err.Error()}, s.reasons...)
}
return s.reasons
}
// AppendReason appends given reason to the Status.
func (s *Status) AppendReason(reason string) {
s.reasons = append(s.reasons, reason)
}
// IsSuccess returns true if and only if "Status" is nil or Code is "Success".
func (s *Status) IsSuccess() bool {
return s.Code() == Success
}
// IsWait returns true if and only if "Status" is non-nil and its Code is "Wait".
func (s *Status) IsWait() bool {
return s.Code() == Wait
}
// IsSkip returns true if and only if "Status" is non-nil and its Code is "Skip".
func (s *Status) IsSkip() bool {
return s.Code() == Skip
}
// IsRejected returns true when the Status is a rejection, i.e. its code is Unschedulable, UnschedulableAndUnresolvable, or Pending.
func (s *Status) IsRejected() bool {
code := s.Code()
return code == Unschedulable || code == UnschedulableAndUnresolvable || code == Pending
}
// AsError returns nil if the status is a success, a wait or a skip; otherwise returns an "error" object
// with a concatenated message on reasons of the Status.
func (s *Status) AsError() error {
if s.IsSuccess() || s.IsWait() || s.IsSkip() {
return nil
}
if s.err != nil {
return s.err
}
return errors.New(s.Message())
}
// Equal checks equality of two statuses. This is useful for testing with
// cmp.Equal.
func (s *Status) Equal(x *Status) bool {
if s == nil || x == nil {
return s.IsSuccess() && x.IsSuccess()
}
if s.code != x.code {
return false
}
if !cmp.Equal(s.err, x.err, cmpopts.EquateErrors()) {
return false
}
if !cmp.Equal(s.reasons, x.reasons) {
return false
}
return cmp.Equal(s.plugin, x.plugin)
}
func (s *Status) String() string {
return s.Message()
}
// NewStatus makes a Status out of the given arguments and returns its pointer.
func NewStatus(code Code, reasons ...string) *Status {
s := &Status{
code: code,
reasons: reasons,
}
return s
}
// AsStatus wraps an error in a Status.
func AsStatus(err error) *Status {
if err == nil {
return nil
}
return &Status{
code: Error,
err: err,
}
}
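// The following is a minimal sketch (not part of the original file) of typical
// Status handling: plugins build statuses with NewStatus or AsStatus, and
// callers branch on the nil-safe helpers above.
func exampleStatusHandling(err error) (bool, bool, error) {
	rejection := NewStatus(Unschedulable, "node(s) didn't satisfy the example constraint")
	internal := AsStatus(err) // nil when err is nil, which counts as Success.

	return rejection.IsRejected(), // true: Unschedulable is a rejection code.
		internal.IsSuccess(), // true only when err was nil.
		internal.AsError() // the wrapped error, or nil on Success/Wait/Skip.
}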
// WaitingPod represents a pod currently waiting in the permit phase.
type WaitingPod interface {
// GetPod returns a reference to the waiting pod.
GetPod() *v1.Pod
// GetPendingPlugins returns the names of the Permit plugins that are still pending.
GetPendingPlugins() []string
// Allow declares that the waiting pod is allowed to be scheduled by the plugin named "pluginName".
// If this is the last remaining plugin to allow, then a success signal is delivered
// to unblock the pod.
Allow(pluginName string)
// Reject declares the waiting pod unschedulable.
Reject(pluginName, msg string)
}
// Plugin is the parent type for all the scheduling framework plugins.
type Plugin interface {
Name() string
}
// PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins.
// These plugins are called prior to adding Pods to activeQ.
// Note: a PreEnqueue plugin is expected to be lightweight and efficient, so it's not expected to
// involve expensive calls like accessing external endpoints; otherwise it would block other
// Pods' enqueuing in event handlers.
type PreEnqueuePlugin interface {
Plugin
// PreEnqueue is called prior to adding Pods to activeQ.
PreEnqueue(ctx context.Context, p *v1.Pod) *Status
}
// LessFunc is the function to sort pod info
type LessFunc func(podInfo1, podInfo2 *QueuedPodInfo) bool
// QueueSortPlugin is an interface that must be implemented by "QueueSort" plugins.
// These plugins are used to sort pods in the scheduling queue. Only one queue sort
// plugin may be enabled at a time.
type QueueSortPlugin interface {
Plugin
// Less is used to sort pods in the scheduling queue.
Less(*QueuedPodInfo, *QueuedPodInfo) bool
}
// EnqueueExtensions is an optional interface that plugins can implement to efficiently
// move unschedulable Pods in internal scheduling queues.
// In the scheduler, Pods can be unschedulable by PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins,
// and Pods rejected by these plugins are requeued based on this extension point.
// Failures from other extension points are regarded as transient errors (e.g., network failure),
// and the scheduler requeues the affected Pods without consulting this extension point - it always requeues them to activeQ after backoff.
// This is because such transient errors cannot be resolved by specific cluster events,
// and we have no choice but to keep retrying scheduling until the failure is resolved.
//
// Plugins that can make a Pod unschedulable (PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins) should implement this interface;
// otherwise the default implementation is used, which is less efficient at requeueing Pods rejected by the plugin.
// If plugins at other extension points implement this interface, it is simply ignored.
type EnqueueExtensions interface {
Plugin
// EventsToRegister returns a series of possible events that may make a Pod
// rejected by this plugin schedulable. Each event has a callback function that
// filters out events to reduce useless retries of the Pod's scheduling.
// The events will be registered when instantiating the internal scheduling queue,
// and leveraged to build event handlers dynamically.
// When it returns an error, the scheduler fails to start.
// Note: the returned list needs to be determined at startup,
// and the scheduler only evaluates it once during start-up.
// Do not change the result during runtime, for example, based on the cluster's state etc.
//
// An appropriate implementation of this function will make the Pod's re-scheduling accurate and performant.
EventsToRegister(context.Context) ([]ClusterEventWithHint, error)
}
// PreFilterExtensions is an interface that is included in plugins that allow specifying
// callbacks to make incremental updates to its supposedly pre-calculated
// state.
type PreFilterExtensions interface {
// AddPod is called by the framework while trying to evaluate the impact
// of adding podToAdd to the node while scheduling podToSchedule.
AddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
// RemovePod is called by the framework while trying to evaluate the impact
// of removing podToRemove from the node while scheduling podToSchedule.
RemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}
// PreFilterPlugin is an interface that must be implemented by "PreFilter" plugins.
// These plugins are called at the beginning of the scheduling cycle.
type PreFilterPlugin interface {
Plugin
// PreFilter is called at the beginning of the scheduling cycle. All PreFilter
// plugins must return success or the pod will be rejected. PreFilter could optionally
// return a PreFilterResult to influence which nodes to evaluate downstream. This is useful
// for cases where it is possible to determine the subset of nodes to process in O(1) time.
// When PreFilterResult filters out some Nodes, the framework considers Nodes that are filtered out as getting "UnschedulableAndUnresolvable".
// i.e., those Nodes are excluded from the preemption candidates.
//
// When it returns Skip status, returned PreFilterResult and other fields in status are just ignored,
// and coupled Filter plugin/PreFilterExtensions() will be skipped in this scheduling cycle.
PreFilter(ctx context.Context, state *CycleState, p *v1.Pod) (*PreFilterResult, *Status)
// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one,
// or nil if it does not. A Pre-filter plugin can provide extensions to incrementally
// modify its pre-processed info. The framework guarantees that the extensions
// AddPod/RemovePod will only be called after PreFilter, possibly on a cloned
// CycleState, and may call those functions more than once before calling
// Filter again on a specific node.
PreFilterExtensions() PreFilterExtensions
}
// FilterPlugin is an interface for Filter plugins. These plugins are called at the
// filter extension point for filtering out hosts that cannot run a pod.
// This concept used to be called 'predicate' in the original scheduler.
// These plugins should return "Success", "Unschedulable" or "Error" in Status.code.
// However, the scheduler accepts other valid codes as well.
// Anything other than "Success" will lead to exclusion of the given host from
// running the pod.
type FilterPlugin interface {
Plugin
// Filter is called by the scheduling framework.
// All FilterPlugins should return "Success" to declare that
// the given node fits the pod. If Filter doesn't return "Success",
// it will return "Unschedulable", "UnschedulableAndUnresolvable" or "Error".
//
// "Error" aborts pod scheduling and puts the pod into the backoff queue.
//
// For the node being evaluated, Filter plugins should look at the passed
// nodeInfo reference for this particular node's information (e.g., pods
// considered to be running on the node) instead of looking it up in the
// NodeInfoSnapshot because we don't guarantee that they will be the same.
// For example, during preemption, we may pass a copy of the original
// nodeInfo object that has some pods removed from it to evaluate the
// possibility of preempting them to schedule the target pod.
Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
}
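// The following is a minimal sketch (not part of the original file) of a Filter
// plugin: it marks nodes carrying a hypothetical "example.com/drained" label as
// unresolvable and accepts everything else. A nil return is treated as Success.
type exampleDrainFilter struct{}

var _ FilterPlugin = &exampleDrainFilter{}

func (pl *exampleDrainFilter) Name() string { return "ExampleDrainFilter" }

func (pl *exampleDrainFilter) Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status {
	if _, drained := nodeInfo.Node().Labels["example.com/drained"]; drained {
		return NewStatus(UnschedulableAndUnresolvable, "node is being drained")
	}
	return nil
}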
// PostFilterPlugin is an interface for "PostFilter" plugins. These plugins are called
// after a pod cannot be scheduled.
type PostFilterPlugin interface {
Plugin
// PostFilter is called by the scheduling framework
// when the scheduling cycle failed at PreFilter or Filter by Unschedulable or UnschedulableAndUnresolvable.
// NodeToStatusReader has statuses that each Node got in PreFilter or Filter phase.
//
// If you're implementing a custom preemption with PostFilter, ignoring Nodes with UnschedulableAndUnresolvable is the responsibility of your plugin,
// meaning NodeToStatusReader could have Nodes with UnschedulableAndUnresolvable
// and the scheduling framework does call PostFilter plugins even when all Nodes in NodeToStatusReader are UnschedulableAndUnresolvable.
//
// A PostFilter plugin should return one of the following statuses:
// - Unschedulable: the plugin gets executed successfully but the pod cannot be made schedulable.
// - Success: the plugin gets executed successfully and the pod can be made schedulable.
// - Error: the plugin aborts due to some internal error.
//
// Informational plugins should be configured ahead of other ones, and always return Unschedulable status.
// Optionally, a non-nil PostFilterResult may be returned along with a Success status. For example,
// a preemption plugin may choose to return a nominatedNodeName, so that the framework can reuse it to update the
// preemptor pod's .status.nominatedNodeName field.
PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusReader) (*PostFilterResult, *Status)
}
// PreScorePlugin is an interface for "PreScore" plugin. PreScore is an
// informational extension point. Plugins will be called with a list of nodes
// that passed the filtering phase. A plugin may use this data to update internal
// state or to generate logs/metrics.
type PreScorePlugin interface {
Plugin
// PreScore is called by the scheduling framework after a list of nodes
// passed the filtering phase. All PreScore plugins must return success or
// the pod will be rejected.
// When it returns a Skip status, other fields in the status are just ignored,
// and the coupled Score plugin will be skipped in this scheduling cycle.
PreScore(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*NodeInfo) *Status
}
// ScoreExtensions is an interface for Score extended functionality.
type ScoreExtensions interface {
// NormalizeScore is called for all node scores produced by the same plugin's "Score"
// method. A successful run of NormalizeScore will update the scores list and return
// a success status.
NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status
}
// ScorePlugin is an interface that must be implemented by "Score" plugins to rank
// nodes that passed the filtering phase.
type ScorePlugin interface {
Plugin
// Score is called on each filtered node. It must return success and an integer
// indicating the rank of the node. All scoring plugins must return success or
// the pod will be rejected.
Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)
// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if does not.
ScoreExtensions() ScoreExtensions
}
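// The following is a minimal sketch (not part of the original file) of a Score
// plugin that needs no normalization: every node gets MaxNodeScore, so
// ScoreExtensions returns nil.
type exampleFlatScore struct{}

var _ ScorePlugin = &exampleFlatScore{}

func (pl *exampleFlatScore) Name() string { return "ExampleFlatScore" }

func (pl *exampleFlatScore) Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status) {
	return MaxNodeScore, nil
}

func (pl *exampleFlatScore) ScoreExtensions() ScoreExtensions { return nil }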
// ReservePlugin is an interface for plugins with Reserve and Unreserve
// methods. These are meant to update the state of the plugin. This concept
// used to be called 'assume' in the original scheduler. These plugins should
// return only Success or Error in Status.code. However, the scheduler accepts
// other valid codes as well. Anything other than Success will lead to
// rejection of the pod.
type ReservePlugin interface {
Plugin
// Reserve is called by the scheduling framework when the scheduler cache is
// updated. If this method returns a failed Status, the scheduler will call
// the Unreserve method for all enabled ReservePlugins.
Reserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
// Unreserve is called by the scheduling framework when a reserved pod was
// rejected, an error occurred during reservation of subsequent plugins, or
// in a later phase. The Unreserve method implementation must be idempotent
// and may be called by the scheduler even if the corresponding Reserve
// method for the same plugin was not called.
Unreserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}
// PreBindPlugin is an interface that must be implemented by "PreBind" plugins.
// These plugins are called immediately before a pod is bound to a node.
type PreBindPlugin interface {
Plugin
// PreBind is called before binding a pod. All prebind plugins must return
// success or the pod will be rejected and won't be sent for binding.
PreBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}
// PostBindPlugin is an interface that must be implemented by "PostBind" plugins.
// These plugins are called after a pod is successfully bound to a node.
type PostBindPlugin interface {
Plugin
// PostBind is called after a pod is successfully bound. These plugins are
// informational. A common application of this extension point is for cleaning
// up. If a plugin needs to clean-up its state after a pod is scheduled and
// bound, PostBind is the extension point that it should register.
PostBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}
// PermitPlugin is an interface that must be implemented by "Permit" plugins.
// These plugins are called before a pod is bound to a node.
type PermitPlugin interface {
Plugin
// Permit is called before binding a pod (and before prebind plugins). Permit
// plugins are used to prevent or delay the binding of a Pod. A permit plugin
// must return success or wait with a timeout duration, or the pod will be rejected.
// The pod will also be rejected if the wait times out or if the pod is rejected while
// waiting. Note that if the plugin returns "wait", the framework will wait only
// after running the remaining plugins, and only if no other plugin rejects the pod.
Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration)
}
// BindPlugin is an interface that must be implemented by "Bind" plugins. Bind
// plugins are used to bind a pod to a Node.
type BindPlugin interface {
Plugin
// Bind plugins will not be called until all pre-bind plugins have completed. Each
// bind plugin is called in the configured order. A bind plugin may choose whether
// or not to handle the given Pod. If a bind plugin chooses to handle a Pod, the
// remaining bind plugins are skipped. When a bind plugin does not handle a pod,
// it must return Skip in its Status code. If a bind plugin returns an Error, the
// pod is rejected and will not be bound.
Bind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}
// Framework manages the set of plugins in use by the scheduling framework.
// Configured plugins are called at specified points in a scheduling context.
type Framework interface {
Handle
// PreEnqueuePlugins returns the registered preEnqueue plugins.
PreEnqueuePlugins() []PreEnqueuePlugin
// EnqueueExtensions returns the registered Enqueue extensions.
EnqueueExtensions() []EnqueueExtensions
// QueueSortFunc returns the function to sort pods in scheduling queue
QueueSortFunc() LessFunc
// RunPreFilterPlugins runs the set of configured PreFilter plugins. It returns
// *Status and its code is set to non-success if any of the plugins returns
// anything but Success. If a non-success status is returned, then the scheduling
// cycle is aborted.
// It also returns a PreFilterResult, which may influence what or how many nodes to
// evaluate downstream.
// The third return value contains the PreFilter plugins that rejected some or all Nodes via a PreFilterResult.
// Note that it doesn't contain any plugin that rejects the Pod with a non-success status
// rather than with a PreFilterResult.
RunPreFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod) (*PreFilterResult, *Status, sets.Set[string])
// RunPostFilterPlugins runs the set of configured PostFilter plugins.
// PostFilter plugins can either be informational, in which case they should be configured
// to execute first and return an Unschedulable status, or ones that try to change the
// cluster state to make the pod potentially schedulable in a future scheduling cycle.
RunPostFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusReader) (*PostFilterResult, *Status)
// RunPreBindPlugins runs the set of configured PreBind plugins. It returns
// *Status and its code is set to non-success if any of the plugins returns
// anything but Success. If the Status code is "Unschedulable", it is
// considered as a scheduling check failure, otherwise, it is considered as an
// internal error. In either case the pod is not going to be bound.
RunPreBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
// RunPostBindPlugins runs the set of configured PostBind plugins.
RunPostBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)
// RunReservePluginsReserve runs the Reserve method of the set of
// configured Reserve plugins. If any of these calls returns an error, it
// does not continue running the remaining ones and returns the error. In
// such case, pod will not be scheduled.
RunReservePluginsReserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
// RunReservePluginsUnreserve runs the Unreserve method of the set of
// configured Reserve plugins.
RunReservePluginsUnreserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)
// RunPermitPlugins runs the set of configured Permit plugins. If any of these
// plugins returns a status other than "Success" or "Wait", it does not continue
// running the remaining plugins and returns an error. Otherwise, if any of the
// plugins returns "Wait", then this function will create and add waiting pod
// to a map of currently waiting pods and return status with "Wait" code.
// Pod will remain waiting pod for the minimum duration returned by the Permit plugins.
RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
// WaitOnPermit will block, if the pod is a waiting pod, until the waiting pod is rejected or allowed.
WaitOnPermit(ctx context.Context, pod *v1.Pod) *Status
// RunBindPlugins runs the set of configured Bind plugins. A Bind plugin may choose
// whether or not to handle the given Pod. If a Bind plugin chooses to skip the
// binding, it should return code=5("skip") status. Otherwise, it should return "Error"
// or "Success". If none of the plugins handled binding, RunBindPlugins returns
// code=5("skip") status.
RunBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
// HasFilterPlugins returns true if at least one Filter plugin is defined.
HasFilterPlugins() bool
// HasPostFilterPlugins returns true if at least one PostFilter plugin is defined.
HasPostFilterPlugins() bool
// HasScorePlugins returns true if at least one Score plugin is defined.
HasScorePlugins() bool
// ListPlugins returns a map of extension point name to list of configured Plugins.
ListPlugins() *config.Plugins
// ProfileName returns the profile name associated with this framework.
ProfileName() string
// PercentageOfNodesToScore returns the percentageOfNodesToScore associated with this framework's profile.
PercentageOfNodesToScore() *int32
// SetPodNominator sets the PodNominator
SetPodNominator(nominator PodNominator)
// SetPodActivator sets the PodActivator
SetPodActivator(activator PodActivator)
// Close calls Close method of each plugin.
Close() error
}
// Handle provides data and some tools that plugins can use. It is
// passed to the plugin factories at the time of plugin initialization. Plugins
// must store and use this handle to call framework functions.
type Handle interface {
// PodNominator abstracts operations to maintain nominated Pods.
PodNominator
// PluginsRunner abstracts operations to run some plugins.
PluginsRunner
// PodActivator abstracts operations in the scheduling queue.
PodActivator
// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
// is taken at the beginning of a scheduling cycle and remains unchanged until
// a pod finishes "Permit" point.
//
// It should be used only during the scheduling cycle:
// - There is no guarantee that the information remains unchanged in the binding phase of scheduling,
//   so plugins shouldn't use it in the binding cycle (pre-bind/bind/post-bind/un-reserve plugins);
//   otherwise, a concurrent read/write error might occur.
// - There is no guarantee that the information is always up-to-date,
//   so plugins shouldn't use it in QueueingHint and PreEnqueue;
//   otherwise, they might make a decision based on stale information.
//
// Instead, they should use the resources obtained from the informers created by SharedInformerFactory().
SnapshotSharedLister() SharedLister
// IterateOverWaitingPods acquires a read lock and iterates over the WaitingPods map.
IterateOverWaitingPods(callback func(WaitingPod))
// GetWaitingPod returns a waiting pod given its UID.
GetWaitingPod(uid types.UID) WaitingPod
// RejectWaitingPod rejects a waiting pod given its UID.
// The return value indicates if the pod is waiting or not.
RejectWaitingPod(uid types.UID) bool
// ClientSet returns a kubernetes clientSet.
ClientSet() clientset.Interface
// KubeConfig returns the raw kube config.
KubeConfig() *restclient.Config
// EventRecorder returns an event recorder.
EventRecorder() events.EventRecorder
SharedInformerFactory() informers.SharedInformerFactory
// SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.
// A non-default implementation can be plugged into the framework to simulate the state of DRA objects.
SharedDRAManager() SharedDRAManager
// RunFilterPluginsWithNominatedPods runs the set of configured filter plugins for nominated pod on the given node.
RunFilterPluginsWithNominatedPods(ctx context.Context, state *CycleState, pod *v1.Pod, info *NodeInfo) *Status
// Extenders returns registered scheduler extenders.
Extenders() []Extender
// Parallelizer returns a parallelizer holding parallelism for scheduler.
Parallelizer() parallelize.Parallelizer
}
// PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase.
type PreFilterResult struct {
// The set of nodes that should be considered downstream; if nil then
// all nodes are eligible.
NodeNames sets.Set[string]
}
func (p *PreFilterResult) AllNodes() bool {
return p == nil || p.NodeNames == nil
}
func (p *PreFilterResult) Merge(in *PreFilterResult) *PreFilterResult {
if p.AllNodes() && in.AllNodes() {
return nil
}
r := PreFilterResult{}
if p.AllNodes() {
r.NodeNames = in.NodeNames.Clone()
return &r
}
if in.AllNodes() {
r.NodeNames = p.NodeNames.Clone()
return &r
}
r.NodeNames = p.NodeNames.Intersection(in.NodeNames)
return &r
}
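// The following is a minimal sketch (not part of the original file) of Merge
// semantics: a nil receiver (or nil NodeNames) means "all nodes are eligible",
// and merging two explicit results intersects their node sets.
func examplePreFilterMerge() sets.Set[string] {
	a := &PreFilterResult{NodeNames: sets.New("n1", "n2")}
	b := &PreFilterResult{NodeNames: sets.New("n2", "n3")}
	var all *PreFilterResult // nil: no restriction at all.

	_ = all.Merge(a)            // keeps a's set, since "all" adds no restriction.
	return a.Merge(b).NodeNames // {"n2"}
}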
type NominatingMode int
const (
ModeNoop NominatingMode = iota
ModeOverride
)
type NominatingInfo struct {
NominatedNodeName string
NominatingMode NominatingMode
}
// PostFilterResult wraps needed info for scheduler framework to act upon PostFilter phase.
type PostFilterResult struct {
*NominatingInfo
}
func NewPostFilterResultWithNominatedNode(name string) *PostFilterResult {
return &PostFilterResult{
NominatingInfo: &NominatingInfo{
NominatedNodeName: name,
NominatingMode: ModeOverride,
},
}
}
func (ni *NominatingInfo) Mode() NominatingMode {
if ni == nil {
return ModeNoop
}
return ni.NominatingMode
}
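// The following is a minimal sketch (not part of the original file): a
// preemption-style PostFilter plugin hands a nominated node back to the
// framework via PostFilterResult, and the framework reads the mode through the
// nil-safe Mode().
func examplePostFilterResult() NominatingMode {
	r := NewPostFilterResultWithNominatedNode("node-a")
	return r.Mode() // ModeOverride
}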
// PodActivator abstracts operations in the scheduling queue.
type PodActivator interface {
// Activate moves the given pods to activeQ.
// If a pod isn't found in unschedulablePods or backoffQ and it's in-flight,
// the wildcard event is registered so that the pod will be requeued when it comes back.
// But, if a pod isn't found in unschedulablePods or backoffQ and it's not in-flight (i.e., completely unknown pod),
// Activate would ignore the pod.
Activate(logger klog.Logger, pods map[string]*v1.Pod)
}
// PodNominator abstracts operations to maintain nominated Pods.
type PodNominator interface {
// AddNominatedPod adds the given pod to the nominator or
// updates it if it already exists.
AddNominatedPod(logger klog.Logger, pod *PodInfo, nominatingInfo *NominatingInfo)
// DeleteNominatedPodIfExists deletes nominatedPod from internal cache. It's a no-op if it doesn't exist.
DeleteNominatedPodIfExists(pod *v1.Pod)
// UpdateNominatedPod updates the <oldPod> with <newPod>.
UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *PodInfo)
// NominatedPodsForNode returns nominatedPods on the given node.
NominatedPodsForNode(nodeName string) []*PodInfo
}
// PluginsRunner abstracts operations to run some plugins.
// This is used by preemption PostFilter plugins when evaluating the feasibility of
// scheduling the pod on nodes when certain running pods get evicted.
type PluginsRunner interface {
// RunPreScorePlugins runs the set of configured PreScore plugins. If any
// of these plugins returns any status other than "Success", the given pod is rejected.
RunPreScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) *Status
// RunScorePlugins runs the set of configured scoring plugins.
// It returns a list that stores scores from each plugin and total score for each Node.
// It also returns *Status, which is set to non-success if any of the plugins returns
// a non-success status.
RunScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) ([]NodePluginScores, *Status)
// RunFilterPlugins runs the set of configured Filter plugins for pod on
// the given node. Note that for the node being evaluated, the passed nodeInfo
// reference could be different from the one in NodeInfoSnapshot map (e.g., pods
// considered to be running on the node could be different). For example, during
// preemption, we may pass a copy of the original nodeInfo object that has some pods
// removed from it to evaluate the possibility of preempting them to
// schedule the target pod.
RunFilterPlugins(context.Context, *CycleState, *v1.Pod, *NodeInfo) *Status
// RunPreFilterExtensionAddPod calls the AddPod interface for the set of configured
// PreFilter plugins. It returns directly if any of the plugins return any
// status other than Success.
RunPreFilterExtensionAddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
// RunPreFilterExtensionRemovePod calls the RemovePod interface for the set of configured
// PreFilter plugins. It returns directly if any of the plugins return any
// status other than Success.
RunPreFilterExtensionRemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}
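// The function below is an illustrative sketch, not part of the upstream file: it shows how a
// preemption-style caller might combine the PluginsRunner methods above by evicting one
// candidate victim from a copied NodeInfo, letting stateful PreFilter plugins adjust their
// cycle state, and re-running the Filter plugins to see whether the pod would then fit.
// The function name and its parameters are hypothetical.
func wouldFitAfterEviction(ctx context.Context, runner PluginsRunner, state *CycleState,
	pod *v1.Pod, victim *PodInfo, nodeInfoCopy *NodeInfo) (bool, error) {
	logger := klog.FromContext(ctx)
	// Simulate the eviction on the snapshot copy, never on the shared NodeInfo.
	if err := nodeInfoCopy.RemovePod(logger, victim.Pod); err != nil {
		return false, err
	}
	// Let stateful PreFilter plugins (e.g. inter-pod affinity) update their cycle state.
	if status := runner.RunPreFilterExtensionRemovePod(ctx, state, pod, victim, nodeInfoCopy); !status.IsSuccess() {
		return false, status.AsError()
	}
	// Re-evaluate the configured Filter plugins against the modified node.
	return runner.RunFilterPlugins(ctx, state, pod, nodeInfoCopy).IsSuccess(), nil
}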

View File

@ -0,0 +1,111 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/dynamic-resource-allocation/structured"
)
// NodeInfoLister interface represents anything that can list/get NodeInfo objects from node name.
type NodeInfoLister interface {
// List returns the list of NodeInfos.
List() ([]*NodeInfo, error)
// HavePodsWithAffinityList returns the list of NodeInfos of nodes with pods with affinity terms.
HavePodsWithAffinityList() ([]*NodeInfo, error)
// HavePodsWithRequiredAntiAffinityList returns the list of NodeInfos of nodes with pods with required anti-affinity terms.
HavePodsWithRequiredAntiAffinityList() ([]*NodeInfo, error)
// Get returns the NodeInfo of the given node name.
Get(nodeName string) (*NodeInfo, error)
}
// StorageInfoLister interface represents anything that handles storage-related operations and resources.
type StorageInfoLister interface {
// IsPVCUsedByPods returns whether the PVC identified by key (in the format
// "namespace/name") is used by one or more scheduled pods.
IsPVCUsedByPods(key string) bool
}
// SharedLister groups scheduler-specific listers.
type SharedLister interface {
NodeInfos() NodeInfoLister
StorageInfos() StorageInfoLister
}
// ResourceSliceLister can be used to obtain ResourceSlices.
type ResourceSliceLister interface {
// List returns a list of all ResourceSlices.
List() ([]*resourceapi.ResourceSlice, error)
}
// DeviceClassLister can be used to obtain DeviceClasses.
type DeviceClassLister interface {
// List returns a list of all DeviceClasses.
List() ([]*resourceapi.DeviceClass, error)
// Get returns the DeviceClass with the given className.
Get(className string) (*resourceapi.DeviceClass, error)
}
// ResourceClaimTracker can be used to obtain ResourceClaims, and track changes to ResourceClaims in-memory.
//
// If the claims are meant to be allocated in the API during the binding phase (when used by scheduler), the tracker helps avoid
// race conditions between scheduling and binding phases (as well as between the binding phase and the informer cache update).
//
// If the binding phase is not run (e.g. when used by Cluster Autoscaler which only runs the scheduling phase, and simulates binding in-memory),
// the tracker allows the framework user to obtain the claim allocations produced by the DRA plugin, and persist them outside of the API (e.g. in-memory).
type ResourceClaimTracker interface {
// List lists ResourceClaims. The result is guaranteed to immediately include any changes made via AssumeClaimAfterAPICall(),
// and SignalClaimPendingAllocation().
List() ([]*resourceapi.ResourceClaim, error)
// Get works like List(), but for a single claim.
Get(namespace, claimName string) (*resourceapi.ResourceClaim, error)
// ListAllAllocatedDevices lists all allocated Devices from allocated ResourceClaims. The result is guaranteed to immediately include
// any changes made via AssumeClaimAfterAPICall(), and SignalClaimPendingAllocation().
ListAllAllocatedDevices() (sets.Set[structured.DeviceID], error)
// SignalClaimPendingAllocation signals to the tracker that the given ResourceClaim will be allocated via an API call in the
// binding phase. This change is immediately reflected in the result of List() and the other accessors.
SignalClaimPendingAllocation(claimUID types.UID, allocatedClaim *resourceapi.ResourceClaim) error
// ClaimHasPendingAllocation answers whether a given claim has a pending allocation during the binding phase. It can be used to avoid
// race conditions in subsequent scheduling phases.
ClaimHasPendingAllocation(claimUID types.UID) bool
// RemoveClaimPendingAllocation removes the pending allocation for the given ResourceClaim from the tracker if any was signaled via
// SignalClaimPendingAllocation(). Returns whether there was a pending allocation to remove. List() and the other accessors immediately
// stop reflecting the pending allocation in the results.
RemoveClaimPendingAllocation(claimUID types.UID) (deleted bool)
// AssumeClaimAfterAPICall signals to the tracker that an API call modifying the given ResourceClaim was made in the binding phase, and the
// changes should be reflected in informers very soon. This change is immediately reflected in the result of List() and the other accessors.
// This mechanism can be used to avoid race conditions between the informer update and subsequent scheduling phases.
AssumeClaimAfterAPICall(claim *resourceapi.ResourceClaim) error
// AssumedClaimRestore signals to the tracker that something went wrong with the API call modifying the given ResourceClaim, and
// the changes won't be reflected in informers after all. List() and the other accessors immediately stop reflecting the assumed change,
// and go back to the informer version.
AssumedClaimRestore(namespace, claimName string)
}
// SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.
// The plugin's default implementation obtains the objects from the API. A different implementation can be
// plugged into the framework in order to simulate the state of DRA objects. For example, Cluster Autoscaler
// can use this to provide the correct DRA object state to the DRA plugin when simulating scheduling changes in-memory.
type SharedDRAManager interface {
ResourceClaims() ResourceClaimTracker
ResourceSlices() ResourceSliceLister
DeviceClasses() DeviceClassLister
}
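// The function below is an illustrative sketch, not part of the upstream file: it walks through
// the binding-phase flow that ResourceClaimTracker is designed around, assuming the allocation
// is persisted via an API update. apiUpdateClaimStatus is a hypothetical stand-in for that call.
func bindAllocatedClaim(ctx context.Context, tracker ResourceClaimTracker, allocated *resourceapi.ResourceClaim) error {
	// 1. Announce the pending allocation so concurrent scheduling cycles take it into account.
	if err := tracker.SignalClaimPendingAllocation(allocated.UID, allocated); err != nil {
		return err
	}
	// 2. Persist the allocation in the API server (hypothetical helper).
	updated, err := apiUpdateClaimStatus(ctx, allocated)
	if err != nil {
		// The API call failed, so the allocation is no longer pending.
		tracker.RemoveClaimPendingAllocation(allocated.UID)
		return err
	}
	// 3. Bridge the window until the informer delivers the updated claim.
	if err := tracker.AssumeClaimAfterAPICall(updated); err != nil {
		// Not fatal: the informer update will catch up on its own eventually.
		klog.FromContext(ctx).V(5).Info("could not assume updated claim", "claim", klog.KObj(updated), "err", err)
	}
	// 4. The allocation is now visible via the assume cache, so it is no longer "in flight".
	tracker.RemoveClaimPendingAllocation(allocated.UID)
	return nil
}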

View File

@ -0,0 +1,59 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package parallelize
import "context"
// ErrorChannel supports non-blocking send and receive operations to capture errors.
// At most one error is kept in the channel; any further errors sent are ignored
// until the stored error is received and the channel becomes empty again.
type ErrorChannel struct {
errCh chan error
}
// SendError sends an error without blocking the sender.
func (e *ErrorChannel) SendError(err error) {
select {
case e.errCh <- err:
default:
}
}
// SendErrorWithCancel sends an error without blocking the sender and calls
// cancel function.
func (e *ErrorChannel) SendErrorWithCancel(err error, cancel context.CancelFunc) {
e.SendError(err)
cancel()
}
// ReceiveError receives an error from channel without blocking on the receiver.
func (e *ErrorChannel) ReceiveError() error {
select {
case err := <-e.errCh:
return err
default:
return nil
}
}
// NewErrorChannel returns a new ErrorChannel.
func NewErrorChannel() *ErrorChannel {
return &ErrorChannel{
errCh: make(chan error, 1),
}
}
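// The function below is an illustrative sketch, not part of the upstream file (it additionally
// assumes the sync package is imported): it fans work out to a group of goroutines, cancels the
// remaining ones as soon as the first error occurs, and reports only that first error.
// doPiece is a hypothetical worker function.
func runPieces(parent context.Context, pieces int, doPiece func(context.Context, int) error) error {
	ctx, cancel := context.WithCancel(parent)
	defer cancel()
	errCh := NewErrorChannel()
	var wg sync.WaitGroup
	for i := 0; i < pieces; i++ {
		wg.Add(1)
		go func(piece int) {
			defer wg.Done()
			if err := doPiece(ctx, piece); err != nil {
				// The first error is kept; later sends are silently dropped.
				errCh.SendErrorWithCancel(err, cancel)
			}
		}(i)
	}
	wg.Wait()
	return errCh.ReceiveError()
}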

View File

@ -0,0 +1,65 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package parallelize
import (
"context"
"math"
"k8s.io/client-go/util/workqueue"
"k8s.io/kubernetes/pkg/scheduler/metrics"
)
// DefaultParallelism is the default parallelism used in scheduler.
const DefaultParallelism int = 16
// Parallelizer holds the parallelism for scheduler.
type Parallelizer struct {
parallelism int
}
// NewParallelizer returns an object holding the parallelism.
func NewParallelizer(p int) Parallelizer {
return Parallelizer{parallelism: p}
}
// chunkSizeFor returns a chunk size for the given number of items to use for
// parallel work. The size aims to produce good CPU utilization.
// It returns max(1, min(sqrt(n), n/parallelism+1)).
func chunkSizeFor(n, parallelism int) int {
s := int(math.Sqrt(float64(n)))
if r := n/parallelism + 1; s > r {
s = r
} else if s < 1 {
s = 1
}
return s
}
// Until is a wrapper around workqueue.ParallelizeUntil to use in scheduling algorithms.
// A given operation will be a label that is recorded in the goroutine metric.
func (p Parallelizer) Until(ctx context.Context, pieces int, doWorkPiece workqueue.DoWorkPieceFunc, operation string) {
goroutinesMetric := metrics.Goroutines.WithLabelValues(operation)
withMetrics := func(piece int) {
goroutinesMetric.Inc()
doWorkPiece(piece)
goroutinesMetric.Dec()
}
workqueue.ParallelizeUntil(ctx, p.parallelism, pieces, withMetrics, workqueue.WithChunkSize(chunkSizeFor(pieces, p.parallelism)))
}
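// The function below is an illustrative sketch, not part of the upstream file: it scores a slice
// of nodes in parallel. With the default parallelism of 16 and, say, 1000 pieces, chunkSizeFor
// picks min(sqrt(1000), 1000/16+1) = min(31, 63) = 31, so each worker grabs 31 pieces at a time.
// nodes and scoreNode are hypothetical.
func scoreAllNodes(ctx context.Context, nodes []string, scoreNode func(string)) {
	p := NewParallelizer(DefaultParallelism)
	p.Until(ctx, len(nodes), func(i int) {
		scoreNode(nodes[i])
	}, "illustrativeScore") // the operation label is recorded in the scheduler goroutines metric
}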

View File

@ -0,0 +1,3 @@
# Scheduler Framework Plugins
Moved [here](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-scheduling/scheduler_framework_plugins.md).

View File

@ -0,0 +1,63 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package defaultbinder
import (
"context"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// Name of the plugin used in the plugin registry and configurations.
const Name = names.DefaultBinder
// DefaultBinder binds pods to nodes using a k8s client.
type DefaultBinder struct {
handle framework.Handle
}
var _ framework.BindPlugin = &DefaultBinder{}
// New creates a DefaultBinder.
func New(_ context.Context, _ runtime.Object, handle framework.Handle) (framework.Plugin, error) {
return &DefaultBinder{handle: handle}, nil
}
// Name returns the name of the plugin.
func (b DefaultBinder) Name() string {
return Name
}
// Bind binds pods to nodes using the k8s client.
func (b DefaultBinder) Bind(ctx context.Context, state *framework.CycleState, p *v1.Pod, nodeName string) *framework.Status {
logger := klog.FromContext(ctx)
logger.V(3).Info("Attempting to bind pod to node", "pod", klog.KObj(p), "node", klog.KRef("", nodeName))
binding := &v1.Binding{
ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID},
Target: v1.ObjectReference{Kind: "Node", Name: nodeName},
}
err := b.handle.ClientSet().CoreV1().Pods(binding.Namespace).Bind(ctx, binding, metav1.CreateOptions{})
if err != nil {
return framework.AsStatus(err)
}
return nil
}

View File

@ -0,0 +1,364 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package defaultpreemption
import (
"context"
"fmt"
"math/rand"
"sort"
v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
corelisters "k8s.io/client-go/listers/core/v1"
policylisters "k8s.io/client-go/listers/policy/v1"
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/framework/preemption"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// Name of the plugin used in the plugin registry and configurations.
const Name = names.DefaultPreemption
// DefaultPreemption is a PostFilter plugin that implements the preemption logic.
type DefaultPreemption struct {
fh framework.Handle
fts feature.Features
args config.DefaultPreemptionArgs
podLister corelisters.PodLister
pdbLister policylisters.PodDisruptionBudgetLister
Evaluator *preemption.Evaluator
}
var _ framework.PostFilterPlugin = &DefaultPreemption{}
var _ framework.PreEnqueuePlugin = &DefaultPreemption{}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *DefaultPreemption) Name() string {
return Name
}
// New initializes a new plugin and returns it.
func New(_ context.Context, dpArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, ok := dpArgs.(*config.DefaultPreemptionArgs)
if !ok {
return nil, fmt.Errorf("got args of type %T, want *DefaultPreemptionArgs", dpArgs)
}
if err := validation.ValidateDefaultPreemptionArgs(nil, args); err != nil {
return nil, err
}
podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
pdbLister := getPDBLister(fh.SharedInformerFactory())
pl := DefaultPreemption{
fh: fh,
fts: fts,
args: *args,
podLister: podLister,
pdbLister: pdbLister,
}
pl.Evaluator = preemption.NewEvaluator(Name, fh, &pl, fts.EnableAsyncPreemption)
return &pl, nil
}
// PostFilter invoked at the postFilter extension point.
func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
defer func() {
metrics.PreemptionAttempts.Inc()
}()
result, status := pl.Evaluator.Preempt(ctx, state, pod, m)
msg := status.Message()
if len(msg) > 0 {
return result, framework.NewStatus(status.Code(), "preemption: "+msg)
}
return result, status
}
func (pl *DefaultPreemption) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
if !pl.fts.EnableAsyncPreemption {
return nil
}
if pl.Evaluator.IsPodRunningPreemption(p.GetUID()) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, "waiting for the preemption for this pod to be finished")
}
return nil
}
// EventsToRegister returns the possible events that may make a Pod that was
// failed by this plugin schedulable.
func (pl *DefaultPreemption) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// The plugin moves the preemptor Pod to activeQ/backoffQ once the preemption API calls are all done,
// and we don't need to move the Pod with any events.
return nil, nil
}
// calculateNumCandidates returns the number of candidates the FindCandidates
// method must produce from dry running based on the constraints given by
// <minCandidateNodesPercentage> and <minCandidateNodesAbsolute>. The number of
// candidates returned will never be greater than <numNodes>.
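// For example (illustrative numbers, not taken from this file): with numNodes=5000,
// MinCandidateNodesPercentage=10 and MinCandidateNodesAbsolute=100, the percentage term
// yields 500, which is above the absolute floor of 100 and below numNodes, so 500
// candidates are shortlisted for the dry run.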
func (pl *DefaultPreemption) calculateNumCandidates(numNodes int32) int32 {
n := (numNodes * pl.args.MinCandidateNodesPercentage) / 100
if n < pl.args.MinCandidateNodesAbsolute {
n = pl.args.MinCandidateNodesAbsolute
}
if n > numNodes {
n = numNodes
}
return n
}
// GetOffsetAndNumCandidates chooses a random offset and calculates the number
// of candidates that should be shortlisted for dry running preemption.
func (pl *DefaultPreemption) GetOffsetAndNumCandidates(numNodes int32) (int32, int32) {
return rand.Int31n(numNodes), pl.calculateNumCandidates(numNodes)
}
// CandidatesToVictimsMap maps each candidate's node name to its victims.
// This function is not applicable for out-of-tree preemption plugins that exercise
// different preemption candidates on the same nominated node.
func (pl *DefaultPreemption) CandidatesToVictimsMap(candidates []preemption.Candidate) map[string]*extenderv1.Victims {
m := make(map[string]*extenderv1.Victims, len(candidates))
for _, c := range candidates {
m[c.Name()] = c.Victims()
}
return m
}
// SelectVictimsOnNode finds minimum set of pods on the given node that should be preempted in order to make enough room
// for "pod" to be scheduled.
func (pl *DefaultPreemption) SelectVictimsOnNode(
ctx context.Context,
state *framework.CycleState,
pod *v1.Pod,
nodeInfo *framework.NodeInfo,
pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status) {
logger := klog.FromContext(ctx)
var potentialVictims []*framework.PodInfo
removePod := func(rpi *framework.PodInfo) error {
if err := nodeInfo.RemovePod(logger, rpi.Pod); err != nil {
return err
}
status := pl.fh.RunPreFilterExtensionRemovePod(ctx, state, pod, rpi, nodeInfo)
if !status.IsSuccess() {
return status.AsError()
}
return nil
}
addPod := func(api *framework.PodInfo) error {
nodeInfo.AddPodInfo(api)
status := pl.fh.RunPreFilterExtensionAddPod(ctx, state, pod, api, nodeInfo)
if !status.IsSuccess() {
return status.AsError()
}
return nil
}
// As the first step, remove all the lower priority pods from the node and
// check if the given pod can be scheduled.
podPriority := corev1helpers.PodPriority(pod)
for _, pi := range nodeInfo.Pods {
if corev1helpers.PodPriority(pi.Pod) < podPriority {
potentialVictims = append(potentialVictims, pi)
if err := removePod(pi); err != nil {
return nil, 0, framework.AsStatus(err)
}
}
}
// No potential victims are found, and so we don't need to evaluate the node again since its state didn't change.
if len(potentialVictims) == 0 {
return nil, 0, framework.NewStatus(framework.UnschedulableAndUnresolvable, "No preemption victims found for incoming pod")
}
// If the new pod does not fit after removing all the lower priority pods,
// we are almost done and this node is not suitable for preemption. The only
// condition that we could check is if the "pod" is failing to schedule due to
// inter-pod affinity to one or more victims, but we have decided not to
// support this case for performance reasons. Having affinity to lower
// priority pods is not a recommended configuration anyway.
if status := pl.fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo); !status.IsSuccess() {
return nil, 0, status
}
var victims []*v1.Pod
numViolatingVictim := 0
// Sort potentialVictims by pod priority from high to low, which ensures to
// reprieve higher priority pods first.
sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i].Pod, potentialVictims[j].Pod) })
// Try to reprieve as many pods as possible. We first try to reprieve the PDB
// violating victims and then other non-violating ones. In both cases, we start
// from the highest priority victims.
violatingVictims, nonViolatingVictims := filterPodsWithPDBViolation(potentialVictims, pdbs)
reprievePod := func(pi *framework.PodInfo) (bool, error) {
if err := addPod(pi); err != nil {
return false, err
}
status := pl.fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
fits := status.IsSuccess()
if !fits {
if err := removePod(pi); err != nil {
return false, err
}
rpi := pi.Pod
victims = append(victims, rpi)
logger.V(5).Info("Pod is a potential preemption victim on node", "pod", klog.KObj(rpi), "node", klog.KObj(nodeInfo.Node()))
}
return fits, nil
}
for _, p := range violatingVictims {
if fits, err := reprievePod(p); err != nil {
return nil, 0, framework.AsStatus(err)
} else if !fits {
numViolatingVictim++
}
}
// Now we try to reprieve non-violating victims.
for _, p := range nonViolatingVictims {
if _, err := reprievePod(p); err != nil {
return nil, 0, framework.AsStatus(err)
}
}
// Sort victims after reprieving pods to keep the pods in the victims sorted in order of priority from high to low.
if len(violatingVictims) != 0 && len(nonViolatingVictims) != 0 {
sort.Slice(victims, func(i, j int) bool { return util.MoreImportantPod(victims[i], victims[j]) })
}
return victims, numViolatingVictim, framework.NewStatus(framework.Success)
}
// PodEligibleToPreemptOthers returns one bool and one string. The bool
// indicates whether this pod should be considered for preempting other pods or
// not. The string includes the reason if this pod isn't eligible.
// There are several reasons:
// 1. The pod has a preemptionPolicy of Never.
// 2. The pod has already preempted other pods and the victims are in their graceful termination period.
// Currently we check the node that is nominated for this pod, and as long as there are
// terminating pods on this node, we don't attempt to preempt more pods.
func (pl *DefaultPreemption) PodEligibleToPreemptOthers(_ context.Context, pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string) {
if pod.Spec.PreemptionPolicy != nil && *pod.Spec.PreemptionPolicy == v1.PreemptNever {
return false, "not eligible due to preemptionPolicy=Never."
}
nodeInfos := pl.fh.SnapshotSharedLister().NodeInfos()
nomNodeName := pod.Status.NominatedNodeName
if len(nomNodeName) > 0 {
// If the pod's nominated node is considered as UnschedulableAndUnresolvable by the filters,
// then the pod should be considered for preempting again.
if nominatedNodeStatus.Code() == framework.UnschedulableAndUnresolvable {
return true, ""
}
if nodeInfo, _ := nodeInfos.Get(nomNodeName); nodeInfo != nil {
podPriority := corev1helpers.PodPriority(pod)
for _, p := range nodeInfo.Pods {
if corev1helpers.PodPriority(p.Pod) < podPriority && podTerminatingByPreemption(p.Pod) {
// There is a terminating pod on the nominated node.
return false, "not eligible due to a terminating pod on the nominated node."
}
}
}
}
return true, ""
}
// OrderedScoreFuncs returns a list of ordered score functions to select preferable node where victims will be preempted.
func (pl *DefaultPreemption) OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64 {
return nil
}
// podTerminatingByPreemption returns true if the pod is in the termination state caused by scheduler preemption.
func podTerminatingByPreemption(p *v1.Pod) bool {
if p.DeletionTimestamp == nil {
return false
}
for _, condition := range p.Status.Conditions {
if condition.Type == v1.DisruptionTarget {
return condition.Status == v1.ConditionTrue && condition.Reason == v1.PodReasonPreemptionByScheduler
}
}
return false
}
// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods"
// and "nonViolatingPods" based on whether their PDBs will be violated if they are
// preempted.
// This function is stable and does not change the order of received pods. So, if it
// receives a sorted list, grouping will preserve the order of the input list.
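// Illustrative example (not from the code itself): if a PDB with DisruptionsAllowed=1 matches
// two of the candidate pods in its namespace, the first matching pod drops the remaining
// budget to 0 and stays in the non-violating group, while the second drives it to -1 and is
// grouped as violating.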
func filterPodsWithPDBViolation(podInfos []*framework.PodInfo, pdbs []*policy.PodDisruptionBudget) (violatingPodInfos, nonViolatingPodInfos []*framework.PodInfo) {
pdbsAllowed := make([]int32, len(pdbs))
for i, pdb := range pdbs {
pdbsAllowed[i] = pdb.Status.DisruptionsAllowed
}
for _, podInfo := range podInfos {
pod := podInfo.Pod
pdbForPodIsViolated := false
// A pod with no labels will not match any PDB. So, no need to check.
if len(pod.Labels) != 0 {
for i, pdb := range pdbs {
if pdb.Namespace != pod.Namespace {
continue
}
selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector)
if err != nil {
// This object has an invalid selector; it does not match the pod
continue
}
// A PDB with a nil or empty selector matches nothing.
if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
continue
}
// Existing in DisruptedPods means it has been processed in API server,
// we don't treat it as a violating case.
if _, exist := pdb.Status.DisruptedPods[pod.Name]; exist {
continue
}
// Only decrement the matched pdb when it's not in its <DisruptedPods>;
// otherwise we may over-decrement the budget number.
pdbsAllowed[i]--
// We have found a matching PDB.
if pdbsAllowed[i] < 0 {
pdbForPodIsViolated = true
}
}
}
if pdbForPodIsViolated {
violatingPodInfos = append(violatingPodInfos, podInfo)
} else {
nonViolatingPodInfos = append(nonViolatingPodInfos, podInfo)
}
}
return violatingPodInfos, nonViolatingPodInfos
}
func getPDBLister(informerFactory informers.SharedInformerFactory) policylisters.PodDisruptionBudgetLister {
return informerFactory.Policy().V1().PodDisruptionBudgets().Lister()
}

View File

@ -0,0 +1,9 @@
# See the OWNERS docs at https://go.k8s.io/owners
reviewers:
- klueska
- pohly
- bart0sh
labels:
- sig/node
- wg/device-management

View File

@ -0,0 +1,175 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dynamicresources
import (
"sync"
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/cache"
"k8s.io/dynamic-resource-allocation/structured"
"k8s.io/klog/v2"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
"k8s.io/utils/ptr"
)
// foreachAllocatedDevice invokes the provided callback for each
// device in the claim's allocation result which was allocated
// exclusively for the claim.
//
// Devices allocated with admin access can be shared with other
// claims and are skipped without invoking the callback.
//
// foreachAllocatedDevice does nothing if the claim is not allocated.
func foreachAllocatedDevice(claim *resourceapi.ResourceClaim, cb func(deviceID structured.DeviceID)) {
if claim.Status.Allocation == nil {
return
}
for _, result := range claim.Status.Allocation.Devices.Results {
// Kubernetes 1.31 did not set this, 1.32 always does.
// Supporting 1.31 is not worth the additional code that
// would have to be written (= looking up in request) because
// it is extremely unlikely that there really is a result
// that still exists in a cluster from 1.31 where this matters.
if ptr.Deref(result.AdminAccess, false) {
// Is not considered as allocated.
continue
}
deviceID := structured.MakeDeviceID(result.Driver, result.Pool, result.Device)
// None of the users of this helper need to abort iterating,
// therefore it's not supported as it only would add overhead.
cb(deviceID)
}
}
// allocatedDevices reacts to events in a cache and maintains a set of all allocated devices.
// This is cheaper than repeatedly calling List, making strings unique, and building the set
// each time PreFilter is called.
//
// All methods are thread-safe. Get returns a cloned set.
type allocatedDevices struct {
logger klog.Logger
mutex sync.RWMutex
ids sets.Set[structured.DeviceID]
}
func newAllocatedDevices(logger klog.Logger) *allocatedDevices {
return &allocatedDevices{
logger: logger,
ids: sets.New[structured.DeviceID](),
}
}
func (a *allocatedDevices) Get() sets.Set[structured.DeviceID] {
a.mutex.RLock()
defer a.mutex.RUnlock()
return a.ids.Clone()
}
func (a *allocatedDevices) handlers() cache.ResourceEventHandler {
return cache.ResourceEventHandlerFuncs{
AddFunc: a.onAdd,
UpdateFunc: a.onUpdate,
DeleteFunc: a.onDelete,
}
}
func (a *allocatedDevices) onAdd(obj any) {
claim, _, err := schedutil.As[*resourceapi.ResourceClaim](obj, nil)
if err != nil {
// Shouldn't happen.
a.logger.Error(err, "unexpected object in allocatedDevices.onAdd")
return
}
if claim.Status.Allocation != nil {
a.addDevices(claim)
}
}
func (a *allocatedDevices) onUpdate(oldObj, newObj any) {
originalClaim, modifiedClaim, err := schedutil.As[*resourceapi.ResourceClaim](oldObj, newObj)
if err != nil {
// Shouldn't happen.
a.logger.Error(err, "unexpected object in allocatedDevices.onUpdate")
return
}
switch {
case originalClaim.Status.Allocation == nil && modifiedClaim.Status.Allocation != nil:
a.addDevices(modifiedClaim)
case originalClaim.Status.Allocation != nil && modifiedClaim.Status.Allocation == nil:
a.removeDevices(originalClaim)
default:
// Nothing to do. Either both nil or both non-nil, in which case the content
// also must be the same (immutable!).
}
}
func (a *allocatedDevices) onDelete(obj any) {
claim, _, err := schedutil.As[*resourceapi.ResourceClaim](obj, nil)
if err != nil {
// Shouldn't happen.
a.logger.Error(err, "unexpected object in allocatedDevices.onDelete")
return
}
a.removeDevices(claim)
}
func (a *allocatedDevices) addDevices(claim *resourceapi.ResourceClaim) {
if claim.Status.Allocation == nil {
return
}
// Locking of the mutex gets minimized by pre-computing what needs to be done
// without holding the lock.
deviceIDs := make([]structured.DeviceID, 0, 20)
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
a.logger.V(6).Info("Observed device allocation", "device", deviceID, "claim", klog.KObj(claim))
deviceIDs = append(deviceIDs, deviceID)
})
a.mutex.Lock()
defer a.mutex.Unlock()
for _, deviceID := range deviceIDs {
a.ids.Insert(deviceID)
}
}
func (a *allocatedDevices) removeDevices(claim *resourceapi.ResourceClaim) {
if claim.Status.Allocation == nil {
return
}
// Locking of the mutex gets minimized by pre-computing what needs to be done
// without holding the lock.
deviceIDs := make([]structured.DeviceID, 0, 20)
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
a.logger.V(6).Info("Observed device deallocation", "device", deviceID, "claim", klog.KObj(claim))
deviceIDs = append(deviceIDs, deviceID)
})
a.mutex.Lock()
defer a.mutex.Unlock()
for _, deviceID := range deviceIDs {
a.ids.Delete(deviceID)
}
}

View File

@ -0,0 +1,226 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dynamicresources
import (
"context"
"fmt"
"sync"
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/informers"
resourcelisters "k8s.io/client-go/listers/resource/v1beta1"
"k8s.io/dynamic-resource-allocation/structured"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
)
var _ framework.SharedDRAManager = &DefaultDRAManager{}
// DefaultDRAManager is the default implementation of SharedDRAManager. It obtains the DRA objects
// from API informers, and uses an AssumeCache and a map of in-flight allocations in order
// to avoid race conditions when modifying ResourceClaims.
type DefaultDRAManager struct {
resourceClaimTracker *claimTracker
resourceSliceLister *resourceSliceLister
deviceClassLister *deviceClassLister
}
func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, informerFactory informers.SharedInformerFactory) *DefaultDRAManager {
logger := klog.FromContext(ctx)
manager := &DefaultDRAManager{
resourceClaimTracker: &claimTracker{
cache: claimsCache,
inFlightAllocations: &sync.Map{},
allocatedDevices: newAllocatedDevices(logger),
logger: logger,
},
resourceSliceLister: &resourceSliceLister{sliceLister: informerFactory.Resource().V1beta1().ResourceSlices().Lister()},
deviceClassLister: &deviceClassLister{classLister: informerFactory.Resource().V1beta1().DeviceClasses().Lister()},
}
// Reacting to events is more efficient than iterating over the list
// repeatedly in PreFilter.
manager.resourceClaimTracker.cache.AddEventHandler(manager.resourceClaimTracker.allocatedDevices.handlers())
return manager
}
func (s *DefaultDRAManager) ResourceClaims() framework.ResourceClaimTracker {
return s.resourceClaimTracker
}
func (s *DefaultDRAManager) ResourceSlices() framework.ResourceSliceLister {
return s.resourceSliceLister
}
func (s *DefaultDRAManager) DeviceClasses() framework.DeviceClassLister {
return s.deviceClassLister
}
var _ framework.ResourceSliceLister = &resourceSliceLister{}
type resourceSliceLister struct {
sliceLister resourcelisters.ResourceSliceLister
}
func (l *resourceSliceLister) List() ([]*resourceapi.ResourceSlice, error) {
return l.sliceLister.List(labels.Everything())
}
var _ framework.DeviceClassLister = &deviceClassLister{}
type deviceClassLister struct {
classLister resourcelisters.DeviceClassLister
}
func (l *deviceClassLister) Get(className string) (*resourceapi.DeviceClass, error) {
return l.classLister.Get(className)
}
func (l *deviceClassLister) List() ([]*resourceapi.DeviceClass, error) {
return l.classLister.List(labels.Everything())
}
var _ framework.ResourceClaimTracker = &claimTracker{}
type claimTracker struct {
// cache enables temporarily storing a newer claim object
// while the scheduler has allocated it and the corresponding object
// update from the apiserver has not been processed by the claim
// informer callbacks. Claim objects get added here in PreBind and removed by
// the informer callback (based on the "newer than" comparison in the
// assume cache).
//
// It uses cache.MetaNamespaceKeyFunc to generate object names, which
// therefore are "<namespace>/<name>".
//
// This is necessary to ensure that reconstructing the resource usage
// at the start of a pod scheduling cycle doesn't reuse the resources
// assigned to such a claim. Alternatively, claim allocation state
// could also get tracked across pod scheduling cycles, but that
// - adds complexity (need to carefully sync state with informer events
// for claims and ResourceSlices)
// - would make integration with cluster autoscaler harder because it would need
// to trigger informer callbacks.
cache *assumecache.AssumeCache
// inFlightAllocations is a map from claim UUIDs to claim objects for those claims
// for which allocation was triggered during a scheduling cycle and the
// corresponding claim status update call in PreBind has not been done
// yet. If another pod needs the claim, the pod is treated as "not
// schedulable yet". The cluster event for the claim status update will
// make it schedulable.
//
// This mechanism avoids the following problem:
// - Pod A triggers allocation for claim X.
// - Pod B shares access to that claim and gets scheduled because
// the claim is assumed to be allocated.
// - PreBind for pod B is called first, tries to update reservedFor and
// fails because the claim is not really allocated yet.
//
// We could avoid the ordering problem by allowing either pod A or pod B
// to set the allocation. But that is more complicated and leads to another
// problem:
// - Pod A and B get scheduled as above.
// - PreBind for pod A gets called first, then fails with a temporary API error.
// It removes the updated claim from the assume cache because of that.
// - PreBind for pod B gets called next and succeeds with adding the
// allocation and its own reservedFor entry.
// - The assume cache is now not reflecting that the claim is allocated,
// which could lead to reusing the same resource for some other claim.
//
// A sync.Map is used because in practice sharing of a claim between
// pods is expected to be rare compared to per-pod claim, so we end up
// hitting the "multiple goroutines read, write, and overwrite entries
// for disjoint sets of keys" case that sync.Map is optimized for.
inFlightAllocations *sync.Map
allocatedDevices *allocatedDevices
logger klog.Logger
}
func (c *claimTracker) ClaimHasPendingAllocation(claimUID types.UID) bool {
_, found := c.inFlightAllocations.Load(claimUID)
return found
}
func (c *claimTracker) SignalClaimPendingAllocation(claimUID types.UID, allocatedClaim *resourceapi.ResourceClaim) error {
c.inFlightAllocations.Store(claimUID, allocatedClaim)
// There's no reason to return an error in this implementation, but the error is helpful for other implementations.
// For example, implementations that have to deal with fake claims might want to return an error if the allocation
// is for an invalid claim.
return nil
}
func (c *claimTracker) RemoveClaimPendingAllocation(claimUID types.UID) (deleted bool) {
_, found := c.inFlightAllocations.LoadAndDelete(claimUID)
return found
}
func (c *claimTracker) Get(namespace, claimName string) (*resourceapi.ResourceClaim, error) {
obj, err := c.cache.Get(namespace + "/" + claimName)
if err != nil {
return nil, err
}
claim, ok := obj.(*resourceapi.ResourceClaim)
if !ok {
return nil, fmt.Errorf("unexpected object type %T for assumed object %s/%s", obj, namespace, claimName)
}
return claim, nil
}
func (c *claimTracker) List() ([]*resourceapi.ResourceClaim, error) {
var result []*resourceapi.ResourceClaim
// Probably not worth adding an index for?
objs := c.cache.List(nil)
for _, obj := range objs {
claim, ok := obj.(*resourceapi.ResourceClaim)
if ok {
result = append(result, claim)
}
}
return result, nil
}
func (c *claimTracker) ListAllAllocatedDevices() (sets.Set[structured.DeviceID], error) {
// Start with a fresh set that matches the current known state of the
// world according to the informers.
allocated := c.allocatedDevices.Get()
// Whatever is in flight also has to be checked.
c.inFlightAllocations.Range(func(key, value any) bool {
claim := value.(*resourceapi.ResourceClaim)
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
c.logger.V(6).Info("Device is in flight for allocation", "device", deviceID, "claim", klog.KObj(claim))
allocated.Insert(deviceID)
})
return true
})
// There's no reason to return an error in this implementation, but the error might be helpful for other implementations.
return allocated, nil
}
func (c *claimTracker) AssumeClaimAfterAPICall(claim *resourceapi.ResourceClaim) error {
return c.cache.Assume(claim)
}
func (c *claimTracker) AssumedClaimRestore(namespace, claimName string) {
c.cache.Restore(namespace + "/" + claimName)
}

View File

@ -0,0 +1,905 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dynamicresources
import (
"context"
"errors"
"fmt"
"slices"
"sync"
"github.com/google/go-cmp/cmp"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/util/retry"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/dynamic-resource-allocation/cel"
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/dynamic-resource-allocation/structured"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
const (
// Name is the name of the plugin used in Registry and configurations.
Name = names.DynamicResources
stateKey framework.StateKey = Name
)
// The state is initialized in the PreFilter phase. Because we save the pointer in
// framework.CycleState, later phases don't need to call the Write method
// to update the value.
type stateData struct {
// A copy of all claims for the Pod (i.e. 1:1 match with
// pod.Spec.ResourceClaims), initially with the status from the start
// of the scheduling cycle. Each claim instance is read-only because it
// might come from the informer cache. The instances get replaced when
// the plugin itself successfully does an Update.
//
// Empty if the Pod has no claims.
claims []*resourceapi.ResourceClaim
// Allocator handles claims with structured parameters.
allocator *structured.Allocator
// mutex must be locked while accessing any of the fields below.
mutex sync.Mutex
// The indices of all claims that:
// - are allocated
// - use delayed allocation or the builtin controller
// - were not available on at least one node
//
// Set in parallel during Filter, so write access there must be
// protected by the mutex. Used by PostFilter.
unavailableClaims sets.Set[int]
informationsForClaim []informationForClaim
// nodeAllocations caches the result of Filter for the nodes.
nodeAllocations map[string][]resourceapi.AllocationResult
}
func (d *stateData) Clone() framework.StateData {
return d
}
type informationForClaim struct {
// Node selector based on the claim status if allocated.
availableOnNodes *nodeaffinity.NodeSelector
// Set by Reserved, published by PreBind.
allocation *resourceapi.AllocationResult
}
// DynamicResources is a plugin that ensures that ResourceClaims are allocated.
type DynamicResources struct {
enabled bool
enableAdminAccess bool
enableSchedulingQueueHint bool
fh framework.Handle
clientset kubernetes.Interface
celCache *cel.Cache
draManager framework.SharedDRAManager
}
// New initializes a new plugin and returns it.
func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
if !fts.EnableDynamicResourceAllocation {
// Disabled, won't do anything.
return &DynamicResources{}, nil
}
pl := &DynamicResources{
enabled: true,
enableAdminAccess: fts.EnableDRAAdminAccess,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
fh: fh,
clientset: fh.ClientSet(),
// This is an LRU cache for compiled CEL expressions. The most
// recent 10 of them get reused across different scheduling
// cycles.
celCache: cel.NewCache(10),
draManager: fh.SharedDRAManager(),
}
return pl, nil
}
var _ framework.PreEnqueuePlugin = &DynamicResources{}
var _ framework.PreFilterPlugin = &DynamicResources{}
var _ framework.FilterPlugin = &DynamicResources{}
var _ framework.PostFilterPlugin = &DynamicResources{}
var _ framework.ReservePlugin = &DynamicResources{}
var _ framework.EnqueueExtensions = &DynamicResources{}
var _ framework.PreBindPlugin = &DynamicResources{}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *DynamicResources) Name() string {
return Name
}
// EventsToRegister returns the possible events that may make a Pod that was
// failed by this plugin schedulable.
func (pl *DynamicResources) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
if !pl.enabled {
return nil, nil
}
// A resource might depend on node labels for topology filtering.
// A new or updated node may make pods schedulable.
//
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.enableSchedulingQueueHint {
// When QHint is enabled, the problematic preCheck is already removed, and we can remove UpdateNodeTaint.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
events := []framework.ClusterEventWithHint{
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
// Allocation is tracked in ResourceClaims, so any changes may make the pods schedulable.
{Event: framework.ClusterEvent{Resource: framework.ResourceClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClaimChange},
// Adding the ResourceClaim name to the pod status makes pods waiting for their ResourceClaim schedulable.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodGeneratedResourceClaim}, QueueingHintFn: pl.isSchedulableAfterPodChange},
// A pod might be waiting for a class to get created or modified.
{Event: framework.ClusterEvent{Resource: framework.DeviceClass, ActionType: framework.Add | framework.Update}},
// Adding or updating a ResourceSlice might make a pod schedulable because new resources became available.
{Event: framework.ClusterEvent{Resource: framework.ResourceSlice, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterResourceSliceChange},
}
return events, nil
}
// PreEnqueue checks if there are known reasons why a pod currently cannot be
// scheduled. When this fails, one of the registered events can trigger another
// attempt.
func (pl *DynamicResources) PreEnqueue(ctx context.Context, pod *v1.Pod) (status *framework.Status) {
if !pl.enabled {
return nil
}
if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
return statusUnschedulable(klog.FromContext(ctx), err.Error())
}
return nil
}
// isSchedulableAfterClaimChange is invoked for add and update claim events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable. It errs on the side of letting a pod scheduling attempt
// happen. The delete claim event will not invoke it, so newObj will never be nil.
func (pl *DynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalClaim, modifiedClaim, err := schedutil.As[*resourceapi.ResourceClaim](oldObj, newObj)
if err != nil {
// Shouldn't happen.
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err)
}
usesClaim := false
if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourceapi.ResourceClaim) {
if claim.UID == modifiedClaim.UID {
usesClaim = true
}
}); err != nil {
// This is not an unexpected error: we know that
// foreachPodResourceClaim only returns errors for "not
// schedulable".
if loggerV := logger.V(6); loggerV.Enabled() {
owner := metav1.GetControllerOf(modifiedClaim)
loggerV.Info("pod is not schedulable after resource claim change", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "claimOwner", owner, "reason", err.Error())
}
return framework.QueueSkip, nil
}
if originalClaim != nil &&
originalClaim.Status.Allocation != nil &&
modifiedClaim.Status.Allocation == nil {
// A claim with structured parameters was deallocated. This might have made
// resources available for other pods.
logger.V(6).Info("claim with structured parameters got deallocated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.Queue, nil
}
if !usesClaim {
// This was not the claim the pod was waiting for.
logger.V(6).Info("unrelated claim got modified", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.QueueSkip, nil
}
if originalClaim == nil {
logger.V(5).Info("claim for pod got created", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.Queue, nil
}
// Modifications may or may not be relevant. If the entire
// status is as before, then something else must have changed
// and we don't care. What happens in practice is that the
// resource driver adds the finalizer.
if apiequality.Semantic.DeepEqual(&originalClaim.Status, &modifiedClaim.Status) {
if loggerV := logger.V(7); loggerV.Enabled() {
// Log more information.
loggerV.Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "diff", cmp.Diff(originalClaim, modifiedClaim))
} else {
logger.V(6).Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
}
return framework.QueueSkip, nil
}
logger.V(5).Info("status of claim for pod got updated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.Queue, nil
}
// isSchedulableAfterPodChange is invoked for update pod events reported by
// an informer. It checks whether that change adds the ResourceClaim(s) that the
// pod has been waiting for.
func (pl *DynamicResources) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := schedutil.As[*v1.Pod](nil, newObj)
if err != nil {
// Shouldn't happen.
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err)
}
if pod.UID != modifiedPod.UID {
logger.V(7).Info("pod is not schedulable after change in other pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
if err := pl.foreachPodResourceClaim(modifiedPod, nil); err != nil {
// This is not an unexpected error: we know that
// foreachPodResourceClaim only returns errors for "not
// schedulable".
logger.V(6).Info("pod is not schedulable after being updated", "pod", klog.KObj(pod))
return framework.QueueSkip, nil
}
logger.V(5).Info("pod got updated and is schedulable", "pod", klog.KObj(pod))
return framework.Queue, nil
}
// isSchedulableAfterResourceSliceChange is invoked for add and update slice events reported by
// an informer. Such changes can make an unschedulable pod schedulable when the pod requests a device
// and the change adds a suitable device.
//
// For the sake of faster execution and avoiding code duplication, isSchedulableAfterResourceSliceChange
// only checks whether the pod uses claims. All of the more detailed checks are done in the scheduling
// attempt.
//
// The delete claim event will not invoke it, so newObj will never be nil.
func (pl *DynamicResources) isSchedulableAfterResourceSliceChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedSlice, err := schedutil.As[*resourceapi.ResourceSlice](oldObj, newObj)
if err != nil {
// Shouldn't happen.
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterResourceSliceChange: %w", err)
}
if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
// This is not an unexpected error: we know that
// foreachPodResourceClaim only returns errors for "not
// schedulable".
logger.V(6).Info("pod is not schedulable after resource slice change", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(modifiedSlice), "reason", err.Error())
return framework.QueueSkip, nil
}
// We could check what got changed in the slice, but right now that's likely to be
// about the spec (there's no status yet...).
// We could check whether all claims use classic DRA, but that doesn't seem worth it.
// Let's assume that changing the slice may make the pod schedulable.
logger.V(5).Info("ResourceSlice change might make pod schedulable", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(modifiedSlice))
return framework.Queue, nil
}
// podResourceClaims returns the ResourceClaims for all entries in pod.Spec.ResourceClaims.
func (pl *DynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourceapi.ResourceClaim, error) {
claims := make([]*resourceapi.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourceapi.ResourceClaim) {
// We store the pointer as returned by the lister. The
// assumption is that if a claim gets modified while our code
// runs, the cache will store a new pointer, not mutate the
// existing object that we point to here.
claims = append(claims, claim)
}); err != nil {
return nil, err
}
return claims, nil
}
// foreachPodResourceClaim checks that each ResourceClaim for the pod exists.
// It calls an optional handler for those claims that it finds.
func (pl *DynamicResources) foreachPodResourceClaim(pod *v1.Pod, cb func(podResourceName string, claim *resourceapi.ResourceClaim)) error {
for _, resource := range pod.Spec.ResourceClaims {
claimName, mustCheckOwner, err := resourceclaim.Name(pod, &resource)
if err != nil {
return err
}
// The claim name might be nil if no underlying resource claim
// was generated for the referenced claim. There are valid use
// cases when this might happen, so we simply skip it.
if claimName == nil {
continue
}
claim, err := pl.draManager.ResourceClaims().Get(pod.Namespace, *claimName)
if err != nil {
return err
}
if claim.DeletionTimestamp != nil {
return fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
}
if mustCheckOwner {
if err := resourceclaim.IsForPod(pod, claim); err != nil {
return err
}
}
if cb != nil {
cb(resource.Name, claim)
}
}
return nil
}
// PreFilter invoked at the prefilter extension point to check if pod has all
// immediate claims bound. UnschedulableAndUnresolvable is returned if
// the pod cannot be scheduled at the moment on any node.
func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
if !pl.enabled {
return nil, framework.NewStatus(framework.Skip)
}
logger := klog.FromContext(ctx)
// If the pod does not reference any claim, we don't need to do
// anything for it. We just initialize an empty state to record that
// observation for the other functions. This gets updated below
// if we get that far.
s := &stateData{}
state.Write(stateKey, s)
claims, err := pl.podResourceClaims(pod)
if err != nil {
return nil, statusUnschedulable(logger, err.Error())
}
logger.V(5).Info("pod resource claims", "pod", klog.KObj(pod), "resourceclaims", klog.KObjSlice(claims))
// If the pod does not reference any claim,
// DynamicResources Filter has nothing to do with the Pod.
if len(claims) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
// All claims which the scheduler needs to allocate itself.
allocateClaims := make([]*resourceapi.ResourceClaim, 0, len(claims))
s.informationsForClaim = make([]informationForClaim, len(claims))
for index, claim := range claims {
if claim.Status.Allocation != nil &&
!resourceclaim.CanBeReserved(claim) &&
!resourceclaim.IsReservedForPod(pod, claim) {
// Resource is in use. The pod has to wait.
return nil, statusUnschedulable(logger, "resourceclaim in use", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
}
if claim.Status.Allocation != nil {
if claim.Status.Allocation.NodeSelector != nil {
nodeSelector, err := nodeaffinity.NewNodeSelector(claim.Status.Allocation.NodeSelector)
if err != nil {
return nil, statusError(logger, err)
}
s.informationsForClaim[index].availableOnNodes = nodeSelector
}
} else {
allocateClaims = append(allocateClaims, claim)
// Allocation in flight? Better wait for that
// to finish, see inFlightAllocations
// documentation for details.
if pl.draManager.ResourceClaims().ClaimHasPendingAllocation(claim.UID) {
return nil, statusUnschedulable(logger, fmt.Sprintf("resource claim %s is in the process of being allocated", klog.KObj(claim)))
}
// Check all requests and device classes. If a class
// does not exist, scheduling cannot proceed, no matter
// how the claim is being allocated.
//
// When using a control plane controller, a class might
// have a node filter. This is useful for trimming the
// initial set of potential nodes before we ask the
// driver(s) for information about the specific pod.
for _, request := range claim.Spec.Devices.Requests {
if request.DeviceClassName == "" {
return nil, statusError(logger, fmt.Errorf("request %s: unsupported request type", request.Name))
}
_, err := pl.draManager.DeviceClasses().Get(request.DeviceClassName)
if err != nil {
// If the class cannot be retrieved, allocation cannot proceed.
if apierrors.IsNotFound(err) {
// Here we mark the pod as "unschedulable", so it'll sleep in
// the unschedulable queue until a DeviceClass event occurs.
return nil, statusUnschedulable(logger, fmt.Sprintf("request %s: device class %s does not exist", request.Name, request.DeviceClassName))
}
// Other error, retry with backoff.
return nil, statusError(logger, fmt.Errorf("request %s: look up device class: %w", request.Name, err))
}
}
}
}
if len(allocateClaims) > 0 {
logger.V(5).Info("Preparing allocation with structured parameters", "pod", klog.KObj(pod), "resourceclaims", klog.KObjSlice(allocateClaims))
// Doing this over and over again for each pod could be avoided
// by setting the allocator up once and then keeping it up-to-date
// as changes are observed.
//
// But that would cause problems for using the plugin in the
// Cluster Autoscaler. If this step here turns out to be
// expensive, we may have to maintain and update state more
// persistently.
//
// Claims (and thus their devices) are treated as "allocated" if they are in the assume cache
// or their allocation is currently in-flight. This does not change
// during filtering, so we can determine that once.
allAllocatedDevices, err := pl.draManager.ResourceClaims().ListAllAllocatedDevices()
if err != nil {
return nil, statusError(logger, err)
}
slices, err := pl.draManager.ResourceSlices().List()
if err != nil {
return nil, statusError(logger, err)
}
allocator, err := structured.NewAllocator(ctx, pl.enableAdminAccess, allocateClaims, allAllocatedDevices, pl.draManager.DeviceClasses(), slices, pl.celCache)
if err != nil {
return nil, statusError(logger, err)
}
s.allocator = allocator
s.nodeAllocations = make(map[string][]resourceapi.AllocationResult)
}
s.claims = claims
return nil, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *DynamicResources) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
func getStateData(cs *framework.CycleState) (*stateData, error) {
state, err := cs.Read(stateKey)
if err != nil {
return nil, err
}
s, ok := state.(*stateData)
if !ok {
return nil, errors.New("unable to convert state into stateData")
}
return s, nil
}
// Filter invoked at the filter extension point.
// It evaluates if a pod can fit due to the resources it requests,
// for both allocated and unallocated claims.
//
// For claims that are bound, it checks that the node affinity is
// satisfied by the given node.
//
// For claims that are unbound, it checks whether the claim might get allocated
// for the node.
func (pl *DynamicResources) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
if !pl.enabled {
return nil
}
state, err := getStateData(cs)
if err != nil {
return statusError(klog.FromContext(ctx), err)
}
if len(state.claims) == 0 {
return nil
}
logger := klog.FromContext(ctx)
node := nodeInfo.Node()
var unavailableClaims []int
for index, claim := range state.claims {
logger.V(10).Info("filtering based on resource claims of the pod", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
// This node selector only gets set if the claim is allocated.
if nodeSelector := state.informationsForClaim[index].availableOnNodes; nodeSelector != nil && !nodeSelector.Match(node) {
logger.V(5).Info("allocation's node selector does not match", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
unavailableClaims = append(unavailableClaims, index)
}
}
// Use allocator to check the node and cache the result in case that the node is picked.
var allocations []resourceapi.AllocationResult
if state.allocator != nil {
allocCtx := ctx
if loggerV := logger.V(5); loggerV.Enabled() {
allocCtx = klog.NewContext(allocCtx, klog.LoggerWithValues(logger, "node", klog.KObj(node)))
}
a, err := state.allocator.Allocate(allocCtx, node)
if err != nil {
// This should only fail if there is something wrong with the claim or class.
// Return an error to abort scheduling of it.
//
// This will cause retries. It would be slightly nicer to mark it as unschedulable
// *and* abort scheduling. Then only cluster event for updating the claim or class
// with the broken CEL expression would trigger rescheduling.
//
// But we cannot do both. As this shouldn't occur often, aborting like this is
// better than the more complicated alternative (return Unschedulable here, remember
// the error, then raise it again later if needed).
return statusError(logger, err, "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaims", klog.KObjSlice(state.allocator.ClaimsToAllocate()))
}
// Check for exact length just to be sure. In practice this is all-or-nothing.
if len(a) != len(state.allocator.ClaimsToAllocate()) {
return statusUnschedulable(logger, "cannot allocate all claims", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaims", klog.KObjSlice(state.allocator.ClaimsToAllocate()))
}
// Reserve uses this information.
allocations = a
}
// Store information in state while holding the mutex.
if state.allocator != nil || len(unavailableClaims) > 0 {
state.mutex.Lock()
defer state.mutex.Unlock()
}
if len(unavailableClaims) > 0 {
// Remember all unavailable claims. This might be observed
// concurrently, so we have to lock the state before writing.
if state.unavailableClaims == nil {
state.unavailableClaims = sets.New[int]()
}
for _, index := range unavailableClaims {
state.unavailableClaims.Insert(index)
}
return statusUnschedulable(logger, "resourceclaim not available on the node", "pod", klog.KObj(pod))
}
if state.allocator != nil {
state.nodeAllocations[node.Name] = allocations
}
return nil
}
// PostFilter checks whether there are allocated claims that could get
// deallocated to help get the Pod schedulable. If yes, it picks one and
// requests its deallocation. This only gets called when filtering found no
// suitable node.
func (pl *DynamicResources) PostFilter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, filteredNodeStatusMap framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
if !pl.enabled {
return nil, framework.NewStatus(framework.Unschedulable, "plugin disabled")
}
logger := klog.FromContext(ctx)
state, err := getStateData(cs)
if err != nil {
return nil, statusError(logger, err)
}
if len(state.claims) == 0 {
return nil, framework.NewStatus(framework.Unschedulable, "no new claims to deallocate")
}
// Iterating over a map is random. This is intentional here; we want to
// pick one claim randomly because there is no better heuristic.
for index := range state.unavailableClaims {
claim := state.claims[index]
if len(claim.Status.ReservedFor) == 0 ||
len(claim.Status.ReservedFor) == 1 && claim.Status.ReservedFor[0].UID == pod.UID {
claim := claim.DeepCopy()
claim.Status.ReservedFor = nil
claim.Status.Allocation = nil
logger.V(5).Info("Deallocation of ResourceClaim", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
if _, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
return nil, statusError(logger, err)
}
return nil, framework.NewStatus(framework.Unschedulable, "deallocation of ResourceClaim completed")
}
}
return nil, framework.NewStatus(framework.Unschedulable, "still not schedulable")
}
// Reserve reserves claims for the pod.
func (pl *DynamicResources) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (status *framework.Status) {
if !pl.enabled {
return nil
}
state, err := getStateData(cs)
if err != nil {
return statusError(klog.FromContext(ctx), err)
}
if len(state.claims) == 0 {
return nil
}
logger := klog.FromContext(ctx)
numClaimsWithAllocator := 0
for _, claim := range state.claims {
if claim.Status.Allocation != nil {
// Allocated, but perhaps not reserved yet. We checked in PreFilter that
// the pod could reserve the claim. Instead of reserving here by
// updating the ResourceClaim status, we assume that reserving
// will work and only do it for real during binding. If it fails at
// that time, some other pod was faster and we have to try again.
continue
}
numClaimsWithAllocator++
}
if numClaimsWithAllocator == 0 {
// Nothing left to do.
return nil
}
// Prepare allocation of claims handled by the scheduler.
if state.allocator != nil {
// Entries in these two slices match each other.
claimsToAllocate := state.allocator.ClaimsToAllocate()
allocations, ok := state.nodeAllocations[nodeName]
if !ok {
// We checked before that the node is suitable. This shouldn't have failed,
// so treat this as an error.
return statusError(logger, errors.New("claim allocation not found for node"))
}
// Sanity check: do we have results for all pending claims?
if len(allocations) != len(claimsToAllocate) ||
len(allocations) != numClaimsWithAllocator {
return statusError(logger, fmt.Errorf("internal error, have %d allocations, %d claims to allocate, want %d claims", len(allocations), len(claimsToAllocate), numClaimsWithAllocator))
}
for i, claim := range claimsToAllocate {
index := slices.Index(state.claims, claim)
if index < 0 {
return statusError(logger, fmt.Errorf("internal error, claim %s with allocation not found", claim.Name))
}
allocation := &allocations[i]
state.informationsForClaim[index].allocation = allocation
// Strictly speaking, we don't need to store the full modified object.
// The allocation would be enough. The full object is useful for
// debugging, testing and the allocator, so let's make it realistic.
claim = claim.DeepCopy()
if !slices.Contains(claim.Finalizers, resourceapi.Finalizer) {
claim.Finalizers = append(claim.Finalizers, resourceapi.Finalizer)
}
claim.Status.Allocation = allocation
err := pl.draManager.ResourceClaims().SignalClaimPendingAllocation(claim.UID, claim)
if err != nil {
return statusError(logger, fmt.Errorf("internal error, couldn't signal allocation for claim %s", claim.Name))
}
logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "allocation", klog.Format(allocation))
}
}
return nil
}
// Unreserve clears the ReservedFor field for all claims.
// It's idempotent, and does nothing if no state found for the given pod.
func (pl *DynamicResources) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
if !pl.enabled {
return
}
state, err := getStateData(cs)
if err != nil {
return
}
if len(state.claims) == 0 {
return
}
logger := klog.FromContext(ctx)
for index, claim := range state.claims {
// If allocation was in-flight, then it's not anymore and we need to revert the
// claim object in the assume cache to what it was before.
if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(state.claims[index].UID); deleted {
pl.draManager.ResourceClaims().AssumedClaimRestore(claim.Namespace, claim.Name)
}
if claim.Status.Allocation != nil &&
resourceclaim.IsReservedForPod(pod, claim) {
// Remove pod from ReservedFor. A strategic-merge-patch is used
// because that allows removing an individual entry without having
// the latest slice.
patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": { "reservedFor": [ {"$patch": "delete", "uid": %q} ] }}`,
claim.UID,
pod.UID,
)
logger.V(5).Info("unreserve", "resourceclaim", klog.KObj(claim), "pod", klog.KObj(pod))
claim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
if err != nil {
// We will get here again when pod scheduling is retried.
logger.Error(err, "unreserve", "resourceclaim", klog.KObj(claim))
}
}
}
}
// PreBind gets called in a separate goroutine after it has been determined
// that the pod should get bound to this node. Because Reserve did not actually
// reserve claims, we need to do it now. For claims with the builtin controller,
// we also handle the allocation.
//
// If anything fails, we return an error and
// the pod will have to go into the backoff queue. The scheduler will call
// Unreserve as part of the error handling.
func (pl *DynamicResources) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
if !pl.enabled {
return nil
}
state, err := getStateData(cs)
if err != nil {
return statusError(klog.FromContext(ctx), err)
}
if len(state.claims) == 0 {
return nil
}
logger := klog.FromContext(ctx)
for index, claim := range state.claims {
if !resourceclaim.IsReservedForPod(pod, claim) {
claim, err := pl.bindClaim(ctx, state, index, pod, nodeName)
if err != nil {
return statusError(logger, err)
}
state.claims[index] = claim
}
}
// If we get here, we know that reserving the claim for
// the pod worked and we can proceed with binding it.
return nil
}
// bindClaim gets called by PreBind for each claim which is not reserved for the pod yet.
// It might not even be allocated. bindClaim then ensures that the allocation
// and reservation are recorded. This finishes the work started in Reserve.
func (pl *DynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourceapi.ResourceClaim, finalErr error) {
logger := klog.FromContext(ctx)
claim := state.claims[index].DeepCopy()
allocation := state.informationsForClaim[index].allocation
defer func() {
if allocation != nil {
// The scheduler was handling allocation. Now that has
// completed, either successfully or with a failure.
if finalErr == nil {
// This can fail, but only for reasons that are okay (concurrent delete or update).
// Shouldn't happen in this case.
if err := pl.draManager.ResourceClaims().AssumeClaimAfterAPICall(claim); err != nil {
logger.V(5).Info("Claim not stored in assume cache", "err", finalErr)
}
}
pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claim.UID)
}
}()
logger.V(5).Info("preparing claim status update", "claim", klog.KObj(state.claims[index]), "allocation", klog.Format(allocation))
// We may run into a ResourceVersion conflict because there may be some
// benign concurrent changes. In that case we get the latest claim and
// try again.
refreshClaim := false
retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
if refreshClaim {
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Get(ctx, claim.Name, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("get updated claim %s after conflict: %w", klog.KObj(claim), err)
}
logger.V(5).Info("retrying update after conflict", "claim", klog.KObj(claim))
claim = updatedClaim
} else {
// All future retries must get a new claim first.
refreshClaim = true
}
if claim.DeletionTimestamp != nil {
return fmt.Errorf("claim %s got deleted in the meantime", klog.KObj(claim))
}
// Do we need to store an allocation result from Reserve?
if allocation != nil {
if claim.Status.Allocation != nil {
return fmt.Errorf("claim %s got allocated elsewhere in the meantime", klog.KObj(claim))
}
// The finalizer needs to be added in a normal update.
// If we were interrupted in the past, it might already be set and we simply continue.
if !slices.Contains(claim.Finalizers, resourceapi.Finalizer) {
claim.Finalizers = append(claim.Finalizers, resourceapi.Finalizer)
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("add finalizer to claim %s: %w", klog.KObj(claim), err)
}
claim = updatedClaim
}
claim.Status.Allocation = allocation
}
// We can simply try to add the pod here without checking
// preconditions. The apiserver will tell us with a
// non-conflict error if this isn't possible.
claim.Status.ReservedFor = append(claim.Status.ReservedFor, resourceapi.ResourceClaimConsumerReference{Resource: "pods", Name: pod.Name, UID: pod.UID})
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
if err != nil {
if allocation != nil {
return fmt.Errorf("add allocation and reservation to claim %s: %w", klog.KObj(claim), err)
}
return fmt.Errorf("add reservation to claim %s: %w", klog.KObj(claim), err)
}
claim = updatedClaim
return nil
})
if retryErr != nil {
return nil, retryErr
}
logger.V(5).Info("reserved", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.Format(claim))
return claim, nil
}
// statusUnschedulable ensures that there is a log message associated with the
// line where the status originated.
func statusUnschedulable(logger klog.Logger, reason string, kv ...interface{}) *framework.Status {
if loggerV := logger.V(5); loggerV.Enabled() {
helper, loggerV := loggerV.WithCallStackHelper()
helper()
kv = append(kv, "reason", reason)
// nolint: logcheck // warns because it cannot check key/values
loggerV.Info("pod unschedulable", kv...)
}
return framework.NewStatus(framework.UnschedulableAndUnresolvable, reason)
}
// statusError ensures that there is a log message associated with the
// line where the error originated.
func statusError(logger klog.Logger, err error, kv ...interface{}) *framework.Status {
if loggerV := logger.V(5); loggerV.Enabled() {
helper, loggerV := loggerV.WithCallStackHelper()
helper()
// nolint: logcheck // warns because it cannot check key/values
loggerV.Error(err, "dynamic resource plugin failed", kv...)
}
return framework.AsStatus(err)
}


@ -0,0 +1,33 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package feature
// Features carries feature gate values used by various plugins.
// This struct allows us to break the dependency of the plugins on
// the internal k8s features pkg.
type Features struct {
EnableDRAAdminAccess bool
EnableDynamicResourceAllocation bool
EnableVolumeCapacityPriority bool
EnableNodeInclusionPolicyInPodTopologySpread bool
EnableMatchLabelKeysInPodTopologySpread bool
EnableInPlacePodVerticalScaling bool
EnableSidecarContainers bool
EnableSchedulingQueueHint bool
EnableAsyncPreemption bool
EnablePodLevelResources bool
}


@ -0,0 +1,55 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helper
import (
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// DefaultNormalizeScore generates a Normalize Score function that can normalize the
// scores from [0, max(scores)] to [0, maxPriority]. If reverse is set to true, it
// reverses the scores by subtracting it from maxPriority.
// Note: The input scores are always assumed to be non-negative integers.
func DefaultNormalizeScore(maxPriority int64, reverse bool, scores framework.NodeScoreList) *framework.Status {
var maxCount int64
for i := range scores {
if scores[i].Score > maxCount {
maxCount = scores[i].Score
}
}
if maxCount == 0 {
if reverse {
for i := range scores {
scores[i].Score = maxPriority
}
}
return nil
}
for i := range scores {
score := scores[i].Score
score = maxPriority * score / maxCount
if reverse {
score = maxPriority - score
}
scores[i].Score = score
}
return nil
}
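// Illustrative usage (editor's sketch, not part of the upstream file; the node
// names and raw scores below are made up). With maxPriority=100 and reverse=false,
// every score is rescaled by maxPriority/max(scores):
//
//	scores := framework.NodeScoreList{
//		{Name: "node-a", Score: 10},
//		{Name: "node-b", Score: 40},
//	}
//	_ = DefaultNormalizeScore(100, false, scores)
//	// scores is now [{node-a 25} {node-b 100}]; with reverse=true it would be
//	// [{node-a 75} {node-b 0}].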


@ -0,0 +1,52 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helper
// FunctionShape represents a collection of FunctionShapePoint.
type FunctionShape []FunctionShapePoint
// FunctionShapePoint represents a shape point.
type FunctionShapePoint struct {
// Utilization is function argument.
Utilization int64
// Score is function value.
Score int64
}
// BuildBrokenLinearFunction creates a function which is built using linear segments. Segments are defined via shape array.
// Shape[i].Utilization values represent points on the "utilization" axis where different segments meet.
// Shape[i].Score represents the function value at each meeting point.
//
// function f(p) is defined as:
//
// shape[0].Score for p <= shape[0].Utilization
// shape[n-1].Score for p > shape[n-1].Utilization
//
// and linear interpolation between the two surrounding shape points for
// shape[i-1].Utilization < p <= shape[i].Utilization
func BuildBrokenLinearFunction(shape FunctionShape) func(int64) int64 {
return func(p int64) int64 {
for i := 0; i < len(shape); i++ {
if p <= int64(shape[i].Utilization) {
if i == 0 {
return shape[0].Score
}
return shape[i-1].Score + (shape[i].Score-shape[i-1].Score)*(p-shape[i-1].Utilization)/(shape[i].Utilization-shape[i-1].Utilization)
}
}
return shape[len(shape)-1].Score
}
}
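// Illustrative usage (editor's sketch, not part of the upstream file; the shape
// values are made up). A two-point shape that scores 10 at 0% utilization and 0
// at 100% yields a simple decreasing line, clamped outside the defined range:
//
//	f := BuildBrokenLinearFunction(FunctionShape{
//		{Utilization: 0, Score: 10},
//		{Utilization: 100, Score: 0},
//	})
//	_ = f(50)  // 5: halfway between the two points
//	_ = f(150) // 0: beyond the last point, the last score is returned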


@ -0,0 +1,116 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helper
import (
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
appslisters "k8s.io/client-go/listers/apps/v1"
corelisters "k8s.io/client-go/listers/core/v1"
)
var (
rcKind = v1.SchemeGroupVersion.WithKind("ReplicationController")
rsKind = appsv1.SchemeGroupVersion.WithKind("ReplicaSet")
ssKind = appsv1.SchemeGroupVersion.WithKind("StatefulSet")
)
// DefaultSelector returns a selector deduced from the Services, Replication
// Controllers, Replica Sets, and Stateful Sets matching the given pod.
func DefaultSelector(
pod *v1.Pod,
sl corelisters.ServiceLister,
cl corelisters.ReplicationControllerLister,
rsl appslisters.ReplicaSetLister,
ssl appslisters.StatefulSetLister,
) labels.Selector {
labelSet := make(labels.Set)
// Since services, RCs, RSs and SSs match the pod, they won't have conflicting
// labels. Merging is safe.
if services, err := GetPodServices(sl, pod); err == nil {
for _, service := range services {
labelSet = labels.Merge(labelSet, service.Spec.Selector)
}
}
selector := labelSet.AsSelector()
owner := metav1.GetControllerOfNoCopy(pod)
if owner == nil {
return selector
}
gv, err := schema.ParseGroupVersion(owner.APIVersion)
if err != nil {
return selector
}
gvk := gv.WithKind(owner.Kind)
switch gvk {
case rcKind:
if rc, err := cl.ReplicationControllers(pod.Namespace).Get(owner.Name); err == nil {
labelSet = labels.Merge(labelSet, rc.Spec.Selector)
selector = labelSet.AsSelector()
}
case rsKind:
if rs, err := rsl.ReplicaSets(pod.Namespace).Get(owner.Name); err == nil {
if other, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector); err == nil {
if r, ok := other.Requirements(); ok {
selector = selector.Add(r...)
}
}
}
case ssKind:
if ss, err := ssl.StatefulSets(pod.Namespace).Get(owner.Name); err == nil {
if other, err := metav1.LabelSelectorAsSelector(ss.Spec.Selector); err == nil {
if r, ok := other.Requirements(); ok {
selector = selector.Add(r...)
}
}
}
default:
// Not owned by a supported controller.
}
return selector
}
// GetPodServices gets the services that have the selector that match the labels on the given pod.
func GetPodServices(sl corelisters.ServiceLister, pod *v1.Pod) ([]*v1.Service, error) {
allServices, err := sl.Services(pod.Namespace).List(labels.Everything())
if err != nil {
return nil, err
}
var services []*v1.Service
for i := range allServices {
service := allServices[i]
if service.Spec.Selector == nil {
// services with nil selectors match nothing, not everything.
continue
}
selector := labels.Set(service.Spec.Selector).AsSelectorPreValidated()
if selector.Matches(labels.Set(pod.Labels)) {
services = append(services, service)
}
}
return services, nil
}


@ -0,0 +1,28 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helper
import v1 "k8s.io/api/core/v1"
// DoNotScheduleTaintsFilterFunc returns the filter function that can
// filter out the node taints that reject scheduling Pod on a Node.
func DoNotScheduleTaintsFilterFunc() func(t *v1.Taint) bool {
return func(t *v1.Taint) bool {
// PodToleratesNodeTaints is only interested in NoSchedule and NoExecute taints.
return t.Effect == v1.TaintEffectNoSchedule || t.Effect == v1.TaintEffectNoExecute
}
}


@ -0,0 +1,132 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package imagelocality
import (
"context"
"fmt"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// The two thresholds are used as bounds for the image score range. They correspond to a reasonable size range for
// container images compressed and stored in registries; the 90th percentile of images on Docker Hub falls within this range.
const (
mb int64 = 1024 * 1024
minThreshold int64 = 23 * mb
maxContainerThreshold int64 = 1000 * mb
)
// ImageLocality is a score plugin that favors nodes that already have requested pod container's images.
type ImageLocality struct {
handle framework.Handle
}
var _ framework.ScorePlugin = &ImageLocality{}
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.ImageLocality
// Name returns name of the plugin. It is used in logs, etc.
func (pl *ImageLocality) Name() string {
return Name
}
// Score invoked at the score extension point.
func (pl *ImageLocality) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
nodeInfos, err := pl.handle.SnapshotSharedLister().NodeInfos().List()
if err != nil {
return 0, framework.AsStatus(err)
}
totalNumNodes := len(nodeInfos)
imageScores := sumImageScores(nodeInfo, pod, totalNumNodes)
score := calculatePriority(imageScores, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
return score, nil
}
// ScoreExtensions of the Score plugin.
func (pl *ImageLocality) ScoreExtensions() framework.ScoreExtensions {
return nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, h framework.Handle) (framework.Plugin, error) {
return &ImageLocality{handle: h}, nil
}
// calculatePriority returns the priority of a node. Given the sumScores of requested images on the node, the node's
// priority is obtained by scaling the maximum priority value with a ratio proportional to the sumScores.
func calculatePriority(sumScores int64, numContainers int) int64 {
maxThreshold := maxContainerThreshold * int64(numContainers)
if sumScores < minThreshold {
sumScores = minThreshold
} else if sumScores > maxThreshold {
sumScores = maxThreshold
}
return framework.MaxNodeScore * (sumScores - minThreshold) / (maxThreshold - minThreshold)
}
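// Worked example (editor's note, the values are hypothetical): for a pod with 2
// containers, maxThreshold = 2 * 1000 MB = 2000 MB. If the pod's images already
// present on the node sum to 500 MB, the node scores
// MaxNodeScore * (500 - 23) / (2000 - 23) = 100 * 477 / 1977, which is about 24.
// Sums at or below 23 MB score 0, and sums at or above 2000 MB score 100.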
// sumImageScores returns the sum of image scores for the pod's containers whose images are already present on the node.
// Each image receives a raw score of its size, scaled by scaledImageScore. The raw scores are later used to calculate
// the final score.
func sumImageScores(nodeInfo *framework.NodeInfo, pod *v1.Pod, totalNumNodes int) int64 {
var sum int64
for _, container := range pod.Spec.InitContainers {
if state, ok := nodeInfo.ImageStates[normalizedImageName(container.Image)]; ok {
sum += scaledImageScore(state, totalNumNodes)
}
}
for _, container := range pod.Spec.Containers {
if state, ok := nodeInfo.ImageStates[normalizedImageName(container.Image)]; ok {
sum += scaledImageScore(state, totalNumNodes)
}
}
return sum
}
// scaledImageScore returns an adaptively scaled score for the given state of an image.
// The size of the image is used as the base score, scaled by a factor which considers how many nodes the image has "spread" to.
// This heuristic aims to mitigate the undesirable "node heating problem", i.e., pods get assigned to the same or
// a few nodes due to image locality.
func scaledImageScore(imageState *framework.ImageStateSummary, totalNumNodes int) int64 {
spread := float64(imageState.NumNodes) / float64(totalNumNodes)
return int64(float64(imageState.Size) * spread)
}
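// Worked example (editor's note, the values are hypothetical): a 300 MB image that
// is present on 5 of 100 nodes has spread 0.05 and contributes 300 MB * 0.05 = 15 MB
// to sumImageScores. Images replicated on most nodes count at nearly full size,
// while rare images are discounted, which counteracts the "node heating problem".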
// normalizedImageName returns the CRI compliant name for a given image.
// TODO: cover the corner cases of missed matches, e.g.,
// 1. Using Docker as runtime and docker.io/library/test:tag in pod spec, but only test:tag will be present in node status
// 2. Using the implicit registry, i.e., test:tag or library/test:tag in pod spec but only docker.io/library/test:tag
// in node status; note that if users consistently use one registry format, this should not happen.
func normalizedImageName(name string) string {
if strings.LastIndex(name, ":") <= strings.LastIndex(name, "/") {
name = name + ":latest"
}
return name
}
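// Examples (editor's note, the image names are hypothetical):
//
//	normalizedImageName("nginx:1.25")                    // "nginx:1.25" (tag present, unchanged)
//	normalizedImageName("docker.io/library/nginx")       // "docker.io/library/nginx:latest"
//	normalizedImageName("registry.example.com:5000/app") // "registry.example.com:5000/app:latest"
//	                                                     // (the colon belongs to the port, not a tag)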


@ -0,0 +1,386 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package interpodaffinity
import (
"context"
"fmt"
"sync/atomic"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
const (
// preFilterStateKey is the key in CycleState to InterPodAffinity pre-computed data for Filtering.
// Using the name of the plugin will likely help us avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + Name
// ErrReasonExistingAntiAffinityRulesNotMatch is used for ExistingPodsAntiAffinityRulesNotMatch predicate error.
ErrReasonExistingAntiAffinityRulesNotMatch = "node(s) didn't satisfy existing pods anti-affinity rules"
// ErrReasonAffinityRulesNotMatch is used for PodAffinityRulesNotMatch predicate error.
ErrReasonAffinityRulesNotMatch = "node(s) didn't match pod affinity rules"
// ErrReasonAntiAffinityRulesNotMatch is used for PodAntiAffinityRulesNotMatch predicate error.
ErrReasonAntiAffinityRulesNotMatch = "node(s) didn't match pod anti-affinity rules"
)
// preFilterState computed at PreFilter and used at Filter.
type preFilterState struct {
// A map of topology pairs to the number of existing pods that have anti-affinity terms that match the "pod".
existingAntiAffinityCounts topologyToMatchedTermCount
// A map of topology pairs to the number of existing pods that match the affinity terms of the "pod".
affinityCounts topologyToMatchedTermCount
// A map of topology pairs to the number of existing pods that match the anti-affinity terms of the "pod".
antiAffinityCounts topologyToMatchedTermCount
// podInfo of the incoming pod.
podInfo *framework.PodInfo
// A copy of the incoming pod's namespace labels.
namespaceLabels labels.Set
}
// Clone the prefilter state.
func (s *preFilterState) Clone() framework.StateData {
if s == nil {
return nil
}
copy := preFilterState{}
copy.affinityCounts = s.affinityCounts.clone()
copy.antiAffinityCounts = s.antiAffinityCounts.clone()
copy.existingAntiAffinityCounts = s.existingAntiAffinityCounts.clone()
// No need to deep copy the podInfo because it shouldn't change.
copy.podInfo = s.podInfo
copy.namespaceLabels = s.namespaceLabels
return &copy
}
// updateWithPod updates the preFilterState counters with the (anti)affinity matches for the given podInfo.
func (s *preFilterState) updateWithPod(pInfo *framework.PodInfo, node *v1.Node, multiplier int64) {
if s == nil {
return
}
s.existingAntiAffinityCounts.updateWithAntiAffinityTerms(pInfo.RequiredAntiAffinityTerms, s.podInfo.Pod, s.namespaceLabels, node, multiplier)
s.affinityCounts.updateWithAffinityTerms(s.podInfo.RequiredAffinityTerms, pInfo.Pod, node, multiplier)
// The incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't lookup the updated pod's namespace labels, hence passing nil for nsLabels.
s.antiAffinityCounts.updateWithAntiAffinityTerms(s.podInfo.RequiredAntiAffinityTerms, pInfo.Pod, nil, node, multiplier)
}
type topologyPair struct {
key string
value string
}
type topologyToMatchedTermCount map[topologyPair]int64
func (m topologyToMatchedTermCount) append(toAppend topologyToMatchedTermCount) {
for pair := range toAppend {
m[pair] += toAppend[pair]
}
}
func (m topologyToMatchedTermCount) clone() topologyToMatchedTermCount {
copy := make(topologyToMatchedTermCount, len(m))
copy.append(m)
return copy
}
func (m topologyToMatchedTermCount) update(node *v1.Node, tk string, value int64) {
if tv, ok := node.Labels[tk]; ok {
pair := topologyPair{key: tk, value: tv}
m[pair] += value
// value could be negative, hence we delete the entry if it is down to zero.
if m[pair] == 0 {
delete(m, pair)
}
}
}
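// Illustrative sketch (editor's note, the label values are made up): for a node
// labeled topology.kubernetes.io/zone=zone-a, update(node, "topology.kubernetes.io/zone", 1)
// increments m[{key: "topology.kubernetes.io/zone", value: "zone-a"}] to 1; a later
// update with -1 brings the count back to zero and removes the entry, so the map
// only ever holds topology pairs with a non-zero number of matching pods.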
// updates the topologyToMatchedTermCount map with the specified value
// for each affinity term if "targetPod" matches ALL terms.
func (m topologyToMatchedTermCount) updateWithAffinityTerms(
terms []framework.AffinityTerm, pod *v1.Pod, node *v1.Node, value int64) {
if podMatchesAllAffinityTerms(terms, pod) {
for _, t := range terms {
m.update(node, t.TopologyKey, value)
}
}
}
// updates the topologyToMatchedTermCount map with the specified value
// for each anti-affinity term matched the target pod.
func (m topologyToMatchedTermCount) updateWithAntiAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, value int64) {
// Check anti-affinity terms.
for _, t := range terms {
if t.Matches(pod, nsLabels) {
m.update(node, t.TopologyKey, value)
}
}
}
// returns true IFF the given pod matches all the given terms.
func podMatchesAllAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod) bool {
if len(terms) == 0 {
return false
}
for _, t := range terms {
// The incoming pod NamespaceSelector was merged into the Namespaces set, and so
// we are not explicitly passing in namespace labels.
if !t.Matches(pod, nil) {
return false
}
}
return true
}
// calculates the following for each existing pod on each node:
// 1. Whether it has PodAntiAffinity
// 2. Whether any AntiAffinityTerm matches the incoming pod
func (pl *InterPodAffinity) getExistingAntiAffinityCounts(ctx context.Context, pod *v1.Pod, nsLabels labels.Set, nodes []*framework.NodeInfo) topologyToMatchedTermCount {
topoMaps := make([]topologyToMatchedTermCount, len(nodes))
index := int32(-1)
processNode := func(i int) {
nodeInfo := nodes[i]
node := nodeInfo.Node()
topoMap := make(topologyToMatchedTermCount)
for _, existingPod := range nodeInfo.PodsWithRequiredAntiAffinity {
topoMap.updateWithAntiAffinityTerms(existingPod.RequiredAntiAffinityTerms, pod, nsLabels, node, 1)
}
if len(topoMap) != 0 {
topoMaps[atomic.AddInt32(&index, 1)] = topoMap
}
}
pl.parallelizer.Until(ctx, len(nodes), processNode, pl.Name())
result := make(topologyToMatchedTermCount)
for i := 0; i <= int(index); i++ {
result.append(topoMaps[i])
}
return result
}
// finds existing Pods that match the (anti)affinity terms of the incoming pod.
// It returns the topologyToMatchedTermCounts that are checked later by the affinity
// predicate. With these counts available, the affinity predicate does not
// need to check all the pods in the cluster.
func (pl *InterPodAffinity) getIncomingAffinityAntiAffinityCounts(ctx context.Context, podInfo *framework.PodInfo, allNodes []*framework.NodeInfo) (topologyToMatchedTermCount, topologyToMatchedTermCount) {
affinityCounts := make(topologyToMatchedTermCount)
antiAffinityCounts := make(topologyToMatchedTermCount)
if len(podInfo.RequiredAffinityTerms) == 0 && len(podInfo.RequiredAntiAffinityTerms) == 0 {
return affinityCounts, antiAffinityCounts
}
affinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
antiAffinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
index := int32(-1)
processNode := func(i int) {
nodeInfo := allNodes[i]
node := nodeInfo.Node()
affinity := make(topologyToMatchedTermCount)
antiAffinity := make(topologyToMatchedTermCount)
for _, existingPod := range nodeInfo.Pods {
affinity.updateWithAffinityTerms(podInfo.RequiredAffinityTerms, existingPod.Pod, node, 1)
// The incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
antiAffinity.updateWithAntiAffinityTerms(podInfo.RequiredAntiAffinityTerms, existingPod.Pod, nil, node, 1)
}
if len(affinity) > 0 || len(antiAffinity) > 0 {
k := atomic.AddInt32(&index, 1)
affinityCountsList[k] = affinity
antiAffinityCountsList[k] = antiAffinity
}
}
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
for i := 0; i <= int(index); i++ {
affinityCounts.append(affinityCountsList[i])
antiAffinityCounts.append(antiAffinityCountsList[i])
}
return affinityCounts, antiAffinityCounts
}
// PreFilter invoked at the prefilter extension point.
func (pl *InterPodAffinity) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
var allNodes []*framework.NodeInfo
var nodesWithRequiredAntiAffinityPods []*framework.NodeInfo
var err error
if allNodes, err = pl.sharedLister.NodeInfos().List(); err != nil {
return nil, framework.AsStatus(fmt.Errorf("failed to list NodeInfos: %w", err))
}
if nodesWithRequiredAntiAffinityPods, err = pl.sharedLister.NodeInfos().HavePodsWithRequiredAntiAffinityList(); err != nil {
return nil, framework.AsStatus(fmt.Errorf("failed to list NodeInfos with pods with affinity: %w", err))
}
s := &preFilterState{}
if s.podInfo, err = framework.NewPodInfo(pod); err != nil {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("parsing pod: %+v", err))
}
for i := range s.podInfo.RequiredAffinityTerms {
if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&s.podInfo.RequiredAffinityTerms[i]); err != nil {
return nil, framework.AsStatus(err)
}
}
for i := range s.podInfo.RequiredAntiAffinityTerms {
if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&s.podInfo.RequiredAntiAffinityTerms[i]); err != nil {
return nil, framework.AsStatus(err)
}
}
logger := klog.FromContext(ctx)
s.namespaceLabels = GetNamespaceLabelsSnapshot(logger, pod.Namespace, pl.nsLister)
s.existingAntiAffinityCounts = pl.getExistingAntiAffinityCounts(ctx, pod, s.namespaceLabels, nodesWithRequiredAntiAffinityPods)
s.affinityCounts, s.antiAffinityCounts = pl.getIncomingAffinityAntiAffinityCounts(ctx, s.podInfo, allNodes)
if len(s.existingAntiAffinityCounts) == 0 && len(s.podInfo.RequiredAffinityTerms) == 0 && len(s.podInfo.RequiredAntiAffinityTerms) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cycleState.Write(preFilterStateKey, s)
return nil, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *InterPodAffinity) PreFilterExtensions() framework.PreFilterExtensions {
return pl
}
// AddPod from pre-computed data in cycleState.
func (pl *InterPodAffinity) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
state.updateWithPod(podInfoToAdd, nodeInfo.Node(), 1)
return nil
}
// RemovePod from pre-computed data in cycleState.
func (pl *InterPodAffinity) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
state.updateWithPod(podInfoToRemove, nodeInfo.Node(), -1)
return nil
}
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to interpodaffinity.state error", c)
}
return s, nil
}
// Checks if scheduling the pod onto this node would break any anti-affinity
// terms indicated by the existing pods.
func satisfyExistingPodsAntiAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
if len(state.existingAntiAffinityCounts) > 0 {
// Iterate over topology pairs to get any of the pods being affected by
// the scheduled pod anti-affinity terms
for topologyKey, topologyValue := range nodeInfo.Node().Labels {
tp := topologyPair{key: topologyKey, value: topologyValue}
if state.existingAntiAffinityCounts[tp] > 0 {
return false
}
}
}
return true
}
// Checks if the node satisfies the incoming pod's anti-affinity rules.
func satisfyPodAntiAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
if len(state.antiAffinityCounts) > 0 {
for _, term := range state.podInfo.RequiredAntiAffinityTerms {
if topologyValue, ok := nodeInfo.Node().Labels[term.TopologyKey]; ok {
tp := topologyPair{key: term.TopologyKey, value: topologyValue}
if state.antiAffinityCounts[tp] > 0 {
return false
}
}
}
}
return true
}
// Checks if the node satisfies the incoming pod's affinity rules.
func satisfyPodAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
podsExist := true
for _, term := range state.podInfo.RequiredAffinityTerms {
if topologyValue, ok := nodeInfo.Node().Labels[term.TopologyKey]; ok {
tp := topologyPair{key: term.TopologyKey, value: topologyValue}
if state.affinityCounts[tp] <= 0 {
podsExist = false
}
} else {
// All topology labels must exist on the node.
return false
}
}
if !podsExist {
// This pod may be the first pod in a series that have affinity to themselves. In order
// to not leave such pods in pending state forever, we check that if no other pod
// in the cluster matches the namespace and selector of this pod, the pod matches
// its own terms, and the node has all the requested topologies, then we allow the pod
// to pass the affinity check.
if len(state.affinityCounts) == 0 && podMatchesAllAffinityTerms(state.podInfo.RequiredAffinityTerms, state.podInfo.Pod) {
return true
}
return false
}
return true
}
// Filter invoked at the filter extension point.
// It checks if a pod can be scheduled on the specified node with pod affinity/anti-affinity configuration.
func (pl *InterPodAffinity) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
if !satisfyPodAffinity(state, nodeInfo) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonAffinityRulesNotMatch)
}
if !satisfyPodAntiAffinity(state, nodeInfo) {
return framework.NewStatus(framework.Unschedulable, ErrReasonAntiAffinityRulesNotMatch)
}
if !satisfyExistingPodsAntiAffinity(state, nodeInfo) {
return framework.NewStatus(framework.Unschedulable, ErrReasonExistingAntiAffinityRulesNotMatch)
}
return nil
}


@ -0,0 +1,247 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package interpodaffinity
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
listersv1 "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.InterPodAffinity
var _ framework.PreFilterPlugin = &InterPodAffinity{}
var _ framework.FilterPlugin = &InterPodAffinity{}
var _ framework.PreScorePlugin = &InterPodAffinity{}
var _ framework.ScorePlugin = &InterPodAffinity{}
var _ framework.EnqueueExtensions = &InterPodAffinity{}
// InterPodAffinity is a plugin that checks inter-pod affinity.
type InterPodAffinity struct {
parallelizer parallelize.Parallelizer
args config.InterPodAffinityArgs
sharedLister framework.SharedLister
nsLister listersv1.NamespaceLister
enableSchedulingQueueHint bool
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *InterPodAffinity) Name() string {
return Name
}
// EventsToRegister returns the possible events that may make a failed Pod
// schedulable.
func (pl *InterPodAffinity) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.enableSchedulingQueueHint {
// When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeTaint event.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
return []framework.ClusterEventWithHint{
// All ActionType includes the following events:
// - Delete. An unschedulable Pod may fail due to violating an existing Pod's anti-affinity constraints,
// deleting an existing Pod may make it schedulable.
// - UpdatePodLabel. Updating on an existing Pod's labels (e.g., removal) may make
// an unschedulable Pod schedulable.
// - Add. An unschedulable Pod may fail due to violating pod-affinity constraints,
// adding an assigned Pod may make it schedulable.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Add | framework.UpdatePodLabel | framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodChange},
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
if h.SnapshotSharedLister() == nil {
return nil, fmt.Errorf("SnapshotSharedlister is nil")
}
args, err := getArgs(plArgs)
if err != nil {
return nil, err
}
if err := validation.ValidateInterPodAffinityArgs(nil, &args); err != nil {
return nil, err
}
pl := &InterPodAffinity{
parallelizer: h.Parallelizer(),
args: args,
sharedLister: h.SnapshotSharedLister(),
nsLister: h.SharedInformerFactory().Core().V1().Namespaces().Lister(),
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}
return pl, nil
}
func getArgs(obj runtime.Object) (config.InterPodAffinityArgs, error) {
ptr, ok := obj.(*config.InterPodAffinityArgs)
if !ok {
return config.InterPodAffinityArgs{}, fmt.Errorf("want args to be of type InterPodAffinityArgs, got %T", obj)
}
return *ptr, nil
}
// Updates Namespaces with the set of namespaces identified by NamespaceSelector.
// If successful, NamespaceSelector is set to labels.Nothing().
// The assumption is that the term is for an incoming pod, in which case
// namespaceSelector is either unrolled into Namespaces (and so the selector
// is set to Nothing()) or is Empty(), which means match everything. Therefore,
// when matching against this term, there is no need to look up the existing
// pod's namespace labels to match them against the term's namespaceSelector explicitly.
func (pl *InterPodAffinity) mergeAffinityTermNamespacesIfNotEmpty(at *framework.AffinityTerm) error {
if at.NamespaceSelector.Empty() {
return nil
}
ns, err := pl.nsLister.List(at.NamespaceSelector)
if err != nil {
return err
}
for _, n := range ns {
at.Namespaces.Insert(n.Name)
}
at.NamespaceSelector = labels.Nothing()
return nil
}
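// Illustrative sketch (editor's note, the namespace names and labels are hypothetical):
// given a term with an empty Namespaces set and a NamespaceSelector matching team=dev,
// and a lister that returns namespaces "dev-1" and "dev-2" for that selector, the call
// leaves the term with Namespaces={"dev-1", "dev-2"} and NamespaceSelector set to
// labels.Nothing(), so later matching only needs to consult the Namespaces set.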
// GetNamespaceLabelsSnapshot returns a snapshot of the labels associated with
// the namespace.
func GetNamespaceLabelsSnapshot(logger klog.Logger, ns string, nsLister listersv1.NamespaceLister) (nsLabels labels.Set) {
podNS, err := nsLister.Get(ns)
if err == nil {
// Create and return snapshot of the labels.
return labels.Merge(podNS.Labels, nil)
}
logger.V(3).Info("getting namespace, assuming empty set of namespace labels", "namespace", ns, "err", err)
return
}
func (pl *InterPodAffinity) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalPod, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if (modifiedPod != nil && modifiedPod.Spec.NodeName == "") || (originalPod != nil && originalPod.Spec.NodeName == "") {
logger.V(5).Info("the added/updated/deleted pod is unscheduled, so it doesn't make the target pod schedulable",
"pod", klog.KObj(pod), "originalPod", klog.KObj(originalPod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
terms, err := framework.GetAffinityTerms(pod, framework.GetPodAffinityTerms(pod.Spec.Affinity))
if err != nil {
return framework.Queue, err
}
antiTerms, err := framework.GetAffinityTerms(pod, framework.GetPodAntiAffinityTerms(pod.Spec.Affinity))
if err != nil {
return framework.Queue, err
}
// Pod is updated. Return Queue when the updated pod matches the target pod's affinity or no longer matches its anti-affinity.
// Note that, we don't need to check each affinity individually when the Pod has more than one affinity
// because the current PodAffinity looks for a **single** existing pod that can satisfy **all** the terms of inter-pod affinity of an incoming pod.
if modifiedPod != nil && originalPod != nil {
if !podMatchesAllAffinityTerms(terms, originalPod) && podMatchesAllAffinityTerms(terms, modifiedPod) {
logger.V(5).Info("a scheduled pod was updated to match the target pod's affinity, and the pod may be schedulable now",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
if podMatchesAllAffinityTerms(antiTerms, originalPod) && !podMatchesAllAffinityTerms(antiTerms, modifiedPod) {
logger.V(5).Info("a scheduled pod was updated not to match the target pod's anti affinity, and the pod may be schedulable now",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a scheduled pod was updated but it doesn't match the target pod's affinity or does match the target pod's anti-affinity",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// Pod is added. Return Queue when the added pod matches the target pod's affinity.
if modifiedPod != nil {
if podMatchesAllAffinityTerms(terms, modifiedPod) {
logger.V(5).Info("a scheduled pod was added and it matches the target pod's affinity",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a scheduled pod was added and it doesn't match the target pod's affinity",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// Pod is deleted. Return Queue when the deleted pod matches the target pod's anti-affinity.
if !podMatchesAllAffinityTerms(antiTerms, originalPod) {
logger.V(5).Info("a scheduled pod was deleted but it doesn't match the target pod's anti-affinity",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
logger.V(5).Info("a scheduled pod was deleted and it matches the target pod's anti-affinity. The pod may be schedulable now",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
terms, err := framework.GetAffinityTerms(pod, framework.GetPodAffinityTerms(pod.Spec.Affinity))
if err != nil {
return framework.Queue, err
}
for _, term := range terms {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
logger.V(5).Info("a node with matched pod affinity topologyKey was added/updated and it may make pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, err
}
}
antiTerms, err := framework.GetAffinityTerms(pod, framework.GetPodAntiAffinityTerms(pod.Spec.Affinity))
if err != nil {
return framework.Queue, err
}
for _, term := range antiTerms {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
logger.V(5).Info("a node with matched pod anti-affinity topologyKey was added/updated and it may make pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, err
}
}
logger.V(5).Info("a node is added/updated but doesn't have any topologyKey which matches pod affinity/anti-affinity",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}


@ -0,0 +1,302 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package interpodaffinity
import (
"context"
"fmt"
"math"
"sync/atomic"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// preScoreStateKey is the key in CycleState to InterPodAffinity pre-computed data for Scoring.
const preScoreStateKey = "PreScore" + Name
type scoreMap map[string]map[string]int64
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
topologyScore scoreMap
podInfo *framework.PodInfo
// A copy of the incoming pod's namespace labels.
namespaceLabels labels.Set
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
func (m scoreMap) processTerm(term *framework.AffinityTerm, weight int32, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, multiplier int32) {
if term.Matches(pod, nsLabels) {
if tpValue, tpValueExist := node.Labels[term.TopologyKey]; tpValueExist {
if m[term.TopologyKey] == nil {
m[term.TopologyKey] = make(map[string]int64)
}
m[term.TopologyKey][tpValue] += int64(weight * multiplier)
}
}
}
func (m scoreMap) processTerms(terms []framework.WeightedAffinityTerm, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, multiplier int32) {
for _, term := range terms {
m.processTerm(&term.AffinityTerm, term.Weight, pod, nsLabels, node, multiplier)
}
}
func (m scoreMap) append(other scoreMap) {
for topology, oScores := range other {
scores := m[topology]
if scores == nil {
m[topology] = oScores
continue
}
for k, v := range oScores {
scores[k] += v
}
}
}
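// exampleScoreMapAccumulation is an illustrative sketch, not part of the plugin:
// with hypothetical weights it shows how a scoreMap collects positive affinity and
// negative anti-affinity contributions per topology value (what processTerm does via
// m[topologyKey][topologyValue] += weight * multiplier) and how append merges the
// per-node maps into a single total.
func exampleScoreMapAccumulation() scoreMap {
	perNodeAffinity := scoreMap{
		// affinity term, weight 10, multiplier +1
		"topology.kubernetes.io/zone": {"zone-a": 10},
	}
	perNodeAntiAffinity := scoreMap{
		// anti-affinity term, weight 5, multiplier -1
		"topology.kubernetes.io/zone": {"zone-a": -5},
	}
	total := make(scoreMap)
	total.append(perNodeAffinity)
	total.append(perNodeAntiAffinity)
	// total["topology.kubernetes.io/zone"]["zone-a"] == 5
	return total
}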
func (pl *InterPodAffinity) processExistingPod(
state *preScoreState,
existingPod *framework.PodInfo,
existingPodNodeInfo *framework.NodeInfo,
incomingPod *v1.Pod,
topoScore scoreMap,
) {
existingPodNode := existingPodNodeInfo.Node()
if len(existingPodNode.Labels) == 0 {
return
}
// For every soft pod affinity term of <pod>, if <existingPod> matches the term,
// increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the term's weight.
// Note that the incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
topoScore.processTerms(state.podInfo.PreferredAffinityTerms, existingPod.Pod, nil, existingPodNode, 1)
// For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term,
// decrement <p.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the term's weight.
// Note that the incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
topoScore.processTerms(state.podInfo.PreferredAntiAffinityTerms, existingPod.Pod, nil, existingPodNode, -1)
// For every hard pod affinity term of <existingPod>, if <pod> matches the term,
// increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the constant <args.hardPodAffinityWeight>
if pl.args.HardPodAffinityWeight > 0 && len(existingPodNode.Labels) != 0 {
for _, t := range existingPod.RequiredAffinityTerms {
topoScore.processTerm(&t, pl.args.HardPodAffinityWeight, incomingPod, state.namespaceLabels, existingPodNode, 1)
}
}
// For every soft pod affinity term of <existingPod>, if <pod> matches the term,
// increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the term's weight.
topoScore.processTerms(existingPod.PreferredAffinityTerms, incomingPod, state.namespaceLabels, existingPodNode, 1)
// For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term,
// decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the term's weight.
topoScore.processTerms(existingPod.PreferredAntiAffinityTerms, incomingPod, state.namespaceLabels, existingPodNode, -1)
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *InterPodAffinity) PreScore(
pCtx context.Context,
cycleState *framework.CycleState,
pod *v1.Pod,
nodes []*framework.NodeInfo,
) *framework.Status {
if len(nodes) == 0 {
// No nodes to score.
return framework.NewStatus(framework.Skip)
}
if pl.sharedLister == nil {
return framework.NewStatus(framework.Error, "empty shared lister in InterPodAffinity PreScore")
}
affinity := pod.Spec.Affinity
hasPreferredAffinityConstraints := affinity != nil && affinity.PodAffinity != nil && len(affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution) > 0
hasPreferredAntiAffinityConstraints := affinity != nil && affinity.PodAntiAffinity != nil && len(affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution) > 0
hasConstraints := hasPreferredAffinityConstraints || hasPreferredAntiAffinityConstraints
// Optionally ignore calculating preferences of existing pods' affinity rules
// if the incoming pod has no inter-pod affinities.
if pl.args.IgnorePreferredTermsOfExistingPods && !hasConstraints {
return framework.NewStatus(framework.Skip)
}
// Unless the pod being scheduled has preferred affinity terms, we only
// need to process nodes hosting pods with affinity.
var allNodes []*framework.NodeInfo
var err error
if hasConstraints {
allNodes, err = pl.sharedLister.NodeInfos().List()
if err != nil {
return framework.AsStatus(fmt.Errorf("failed to get all nodes from shared lister: %w", err))
}
} else {
allNodes, err = pl.sharedLister.NodeInfos().HavePodsWithAffinityList()
if err != nil {
return framework.AsStatus(fmt.Errorf("failed to get pods with affinity list: %w", err))
}
}
state := &preScoreState{
topologyScore: make(map[string]map[string]int64),
}
if state.podInfo, err = framework.NewPodInfo(pod); err != nil {
// Ideally we never reach here, because errors will be caught by PreFilter
return framework.AsStatus(fmt.Errorf("failed to parse pod: %w", err))
}
for i := range state.podInfo.PreferredAffinityTerms {
if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&state.podInfo.PreferredAffinityTerms[i].AffinityTerm); err != nil {
return framework.AsStatus(fmt.Errorf("updating PreferredAffinityTerms: %w", err))
}
}
for i := range state.podInfo.PreferredAntiAffinityTerms {
if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&state.podInfo.PreferredAntiAffinityTerms[i].AffinityTerm); err != nil {
return framework.AsStatus(fmt.Errorf("updating PreferredAntiAffinityTerms: %w", err))
}
}
logger := klog.FromContext(pCtx)
state.namespaceLabels = GetNamespaceLabelsSnapshot(logger, pod.Namespace, pl.nsLister)
topoScores := make([]scoreMap, len(allNodes))
index := int32(-1)
processNode := func(i int) {
nodeInfo := allNodes[i]
// Unless the pod being scheduled has preferred affinity terms, we only
// need to process pods with affinity in the node.
podsToProcess := nodeInfo.PodsWithAffinity
if hasConstraints {
// We need to process all the pods.
podsToProcess = nodeInfo.Pods
}
topoScore := make(scoreMap)
for _, existingPod := range podsToProcess {
pl.processExistingPod(state, existingPod, nodeInfo, pod, topoScore)
}
if len(topoScore) > 0 {
topoScores[atomic.AddInt32(&index, 1)] = topoScore
}
}
pl.parallelizer.Until(pCtx, len(allNodes), processNode, pl.Name())
if index == -1 {
return framework.NewStatus(framework.Skip)
}
for i := 0; i <= int(index); i++ {
state.topologyScore.append(topoScores[i])
}
cycleState.Write(preScoreStateKey, state)
return nil
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("failed to read %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("%+v convert to interpodaffinity.preScoreState error", c)
}
return s, nil
}
// Score invoked at the Score extension point.
// The "score" returned in this function is the sum of weights got from cycleState which have its topologyKey matching with the node's labels.
// it is normalized later.
// Note: the returned "score" is positive for pod-affinity, and negative for pod-antiaffinity.
func (pl *InterPodAffinity) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("failed to get node %q from Snapshot: %w", nodeName, err))
}
node := nodeInfo.Node()
s, err := getPreScoreState(cycleState)
if err != nil {
return 0, framework.AsStatus(err)
}
var score int64
for tpKey, tpValues := range s.topologyScore {
if v, exist := node.Labels[tpKey]; exist {
score += tpValues[v]
}
}
return score, nil
}
// NormalizeScore normalizes the score for each filteredNode.
func (pl *InterPodAffinity) NormalizeScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
s, err := getPreScoreState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
if len(s.topologyScore) == 0 {
return nil
}
var minCount int64 = math.MaxInt64
var maxCount int64 = math.MinInt64
for i := range scores {
score := scores[i].Score
if score > maxCount {
maxCount = score
}
if score < minCount {
minCount = score
}
}
maxMinDiff := maxCount - minCount
for i := range scores {
fScore := float64(0)
if maxMinDiff > 0 {
fScore = float64(framework.MaxNodeScore) * (float64(scores[i].Score-minCount) / float64(maxMinDiff))
}
scores[i].Score = int64(fScore)
}
return nil
}
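// normalizedScoreExample is an illustrative sketch, not part of the plugin: it applies
// the same min-max formula used by NormalizeScore above to a hypothetical list of raw
// per-node sums (which can be negative because of anti-affinity), mapping them onto
// [0, MaxNodeScore].
func normalizedScoreExample() []int64 {
	raw := []int64{-20, 0, 30} // hypothetical sums produced by Score
	minCount, maxCount := raw[0], raw[0]
	for _, s := range raw {
		if s < minCount {
			minCount = s
		}
		if s > maxCount {
			maxCount = s
		}
	}
	out := make([]int64, len(raw))
	for i, s := range raw {
		if maxCount > minCount {
			out[i] = int64(float64(framework.MaxNodeScore) * float64(s-minCount) / float64(maxCount-minCount))
		}
	}
	return out // with MaxNodeScore == 100: -20 -> 0, 0 -> 40, 30 -> 100
}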
// ScoreExtensions of the Score plugin.
func (pl *InterPodAffinity) ScoreExtensions() framework.ScoreExtensions {
return pl
}

View File

@ -0,0 +1,39 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package names
const (
PrioritySort = "PrioritySort"
DefaultBinder = "DefaultBinder"
DefaultPreemption = "DefaultPreemption"
DynamicResources = "DynamicResources"
ImageLocality = "ImageLocality"
InterPodAffinity = "InterPodAffinity"
NodeAffinity = "NodeAffinity"
NodeName = "NodeName"
NodePorts = "NodePorts"
NodeResourcesBalancedAllocation = "NodeResourcesBalancedAllocation"
NodeResourcesFit = "NodeResourcesFit"
NodeUnschedulable = "NodeUnschedulable"
NodeVolumeLimits = "NodeVolumeLimits"
PodTopologySpread = "PodTopologySpread"
SchedulingGates = "SchedulingGates"
TaintToleration = "TaintToleration"
VolumeBinding = "VolumeBinding"
VolumeRestrictions = "VolumeRestrictions"
VolumeZone = "VolumeZone"
)

View File

@ -0,0 +1,372 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeaffinity
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// NodeAffinity is a plugin that checks if a pod node selector matches the node label.
type NodeAffinity struct {
handle framework.Handle
addedNodeSelector *nodeaffinity.NodeSelector
addedPrefSchedTerms *nodeaffinity.PreferredSchedulingTerms
enableSchedulingQueueHint bool
}
var _ framework.PreFilterPlugin = &NodeAffinity{}
var _ framework.FilterPlugin = &NodeAffinity{}
var _ framework.PreScorePlugin = &NodeAffinity{}
var _ framework.ScorePlugin = &NodeAffinity{}
var _ framework.EnqueueExtensions = &NodeAffinity{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.NodeAffinity
// preScoreStateKey is the key in CycleState to NodeAffinity pre-computed data for Scoring.
preScoreStateKey = "PreScore" + Name
// preFilterStateKey is the key in CycleState to NodeAffinity pre-compute data for Filtering.
preFilterStateKey = "PreFilter" + Name
// ErrReasonPod is the reason for Pod's node affinity/selector not matching.
ErrReasonPod = "node(s) didn't match Pod's node affinity/selector"
// errReasonEnforced is the reason for added node affinity not matching.
errReasonEnforced = "node(s) didn't match scheduler-enforced node affinity"
// errReasonConflict is the reason for pod's conflicting affinity rules.
errReasonConflict = "pod affinity terms conflict"
)
// Name returns name of the plugin. It is used in logs, etc.
func (pl *NodeAffinity) Name() string {
return Name
}
type preFilterState struct {
requiredNodeSelectorAndAffinity nodeaffinity.RequiredNodeAffinity
}
// Clone just returns the same state because it is not affected by pod additions or deletions.
func (s *preFilterState) Clone() framework.StateData {
return s
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodeAffinity) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence we can use UpdateNodeLabel instead of Update.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
return []framework.ClusterEventWithHint{
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
// isSchedulableAfterNodeChange is invoked whenever a node is added or updated. It checks whether
// that change made a previously unschedulable pod schedulable.
func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if pl.addedNodeSelector != nil && !pl.addedNodeSelector.Match(modifiedNode) {
logger.V(4).Info("added or modified node didn't match scheduler-enforced node affinity and this event won't make the Pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
isMatched, err := requiredNodeAffinity.Match(modifiedNode)
if err != nil {
return framework.Queue, err
}
if !isMatched {
logger.V(5).Info("node was created or updated, but the pod's NodeAffinity doesn't match", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// Since the node was added and it matches the pod's affinity criteria, we can unblock it.
if originalNode == nil {
logger.V(5).Info("node was created, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// At this point we know the operation is an update, so we can narrow the criteria to unmatch -> match transitions only
// (necessary affinity label was added to the node in this case).
wasMatched, err := requiredNodeAffinity.Match(originalNode)
if err != nil {
return framework.Queue, err
}
if wasMatched {
logger.V(5).Info("node updated, but the pod's NodeAffinity hasn't changed", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("node was updated and the pod's NodeAffinity changed to matched", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
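// requiredAffinityMatchExample is an illustrative sketch, not part of the plugin: it
// shows the unmatch -> match transition that the hint above queues on, using a
// hypothetical nodeSelector-only pod and a node that gains the required label.
func requiredAffinityMatchExample() (before, after bool) {
	pod := &v1.Pod{Spec: v1.PodSpec{NodeSelector: map[string]string{"disktype": "ssd"}}}
	required := nodeaffinity.GetRequiredNodeAffinity(pod)
	oldNode, newNode := &v1.Node{}, &v1.Node{}
	newNode.Labels = map[string]string{"disktype": "ssd"}
	before, _ = required.Match(oldNode) // false: the required label is missing
	after, _ = required.Match(newNode)  // true: the label was added, so the Pod can be queued
	return before, after
}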
// PreFilter builds and writes cycle state used by Filter.
func (pl *NodeAffinity) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
affinity := pod.Spec.Affinity
noNodeAffinity := (affinity == nil ||
affinity.NodeAffinity == nil ||
affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil)
if noNodeAffinity && pl.addedNodeSelector == nil && pod.Spec.NodeSelector == nil {
// NodeAffinity Filter has nothing to do with the Pod.
return nil, framework.NewStatus(framework.Skip)
}
state := &preFilterState{requiredNodeSelectorAndAffinity: nodeaffinity.GetRequiredNodeAffinity(pod)}
cycleState.Write(preFilterStateKey, state)
if noNodeAffinity || len(affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms) == 0 {
return nil, nil
}
// Check if there is affinity to a specific node and return it.
terms := affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
var nodeNames sets.Set[string]
for _, t := range terms {
var termNodeNames sets.Set[string]
for _, r := range t.MatchFields {
if r.Key == metav1.ObjectNameField && r.Operator == v1.NodeSelectorOpIn {
// The requirements represent ANDed constraints, and so we need to
// find the intersection of nodes.
s := sets.New(r.Values...)
if termNodeNames == nil {
termNodeNames = s
} else {
termNodeNames = termNodeNames.Intersection(s)
}
}
}
if termNodeNames == nil {
// If this term has no node.Name field affinity,
// then all nodes are eligible because the terms are ORed.
return nil, nil
}
nodeNames = nodeNames.Union(termNodeNames)
}
// If nodeNames is not nil but has length 0, it means every term has conflicting affinity to node.Name;
// therefore, the pod will not match any node.
if nodeNames != nil && len(nodeNames) == 0 {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, errReasonConflict)
} else if len(nodeNames) > 0 {
return &framework.PreFilterResult{NodeNames: nodeNames}, nil
}
return nil, nil
}
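// preFilterNodeNamesExample is an illustrative sketch, not part of the plugin: it
// mirrors how PreFilter above narrows candidate nodes by metadata.name, intersecting
// the values of the requirements inside one term (ANDed) and unioning across terms
// (ORed). The node names are hypothetical.
func preFilterNodeNamesExample() sets.Set[string] {
	// One term with two matchFields requirements on metadata.name:
	termNodeNames := sets.New("node-a", "node-b").Intersection(sets.New("node-b", "node-c"))
	// A second term contributing another candidate:
	return termNodeNames.Union(sets.New("node-d")) // {"node-b", "node-d"}
}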
// PreFilterExtensions not necessary for this plugin as state doesn't depend on pod additions or deletions.
func (pl *NodeAffinity) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
// Filter checks if the Node matches the Pod .spec.affinity.nodeAffinity and
// the plugin's added affinity.
func (pl *NodeAffinity) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
node := nodeInfo.Node()
if pl.addedNodeSelector != nil && !pl.addedNodeSelector.Match(node) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, errReasonEnforced)
}
s, err := getPreFilterState(state)
if err != nil {
// Fallback to calculate requiredNodeSelector and requiredNodeAffinity
// here when PreFilter is disabled.
s = &preFilterState{requiredNodeSelectorAndAffinity: nodeaffinity.GetRequiredNodeAffinity(pod)}
}
// Ignore parsing errors for backwards compatibility.
match, _ := s.requiredNodeSelectorAndAffinity.Match(node)
if !match {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonPod)
}
return nil
}
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
preferredNodeAffinity *nodeaffinity.PreferredSchedulingTerms
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *NodeAffinity) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
if len(nodes) == 0 {
return nil
}
preferredNodeAffinity, err := getPodPreferredNodeAffinity(pod)
if err != nil {
return framework.AsStatus(err)
}
if preferredNodeAffinity == nil && pl.addedPrefSchedTerms == nil {
// NodeAffinity Score has nothing to do with the Pod.
return framework.NewStatus(framework.Skip)
}
state := &preScoreState{
preferredNodeAffinity: preferredNodeAffinity,
}
cycleState.Write(preScoreStateKey, state)
return nil
}
// Score returns the sum of the weights of the terms that match the Node.
// Terms came from the Pod .spec.affinity.nodeAffinity and from the plugin's
// default affinity.
func (pl *NodeAffinity) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
node := nodeInfo.Node()
var count int64
if pl.addedPrefSchedTerms != nil {
count += pl.addedPrefSchedTerms.Score(node)
}
s, err := getPreScoreState(state)
if err != nil {
// Fallback to calculate preferredNodeAffinity here when PreScore is disabled.
preferredNodeAffinity, err := getPodPreferredNodeAffinity(pod)
if err != nil {
return 0, framework.AsStatus(err)
}
s = &preScoreState{
preferredNodeAffinity: preferredNodeAffinity,
}
}
if s.preferredNodeAffinity != nil {
count += s.preferredNodeAffinity.Score(node)
}
return count, nil
}
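// preferredAffinityScoreExample is an illustrative sketch, not part of the plugin: it
// builds a single hypothetical preferred scheduling term and scores a node carrying
// the matching label, yielding the term's weight, i.e. what Score above sums per node.
func preferredAffinityScoreExample() (int64, error) {
	terms, err := nodeaffinity.NewPreferredSchedulingTerms([]v1.PreferredSchedulingTerm{{
		Weight: 10,
		Preference: v1.NodeSelectorTerm{MatchExpressions: []v1.NodeSelectorRequirement{{
			Key: "disktype", Operator: v1.NodeSelectorOpIn, Values: []string{"ssd"},
		}}},
	}})
	if err != nil {
		return 0, err
	}
	node := &v1.Node{}
	node.Labels = map[string]string{"disktype": "ssd"}
	return terms.Score(node), nil // 10
}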
// NormalizeScore invoked after scoring all nodes.
func (pl *NodeAffinity) NormalizeScore(ctx context.Context, state *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
return helper.DefaultNormalizeScore(framework.MaxNodeScore, false, scores)
}
// ScoreExtensions of the Score plugin.
func (pl *NodeAffinity) ScoreExtensions() framework.ScoreExtensions {
return pl
}
// New initializes a new plugin and returns it.
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, err := getArgs(plArgs)
if err != nil {
return nil, err
}
pl := &NodeAffinity{
handle: h,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}
if args.AddedAffinity != nil {
if ns := args.AddedAffinity.RequiredDuringSchedulingIgnoredDuringExecution; ns != nil {
pl.addedNodeSelector, err = nodeaffinity.NewNodeSelector(ns)
if err != nil {
return nil, fmt.Errorf("parsing addedAffinity.requiredDuringSchedulingIgnoredDuringExecution: %w", err)
}
}
// TODO: parse requiredDuringSchedulingRequiredDuringExecution when it gets added to the API.
if terms := args.AddedAffinity.PreferredDuringSchedulingIgnoredDuringExecution; len(terms) != 0 {
pl.addedPrefSchedTerms, err = nodeaffinity.NewPreferredSchedulingTerms(terms)
if err != nil {
return nil, fmt.Errorf("parsing addedAffinity.preferredDuringSchedulingIgnoredDuringExecution: %w", err)
}
}
}
return pl, nil
}
func getArgs(obj runtime.Object) (config.NodeAffinityArgs, error) {
ptr, ok := obj.(*config.NodeAffinityArgs)
if !ok {
return config.NodeAffinityArgs{}, fmt.Errorf("args are not of type NodeAffinityArgs, got %T", obj)
}
return *ptr, validation.ValidateNodeAffinityArgs(nil, ptr)
}
func getPodPreferredNodeAffinity(pod *v1.Pod) (*nodeaffinity.PreferredSchedulingTerms, error) {
affinity := pod.Spec.Affinity
if affinity != nil && affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil {
return nodeaffinity.NewPreferredSchedulingTerms(affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution)
}
return nil, nil
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("reading %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
}
return s, nil
}
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
return nil, fmt.Errorf("reading %q from cycleState: %v", preFilterStateKey, err)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("invalid PreFilter state, got type %T", c)
}
return s, nil
}

View File

@ -0,0 +1,89 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodename
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// NodeName is a plugin that checks if a pod spec node name matches the current node.
type NodeName struct {
enableSchedulingQueueHint bool
}
var _ framework.FilterPlugin = &NodeName{}
var _ framework.EnqueueExtensions = &NodeName{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.NodeName
// ErrReason returned when node name doesn't match.
ErrReason = "node(s) didn't match the requested node name"
)
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodeName) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint/UpdateNodeLabel event:
// Ideally, it's supposed to register only Add because no Node update event can change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
nodeActionType = framework.Add
}
return []framework.ClusterEventWithHint{
// We don't need the QueueingHintFn here because the scheduling of Pods will be always retried with backoff when this Event happens.
// (the same as Queue)
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
}, nil
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *NodeName) Name() string {
return Name
}
// Filter invoked at the filter extension point.
func (pl *NodeName) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
if !Fits(pod, nodeInfo) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReason)
}
return nil
}
// Fits actually checks if the pod fits the node.
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
return len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == nodeInfo.Node().Name
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &NodeName{
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}

View File

@ -0,0 +1,215 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeports
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// NodePorts is a plugin that checks if a node has free ports for the requested pod ports.
type NodePorts struct {
enableSchedulingQueueHint bool
}
var _ framework.PreFilterPlugin = &NodePorts{}
var _ framework.FilterPlugin = &NodePorts{}
var _ framework.EnqueueExtensions = &NodePorts{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.NodePorts
// preFilterStateKey is the key in CycleState to NodePorts pre-computed data.
// Using the name of the plugin will likely help us avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + Name
// ErrReason when node ports aren't available.
ErrReason = "node(s) didn't have free ports for the requested pod ports"
)
type preFilterState []*v1.ContainerPort
// Clone the prefilter state.
func (s preFilterState) Clone() framework.StateData {
// The state is not impacted by adding/removing existing pods, hence we don't need to make a deep copy.
return s
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *NodePorts) Name() string {
return Name
}
// getContainerPorts returns the host ports used by the given Pods; only container
// ports with a HostPort set are included, and port conflicts are not resolved here.
func getContainerPorts(pods ...*v1.Pod) []*v1.ContainerPort {
ports := []*v1.ContainerPort{}
for _, pod := range pods {
for j := range pod.Spec.Containers {
container := &pod.Spec.Containers[j]
for k := range container.Ports {
// Only return ports with a host port specified.
if container.Ports[k].HostPort <= 0 {
continue
}
ports = append(ports, &container.Ports[k])
}
}
}
return ports
}
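// containerPortsExample is an illustrative sketch, not part of the plugin: only
// container ports with an explicit HostPort survive getContainerPorts above. The
// port values are hypothetical.
func containerPortsExample() []*v1.ContainerPort {
	pod := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
		Ports: []v1.ContainerPort{
			{ContainerPort: 8080},                 // no HostPort: ignored
			{ContainerPort: 8080, HostPort: 8080}, // kept
		},
	}}}}
	return getContainerPorts(pod) // one entry, HostPort 8080
}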
// PreFilter invoked at the prefilter extension point.
func (pl *NodePorts) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
s := getContainerPorts(pod)
// Skip if a pod has no ports.
if len(s) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cycleState.Write(preFilterStateKey, preFilterState(s))
return nil, nil
}
// PreFilterExtensions do not exist for this plugin.
func (pl *NodePorts) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
func getPreFilterState(cycleState *framework.CycleState) (preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("reading %q from cycleState: %w", preFilterStateKey, err)
}
s, ok := c.(preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to nodeports.preFilterState error", c)
}
return s, nil
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodePorts) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint/UpdateNodeLabel event:
// Ideally, it's supposed to register only Add because a Node update never frees up host ports for the Pod.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
nodeActionType = framework.Add
}
return []framework.ClusterEventWithHint{
// Due to immutable fields `spec.containers[*].ports`, pod update events are ignored.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
// We don't need the QueueingHintFn here because the scheduling of Pods will be always retried with backoff when this Event happens.
// (the same as Queue)
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
}, nil
}
// isSchedulableAfterPodDeleted is invoked whenever a pod is deleted. It checks whether
// that change made a previously unschedulable pod schedulable.
func (pl *NodePorts) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
deletedPod, _, err := util.As[*v1.Pod](oldObj, nil)
if err != nil {
return framework.Queue, err
}
// If the deleted pod is unscheduled, it doesn't make the target pod schedulable.
if deletedPod.Spec.NodeName == "" {
logger.V(4).Info("the deleted pod is unscheduled and it doesn't make the target pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}
// Get the used host ports of the deleted pod.
usedPorts := make(framework.HostPortInfo)
for _, container := range deletedPod.Spec.Containers {
for _, podPort := range container.Ports {
if podPort.HostPort > 0 {
usedPorts.Add(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
}
}
}
// If the deleted pod doesn't use any host ports, it doesn't make the target pod schedulable.
if len(usedPorts) == 0 {
return framework.QueueSkip, nil
}
// Construct a fake NodeInfo that only has the deleted Pod.
// If we can schedule `pod` to this fake node, it means that `pod` and the deleted pod don't have any common port(s).
// So, deleting that pod couldn't make `pod` schedulable.
nodeInfo := framework.NodeInfo{UsedPorts: usedPorts}
if Fits(pod, &nodeInfo) {
logger.V(4).Info("the deleted pod and the target pod don't have any common port(s), returning QueueSkip as deleting this Pod won't make the Pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}
logger.V(4).Info("the deleted pod and the target pod have any common port(s), returning Queue as deleting this Pod may make the Pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.Queue, nil
}
// Filter invoked at the filter extension point.
func (pl *NodePorts) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
wantPorts, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
fits := fitsPorts(wantPorts, nodeInfo)
if !fits {
return framework.NewStatus(framework.Unschedulable, ErrReason)
}
return nil
}
// Fits checks if the pod fits the node.
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
return fitsPorts(getContainerPorts(pod), nodeInfo)
}
func fitsPorts(wantPorts []*v1.ContainerPort, nodeInfo *framework.NodeInfo) bool {
// try to see whether existingPorts and wantPorts will conflict or not
existingPorts := nodeInfo.UsedPorts
for _, cp := range wantPorts {
if existingPorts.CheckConflict(cp.HostIP, string(cp.Protocol), cp.HostPort) {
return false
}
}
return true
}
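// hostPortConflictExample is an illustrative sketch, not part of the plugin: it shows
// the HostPortInfo bookkeeping that Fits/fitsPorts rely on, with hypothetical port
// values. Add records a used host port; CheckConflict reports whether a requested
// port would collide with it.
func hostPortConflictExample() bool {
	used := make(framework.HostPortInfo)
	used.Add("0.0.0.0", "TCP", 8080)
	// A pod requesting TCP 8080 on any IP conflicts; TCP 9090 would not.
	return used.CheckConflict("0.0.0.0", "TCP", 8080) // true
}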
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &NodePorts{
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}

View File

@ -0,0 +1,173 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"context"
"fmt"
"math"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// BalancedAllocation is a score plugin that calculates the difference between the cpu and memory fraction
// of capacity, and prioritizes the host based on how close the two metrics are to each other.
type BalancedAllocation struct {
handle framework.Handle
resourceAllocationScorer
}
var _ framework.PreScorePlugin = &BalancedAllocation{}
var _ framework.ScorePlugin = &BalancedAllocation{}
// BalancedAllocationName is the name of the plugin used in the plugin registry and configurations.
const (
BalancedAllocationName = names.NodeResourcesBalancedAllocation
// balancedAllocationPreScoreStateKey is the key in CycleState to NodeResourcesBalancedAllocation pre-computed data for Scoring.
balancedAllocationPreScoreStateKey = "PreScore" + BalancedAllocationName
)
// balancedAllocationPreScoreState computed at PreScore and used at Score.
type balancedAllocationPreScoreState struct {
// podRequests have the same order as the resources defined in NodeResourcesBalancedAllocationArgs.Resources,
// same for other places where we store a list like that.
podRequests []int64
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *balancedAllocationPreScoreState) Clone() framework.StateData {
return s
}
// PreScore calculates the incoming pod's resource requests and writes them to the cycle state for use by Score.
func (ba *BalancedAllocation) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
state := &balancedAllocationPreScoreState{
podRequests: ba.calculatePodResourceRequestList(pod, ba.resources),
}
cycleState.Write(balancedAllocationPreScoreStateKey, state)
return nil
}
func getBalancedAllocationPreScoreState(cycleState *framework.CycleState) (*balancedAllocationPreScoreState, error) {
c, err := cycleState.Read(balancedAllocationPreScoreStateKey)
if err != nil {
return nil, fmt.Errorf("reading %q from cycleState: %w", balancedAllocationPreScoreStateKey, err)
}
s, ok := c.(*balancedAllocationPreScoreState)
if !ok {
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
}
return s, nil
}
// Name returns name of the plugin. It is used in logs, etc.
func (ba *BalancedAllocation) Name() string {
return BalancedAllocationName
}
// Score invoked at the score extension point.
func (ba *BalancedAllocation) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := ba.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
s, err := getBalancedAllocationPreScoreState(state)
if err != nil {
s = &balancedAllocationPreScoreState{podRequests: ba.calculatePodResourceRequestList(pod, ba.resources)}
}
// ba.score favors nodes with balanced resource usage rate.
// It calculates the standard deviation for those resources and prioritizes the node based on how close the usage of those resources is to each other.
// Detail: score = (1 - std) * MaxNodeScore, where std is the square root of Σ((fraction(i)-mean)^2)/len(resources)
// The algorithm is partly inspired by:
// "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization"
return ba.score(ctx, pod, nodeInfo, s.podRequests)
}
// ScoreExtensions of the Score plugin.
func (ba *BalancedAllocation) ScoreExtensions() framework.ScoreExtensions {
return nil
}
// NewBalancedAllocation initializes a new plugin and returns it.
func NewBalancedAllocation(_ context.Context, baArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, ok := baArgs.(*config.NodeResourcesBalancedAllocationArgs)
if !ok {
return nil, fmt.Errorf("want args to be of type NodeResourcesBalancedAllocationArgs, got %T", baArgs)
}
if err := validation.ValidateNodeResourcesBalancedAllocationArgs(nil, args); err != nil {
return nil, err
}
return &BalancedAllocation{
handle: h,
resourceAllocationScorer: resourceAllocationScorer{
Name: BalancedAllocationName,
scorer: balancedResourceScorer,
useRequested: true,
resources: args.Resources,
},
}, nil
}
func balancedResourceScorer(requested, allocable []int64) int64 {
var resourceToFractions []float64
var totalFraction float64
for i := range requested {
if allocable[i] == 0 {
continue
}
fraction := float64(requested[i]) / float64(allocable[i])
if fraction > 1 {
fraction = 1
}
totalFraction += fraction
resourceToFractions = append(resourceToFractions, fraction)
}
std := 0.0
// In most cases resources are limited to cpu and memory, so std can be simplified to std := |fraction1-fraction2|/2
// len(fractions) > 2: calculate std as the square root of Σ((fraction(i)-mean)^2)/len(fractions)
// Otherwise, setting std to zero is enough.
if len(resourceToFractions) == 2 {
std = math.Abs((resourceToFractions[0] - resourceToFractions[1]) / 2)
} else if len(resourceToFractions) > 2 {
mean := totalFraction / float64(len(resourceToFractions))
var sum float64
for _, fraction := range resourceToFractions {
sum = sum + (fraction-mean)*(fraction-mean)
}
std = math.Sqrt(sum / float64(len(resourceToFractions)))
}
// STD (standard deviation) is always non-negative. 1-std makes the score higher for the node with the least deviation, and
// multiplying it with `MaxNodeScore` provides the scaling factor needed.
return int64((1 - std) * float64(framework.MaxNodeScore))
}
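// balancedScoreExample is an illustrative sketch, not part of the plugin: it feeds
// hypothetical requested/allocatable vectors (cpu, memory) into balancedResourceScorer
// above. The fractions are 0.25 and 0.75, so the two-resource shortcut gives
// std = |0.25-0.75|/2 = 0.25 and a score of (1-0.25)*MaxNodeScore = 75.
func balancedScoreExample() int64 {
	requested := []int64{250, 750}     // hypothetical millicpu and memory requests
	allocatable := []int64{1000, 1000} // hypothetical node allocatable
	return balancedResourceScorer(requested, allocatable) // 75
}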

View File

@ -0,0 +1,596 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"context"
"fmt"
"strings"
"github.com/google/go-cmp/cmp"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/resource"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
var _ framework.PreFilterPlugin = &Fit{}
var _ framework.FilterPlugin = &Fit{}
var _ framework.EnqueueExtensions = &Fit{}
var _ framework.PreScorePlugin = &Fit{}
var _ framework.ScorePlugin = &Fit{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.NodeResourcesFit
// preFilterStateKey is the key in CycleState to NodeResourcesFit pre-computed data.
// Using the name of the plugin will likely help us avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + Name
// preScoreStateKey is the key in CycleState to NodeResourcesFit pre-computed data for Scoring.
preScoreStateKey = "PreScore" + Name
)
// nodeResourceStrategyTypeMap maps strategy to scorer implementation
var nodeResourceStrategyTypeMap = map[config.ScoringStrategyType]scorer{
config.LeastAllocated: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
resources := args.ScoringStrategy.Resources
return &resourceAllocationScorer{
Name: string(config.LeastAllocated),
scorer: leastResourceScorer(resources),
resources: resources,
}
},
config.MostAllocated: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
resources := args.ScoringStrategy.Resources
return &resourceAllocationScorer{
Name: string(config.MostAllocated),
scorer: mostResourceScorer(resources),
resources: resources,
}
},
config.RequestedToCapacityRatio: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
resources := args.ScoringStrategy.Resources
return &resourceAllocationScorer{
Name: string(config.RequestedToCapacityRatio),
scorer: requestedToCapacityRatioScorer(resources, args.ScoringStrategy.RequestedToCapacityRatio.Shape),
resources: resources,
}
},
}
// Fit is a plugin that checks if a node has sufficient resources.
type Fit struct {
ignoredResources sets.Set[string]
ignoredResourceGroups sets.Set[string]
enableInPlacePodVerticalScaling bool
enableSidecarContainers bool
enableSchedulingQueueHint bool
enablePodLevelResources bool
handle framework.Handle
resourceAllocationScorer
}
// ScoreExtensions of the Score plugin.
func (f *Fit) ScoreExtensions() framework.ScoreExtensions {
return nil
}
// preFilterState computed at PreFilter and used at Filter.
type preFilterState struct {
framework.Resource
}
// Clone the prefilter state.
func (s *preFilterState) Clone() framework.StateData {
return s
}
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
// podRequests have the same order as the resources defined in NodeResourcesFitArgs.ScoringStrategy.Resources,
// same for other places where we store a list like that.
podRequests []int64
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
// PreScore calculates the incoming pod's resource requests and writes them to the cycle state for use by Score.
func (f *Fit) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
state := &preScoreState{
podRequests: f.calculatePodResourceRequestList(pod, f.resources),
}
cycleState.Write(preScoreStateKey, state)
return nil
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("reading %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
}
return s, nil
}
// Name returns name of the plugin. It is used in logs, etc.
func (f *Fit) Name() string {
return Name
}
// NewFit initializes a new plugin and returns it.
func NewFit(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, ok := plArgs.(*config.NodeResourcesFitArgs)
if !ok {
return nil, fmt.Errorf("want args to be of type NodeResourcesFitArgs, got %T", plArgs)
}
if err := validation.ValidateNodeResourcesFitArgs(nil, args); err != nil {
return nil, err
}
if args.ScoringStrategy == nil {
return nil, fmt.Errorf("scoring strategy not specified")
}
strategy := args.ScoringStrategy.Type
scorePlugin, exists := nodeResourceStrategyTypeMap[strategy]
if !exists {
return nil, fmt.Errorf("scoring strategy %s is not supported", strategy)
}
return &Fit{
ignoredResources: sets.New(args.IgnoredResources...),
ignoredResourceGroups: sets.New(args.IgnoredResourceGroups...),
enableInPlacePodVerticalScaling: fts.EnableInPlacePodVerticalScaling,
enableSidecarContainers: fts.EnableSidecarContainers,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
handle: h,
enablePodLevelResources: fts.EnablePodLevelResources,
resourceAllocationScorer: *scorePlugin(args),
}, nil
}
type ResourceRequestsOptions struct {
EnablePodLevelResources bool
}
// computePodResourceRequest returns a framework.Resource that covers the largest
// width in each resource dimension. Because init-containers run sequentially, we collect
// the max in each dimension iteratively. In contrast, we sum the resource vectors for
// regular containers since they run simultaneously.
//
// # The resources defined for Overhead should be added to the calculated Resource request sum
//
// Example:
//
// Pod:
//
// InitContainers
// IC1:
// CPU: 2
// Memory: 1G
// IC2:
// CPU: 2
// Memory: 3G
// Containers
// C1:
// CPU: 2
// Memory: 1G
// C2:
// CPU: 1
// Memory: 1G
//
// Result: CPU: 3, Memory: 3G
// TODO(ndixita): modify computePodResourceRequest to accept opts of type
// ResourceRequestOptions as the second parameter.
func computePodResourceRequest(pod *v1.Pod, opts ResourceRequestsOptions) *preFilterState {
// The pod hasn't been scheduled yet, so we don't need to worry about InPlacePodVerticalScalingEnabled.
reqs := resource.PodRequests(pod, resource.PodResourcesOptions{
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !opts.EnablePodLevelResources,
})
result := &preFilterState{}
result.SetMaxResource(reqs)
return result
}
// PreFilter invoked at the prefilter extension point.
func (f *Fit) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
if !f.enableSidecarContainers && hasRestartableInitContainer(pod) {
// The scheduler calculates the resource usage of a Pod containing
// restartable init containers as equal to or greater than what the
// kubelet will require to run the Pod, so there is no overbooking.
// However, to avoid inconsistent resource calculation between the
// scheduler and older (pre-v1.28) kubelets, make the Pod unschedulable.
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "Pod has a restartable init container and the SidecarContainers feature is disabled")
}
cycleState.Write(preFilterStateKey, computePodResourceRequest(pod, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}))
return nil, nil
}
// PreFilterExtensions do not exist for this plugin; the state is not affected by pod additions or deletions.
func (f *Fit) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to NodeResourcesFit.preFilterState error", c)
}
return s, nil
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (f *Fit) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
podActionType := framework.Delete
if f.enableInPlacePodVerticalScaling {
// If InPlacePodVerticalScaling (KEP 1287) is enabled, then UpdatePodScaleDown event should be registered
// for this plugin since a Pod update may free up resources that make other Pods schedulable.
podActionType |= framework.UpdatePodScaleDown
}
// A note about UpdateNodeTaint/UpdateNodeLabel event:
// Ideally, it's supposed to register only Add | UpdateNodeAllocatable, because only a change in allocatable resources can change this plugin's result.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeAllocatable | framework.UpdateNodeTaint | framework.UpdateNodeLabel
if f.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
nodeActionType = framework.Add | framework.UpdateNodeAllocatable
}
return []framework.ClusterEventWithHint{
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: podActionType}, QueueingHintFn: f.isSchedulableAfterPodEvent},
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: f.isSchedulableAfterNodeChange},
}, nil
}
// isSchedulableAfterPodEvent is invoked whenever a pod is deleted or scaled down. It checks whether
// that change made a previously unschedulable pod schedulable.
func (f *Fit) isSchedulableAfterPodEvent(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalPod, modifiedPod, err := schedutil.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if modifiedPod == nil {
if originalPod.Spec.NodeName == "" {
logger.V(5).Info("the deleted pod was unscheduled and it wouldn't make the unscheduled pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.QueueSkip, nil
}
// any deletion event to a scheduled pod could make the unscheduled pod schedulable.
logger.V(5).Info("another scheduled pod was deleted, and it may make the unscheduled pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.Queue, nil
}
if !f.enableInPlacePodVerticalScaling {
// If InPlacePodVerticalScaling (KEP 1287) is disabled, the pod scale down event cannot free up any resources.
logger.V(5).Info("another pod was modified, but InPlacePodVerticalScaling is disabled, so it doesn't make the unscheduled pod schedulable", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
if !f.isSchedulableAfterPodScaleDown(pod, originalPod, modifiedPod) {
if loggerV := logger.V(10); loggerV.Enabled() {
// Log more information.
loggerV.Info("pod got scaled down, but the modification isn't related to the resource requests of the target pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod), "diff", cmp.Diff(originalPod, modifiedPod))
} else {
logger.V(5).Info("pod got scaled down, but the modification isn't related to the resource requests of the target pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
}
return framework.QueueSkip, nil
}
logger.V(5).Info("another scheduled pod or the target pod itself got scaled down, and it may make the unscheduled pod schedulable", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
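// asHelperExample is an illustrative sketch, not part of the plugin: it shows the
// schedutil.As generic cast used by the queueing hints above to turn untyped event
// objects back into typed pods. For a delete event, newObj is nil and only the old
// (deleted) object is returned.
func asHelperExample() (*v1.Pod, error) {
	var oldObj interface{} = &v1.Pod{}
	deletedPod, _, err := schedutil.As[*v1.Pod](oldObj, nil)
	return deletedPod, err
}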
// isSchedulableAfterPodScaleDown checks whether the scale down event may make the target pod schedulable. Specifically:
// - Returns true when the update event is for the target pod itself.
// - Returns true when the update event shows a scheduled pod's resource request that the target pod also requests got reduced.
func (f *Fit) isSchedulableAfterPodScaleDown(targetPod, originalPod, modifiedPod *v1.Pod) bool {
if modifiedPod.UID == targetPod.UID {
// If the scaling down event is for targetPod, it would make targetPod schedulable.
return true
}
if modifiedPod.Spec.NodeName == "" {
// If the update event is for an unscheduled Pod,
// it wouldn't make targetPod schedulable.
return false
}
// the other pod was scheduled, so modification or deletion may free up some resources.
originalMaxResourceReq, modifiedMaxResourceReq := &framework.Resource{}, &framework.Resource{}
originalMaxResourceReq.SetMaxResource(resource.PodRequests(originalPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling}))
modifiedMaxResourceReq.SetMaxResource(resource.PodRequests(modifiedPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling}))
// check whether the resource request of the modified pod is less than the original pod.
podRequests := resource.PodRequests(targetPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling})
for rName, rValue := range podRequests {
if rValue.IsZero() {
// We only care about the resources requested by the pod we are trying to schedule.
continue
}
switch rName {
case v1.ResourceCPU:
if originalMaxResourceReq.MilliCPU > modifiedMaxResourceReq.MilliCPU {
return true
}
case v1.ResourceMemory:
if originalMaxResourceReq.Memory > modifiedMaxResourceReq.Memory {
return true
}
case v1.ResourceEphemeralStorage:
if originalMaxResourceReq.EphemeralStorage > modifiedMaxResourceReq.EphemeralStorage {
return true
}
default:
if schedutil.IsScalarResourceName(rName) && originalMaxResourceReq.ScalarResources[rName] > modifiedMaxResourceReq.ScalarResources[rName] {
return true
}
}
}
return false
}
// isSchedulableAfterNodeChange is invoked whenever a node is added or changed. It checks whether
// that change could make a previously unschedulable pod schedulable.
func (f *Fit) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
// Leaving in the queue, since the pod won't fit into the modified node anyway.
if !isFit(pod, modifiedNode, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}) {
logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// The pod will fit and the node was just added, so unblock scheduling.
if originalNode == nil {
logger.V(5).Info("node was added and it might fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// The pod will fit, but since there was no increase in available resources, the change won't make the pod schedulable.
if !haveAnyRequestedResourcesIncreased(pod, originalNode, modifiedNode, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}) {
logger.V(5).Info("node was updated, but haven't changed the pod's resource requestments fit assessment", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("node was updated, and may now fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// haveAnyRequestedResourcesIncreased returns true if the node's allocatable amount of any resource requested by the pod has increased, or if the allowed pod number has increased.
func haveAnyRequestedResourcesIncreased(pod *v1.Pod, originalNode, modifiedNode *v1.Node, opts ResourceRequestsOptions) bool {
podRequest := computePodResourceRequest(pod, opts)
originalNodeInfo := framework.NewNodeInfo()
originalNodeInfo.SetNode(originalNode)
modifiedNodeInfo := framework.NewNodeInfo()
modifiedNodeInfo.SetNode(modifiedNode)
if modifiedNodeInfo.Allocatable.AllowedPodNumber > originalNodeInfo.Allocatable.AllowedPodNumber {
return true
}
if podRequest.MilliCPU == 0 &&
podRequest.Memory == 0 &&
podRequest.EphemeralStorage == 0 &&
len(podRequest.ScalarResources) == 0 {
return false
}
if (podRequest.MilliCPU > 0 && modifiedNodeInfo.Allocatable.MilliCPU > originalNodeInfo.Allocatable.MilliCPU) ||
(podRequest.Memory > 0 && modifiedNodeInfo.Allocatable.Memory > originalNodeInfo.Allocatable.Memory) ||
(podRequest.EphemeralStorage > 0 && modifiedNodeInfo.Allocatable.EphemeralStorage > originalNodeInfo.Allocatable.EphemeralStorage) {
return true
}
for rName, rQuant := range podRequest.ScalarResources {
// Skip in case request quantity is zero
if rQuant == 0 {
continue
}
if modifiedNodeInfo.Allocatable.ScalarResources[rName] > originalNodeInfo.Allocatable.ScalarResources[rName] {
return true
}
}
return false
}
// isFit checks if the pod fits the node. If the node is nil, it returns false.
// It constructs a fake NodeInfo object for the node and checks if the pod fits the node.
func isFit(pod *v1.Pod, node *v1.Node, opts ResourceRequestsOptions) bool {
if node == nil {
return false
}
nodeInfo := framework.NewNodeInfo()
nodeInfo.SetNode(node)
return len(Fits(pod, nodeInfo, opts)) == 0
}
// Filter invoked at the filter extension point.
// Checks if a node has sufficient resources, such as CPU, memory, GPU, and opaque int resources, to run a pod.
// It returns a list of insufficient resources; if the list is empty, the node has all the resources requested by the pod.
func (f *Fit) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
insufficientResources := fitsRequest(s, nodeInfo, f.ignoredResources, f.ignoredResourceGroups)
if len(insufficientResources) != 0 {
// We will keep all failure reasons.
failureReasons := make([]string, 0, len(insufficientResources))
for i := range insufficientResources {
failureReasons = append(failureReasons, insufficientResources[i].Reason)
}
return framework.NewStatus(framework.Unschedulable, failureReasons...)
}
return nil
}
func hasRestartableInitContainer(pod *v1.Pod) bool {
for _, c := range pod.Spec.InitContainers {
if c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways {
return true
}
}
return false
}
// InsufficientResource describes what kind of resource limit is hit and caused the pod to not fit the node.
type InsufficientResource struct {
ResourceName v1.ResourceName
// We explicitly have a parameter for reason to avoid formatting a message on the fly
// for common resources, which is expensive for cluster autoscaler simulations.
Reason string
Requested int64
Used int64
Capacity int64
}
// Fits checks if a node has enough resources to host the pod.
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo, opts ResourceRequestsOptions) []InsufficientResource {
return fitsRequest(computePodResourceRequest(pod, opts), nodeInfo, nil, nil)
}
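// exampleFitsUsage is a hypothetical sketch (not part of the plugin) of how an external
// caller such as an autoscaler simulation might use Fits directly: build a NodeInfo for a
// candidate node and collect the precomputed Reason strings of everything that does not fit.
// The function name and the zero-valued ResourceRequestsOptions are illustrative assumptions.
func exampleFitsUsage(pod *v1.Pod, node *v1.Node) []string {
var reasons []string
nodeInfo := framework.NewNodeInfo()
nodeInfo.SetNode(node)
for _, insufficient := range Fits(pod, nodeInfo, ResourceRequestsOptions{}) {
reasons = append(reasons, insufficient.Reason)
}
return reasons
}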
func fitsRequest(podRequest *preFilterState, nodeInfo *framework.NodeInfo, ignoredExtendedResources, ignoredResourceGroups sets.Set[string]) []InsufficientResource {
insufficientResources := make([]InsufficientResource, 0, 4)
allowedPodNumber := nodeInfo.Allocatable.AllowedPodNumber
if len(nodeInfo.Pods)+1 > allowedPodNumber {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: v1.ResourcePods,
Reason: "Too many pods",
Requested: 1,
Used: int64(len(nodeInfo.Pods)),
Capacity: int64(allowedPodNumber),
})
}
if podRequest.MilliCPU == 0 &&
podRequest.Memory == 0 &&
podRequest.EphemeralStorage == 0 &&
len(podRequest.ScalarResources) == 0 {
return insufficientResources
}
if podRequest.MilliCPU > 0 && podRequest.MilliCPU > (nodeInfo.Allocatable.MilliCPU-nodeInfo.Requested.MilliCPU) {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: v1.ResourceCPU,
Reason: "Insufficient cpu",
Requested: podRequest.MilliCPU,
Used: nodeInfo.Requested.MilliCPU,
Capacity: nodeInfo.Allocatable.MilliCPU,
})
}
if podRequest.Memory > 0 && podRequest.Memory > (nodeInfo.Allocatable.Memory-nodeInfo.Requested.Memory) {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: v1.ResourceMemory,
Reason: "Insufficient memory",
Requested: podRequest.Memory,
Used: nodeInfo.Requested.Memory,
Capacity: nodeInfo.Allocatable.Memory,
})
}
if podRequest.EphemeralStorage > 0 &&
podRequest.EphemeralStorage > (nodeInfo.Allocatable.EphemeralStorage-nodeInfo.Requested.EphemeralStorage) {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: v1.ResourceEphemeralStorage,
Reason: "Insufficient ephemeral-storage",
Requested: podRequest.EphemeralStorage,
Used: nodeInfo.Requested.EphemeralStorage,
Capacity: nodeInfo.Allocatable.EphemeralStorage,
})
}
for rName, rQuant := range podRequest.ScalarResources {
// Skip in case request quantity is zero
if rQuant == 0 {
continue
}
if v1helper.IsExtendedResourceName(rName) {
// If this resource is one of the extended resources that should be ignored, we will skip checking it.
// rName is guaranteed to have a slash due to API validation.
var rNamePrefix string
if ignoredResourceGroups.Len() > 0 {
rNamePrefix = strings.Split(string(rName), "/")[0]
}
if ignoredExtendedResources.Has(string(rName)) || ignoredResourceGroups.Has(rNamePrefix) {
continue
}
}
if rQuant > (nodeInfo.Allocatable.ScalarResources[rName] - nodeInfo.Requested.ScalarResources[rName]) {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: rName,
Reason: fmt.Sprintf("Insufficient %v", rName),
Requested: podRequest.ScalarResources[rName],
Used: nodeInfo.Requested.ScalarResources[rName],
Capacity: nodeInfo.Allocatable.ScalarResources[rName],
})
}
}
return insufficientResources
}
// Score invoked at the Score extension point.
func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := f.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
s, err := getPreScoreState(state)
if err != nil {
s = &preScoreState{
podRequests: f.calculatePodResourceRequestList(pod, f.resources),
}
}
return f.score(ctx, pod, nodeInfo, s.podRequests)
}

View File

@ -0,0 +1,61 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// leastResourceScorer favors nodes with fewer requested resources.
// It calculates the percentage of memory, CPU and other resources requested by pods scheduled on the node, and
// prioritizes based on the minimum of the average of the fraction of requested to capacity.
//
// Details:
// (cpu((capacity-requested)*MaxNodeScore*cpuWeight/capacity) + memory((capacity-requested)*MaxNodeScore*memoryWeight/capacity) + ...)/weightSum
func leastResourceScorer(resources []config.ResourceSpec) func([]int64, []int64) int64 {
return func(requested, allocable []int64) int64 {
var nodeScore, weightSum int64
for i := range requested {
if allocable[i] == 0 {
continue
}
weight := resources[i].Weight
resourceScore := leastRequestedScore(requested[i], allocable[i])
nodeScore += resourceScore * weight
weightSum += weight
}
if weightSum == 0 {
return 0
}
return nodeScore / weightSum
}
}
// The unused capacity is calculated on a scale of 0-MaxNodeScore,
// 0 being the lowest priority and `MaxNodeScore` being the highest.
// The more unused resources, the higher the score.
func leastRequestedScore(requested, capacity int64) int64 {
if capacity == 0 {
return 0
}
if requested > capacity {
return 0
}
return ((capacity - requested) * framework.MaxNodeScore) / capacity
}
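// exampleLeastRequestedScore is a hypothetical sketch (not part of the plugin) that works
// through the formula above with illustrative numbers: on a node with 4000 millicores
// allocatable and 1000 requested (including the incoming pod), the unused fraction is 3/4,
// so with MaxNodeScore=100 the per-resource score is ((4000-1000)*100)/4000 = 75.
func exampleLeastRequestedScore() int64 {
return leastRequestedScore(1000, 4000) // 75
}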

View File

@ -0,0 +1,65 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// mostResourceScorer favors nodes with most requested resources.
// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
// based on the maximum of the average of the fraction of requested to capacity.
//
// Details:
// (cpu(MaxNodeScore * requested * cpuWeight / capacity) + memory(MaxNodeScore * requested * memoryWeight / capacity) + ...) / weightSum
func mostResourceScorer(resources []config.ResourceSpec) func(requested, allocable []int64) int64 {
return func(requested, allocable []int64) int64 {
var nodeScore, weightSum int64
for i := range requested {
if allocable[i] == 0 {
continue
}
weight := resources[i].Weight
resourceScore := mostRequestedScore(requested[i], allocable[i])
nodeScore += resourceScore * weight
weightSum += weight
}
if weightSum == 0 {
return 0
}
return nodeScore / weightSum
}
}
// The used capacity is calculated on a scale of 0-MaxNodeScore (MaxNodeScore is
// a constant with value 100), 0 being the lowest priority and 100 the highest.
// The more resources are used, the higher the score. This function
// is almost a reversed version of noderesources.leastRequestedScore.
func mostRequestedScore(requested, capacity int64) int64 {
if capacity == 0 {
return 0
}
if requested > capacity {
// `requested` might be greater than `capacity` because pods with no
// requests get minimum values.
requested = capacity
}
return (requested * framework.MaxNodeScore) / capacity
}
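// exampleMostRequestedScore is a hypothetical sketch (not part of the plugin) that works
// through the formula above with illustrative numbers: with 1000 of 4000 millicores
// requested, the used fraction is 1/4, so the per-resource score is (1000*100)/4000 = 25,
// the mirror image of the least-requested example.
func exampleMostRequestedScore() int64 {
return mostRequestedScore(1000, 4000) // 25
}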

View File

@ -0,0 +1,73 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"math"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
)
const maxUtilization = 100
// buildRequestedToCapacityRatioScorerFunction allows users to apply bin packing
// on core resources like CPU, Memory as well as extended resources like accelerators.
func buildRequestedToCapacityRatioScorerFunction(scoringFunctionShape helper.FunctionShape, resources []config.ResourceSpec) func([]int64, []int64) int64 {
rawScoringFunction := helper.BuildBrokenLinearFunction(scoringFunctionShape)
resourceScoringFunction := func(requested, capacity int64) int64 {
if capacity == 0 || requested > capacity {
return rawScoringFunction(maxUtilization)
}
return rawScoringFunction(requested * maxUtilization / capacity)
}
return func(requested, allocable []int64) int64 {
var nodeScore, weightSum int64
for i := range requested {
if allocable[i] == 0 {
continue
}
weight := resources[i].Weight
resourceScore := resourceScoringFunction(requested[i], allocable[i])
if resourceScore > 0 {
nodeScore += resourceScore * weight
weightSum += weight
}
}
if weightSum == 0 {
return 0
}
return int64(math.Round(float64(nodeScore) / float64(weightSum)))
}
}
func requestedToCapacityRatioScorer(resources []config.ResourceSpec, shape []config.UtilizationShapePoint) func([]int64, []int64) int64 {
shapes := make([]helper.FunctionShapePoint, 0, len(shape))
for _, point := range shape {
shapes = append(shapes, helper.FunctionShapePoint{
Utilization: int64(point.Utilization),
// MaxCustomPriorityScore may diverge from the max score used in the scheduler and defined by MaxNodeScore,
// therefore we need to scale the score returned by requested to capacity ratio to the score range
// used by the scheduler.
Score: int64(point.Score) * (framework.MaxNodeScore / config.MaxCustomPriorityScore),
})
}
return buildRequestedToCapacityRatioScorerFunction(shapes, resources)
}
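// exampleBinPackingScorer is a hypothetical sketch (not part of the plugin) showing how a
// rising shape turns the scorer into a bin-packing score. The shape points, resource names,
// and quantities below are illustrative assumptions: with points (0% -> 0) and
// (100% -> MaxNodeScore), a node that would be 50% utilized on both CPU and memory after
// placing the pod scores 50.
func exampleBinPackingScorer() int64 {
shape := helper.FunctionShape{
{Utilization: 0, Score: 0},
{Utilization: maxUtilization, Score: framework.MaxNodeScore},
}
resources := []config.ResourceSpec{
{Name: "cpu", Weight: 1},
{Name: "memory", Weight: 1},
}
score := buildRequestedToCapacityRatioScorerFunction(shape, resources)
// 500 of 1000 millicores and 1GiB of 2GiB memory requested -> 50% utilization for each.
return score([]int64{500, 1 << 30}, []int64{1000, 2 << 30}) // 50
}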

View File

@ -0,0 +1,148 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
resourcehelper "k8s.io/component-helpers/resource"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
// scorer is a decorator for resourceAllocationScorer
type scorer func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer
// resourceAllocationScorer contains information to calculate resource allocation score.
type resourceAllocationScorer struct {
Name string
// used to decide whether to use Requested or NonZeroRequested for
// cpu and memory.
useRequested bool
scorer func(requested, allocable []int64) int64
resources []config.ResourceSpec
}
// score will use `scorer` function to calculate the score.
func (r *resourceAllocationScorer) score(
ctx context.Context,
pod *v1.Pod,
nodeInfo *framework.NodeInfo,
podRequests []int64) (int64, *framework.Status) {
logger := klog.FromContext(ctx)
node := nodeInfo.Node()
// If resources are not set, there is nothing to score; return an error.
if len(r.resources) == 0 {
return 0, framework.NewStatus(framework.Error, "resources not found")
}
requested := make([]int64, len(r.resources))
allocatable := make([]int64, len(r.resources))
for i := range r.resources {
alloc, req := r.calculateResourceAllocatableRequest(logger, nodeInfo, v1.ResourceName(r.resources[i].Name), podRequests[i])
// Only fill the extended resource entry when it's non-zero.
if alloc == 0 {
continue
}
allocatable[i] = alloc
requested[i] = req
}
score := r.scorer(requested, allocatable)
if loggerV := logger.V(10); loggerV.Enabled() { // Serializing these maps is costly.
loggerV.Info("Listed internal info for allocatable resources, requested resources and score", "pod",
klog.KObj(pod), "node", klog.KObj(node), "resourceAllocationScorer", r.Name,
"allocatableResource", allocatable, "requestedResource", requested, "resourceScore", score,
)
}
return score, nil
}
// calculateResourceAllocatableRequest returns 2 parameters:
// - 1st param: quantity of allocatable resource on the node.
// - 2nd param: aggregated quantity of requested resource on the node.
// Note: if it's an extended resource, and the pod doesn't request it, (0, 0) is returned.
func (r *resourceAllocationScorer) calculateResourceAllocatableRequest(logger klog.Logger, nodeInfo *framework.NodeInfo, resource v1.ResourceName, podRequest int64) (int64, int64) {
requested := nodeInfo.NonZeroRequested
if r.useRequested {
requested = nodeInfo.Requested
}
// If it's an extended resource, and the pod doesn't request it. We return (0, 0)
// as an implication to bypass scoring on this resource.
if podRequest == 0 && schedutil.IsScalarResourceName(resource) {
return 0, 0
}
switch resource {
case v1.ResourceCPU:
return nodeInfo.Allocatable.MilliCPU, (requested.MilliCPU + podRequest)
case v1.ResourceMemory:
return nodeInfo.Allocatable.Memory, (requested.Memory + podRequest)
case v1.ResourceEphemeralStorage:
return nodeInfo.Allocatable.EphemeralStorage, (nodeInfo.Requested.EphemeralStorage + podRequest)
default:
if _, exists := nodeInfo.Allocatable.ScalarResources[resource]; exists {
return nodeInfo.Allocatable.ScalarResources[resource], (nodeInfo.Requested.ScalarResources[resource] + podRequest)
}
}
logger.V(10).Info("Requested resource is omitted for node score calculation", "resourceName", resource)
return 0, 0
}
// calculatePodResourceRequest returns the total non-zero requests. If Overhead is defined for the pod,
// the Overhead is added to the result.
func (r *resourceAllocationScorer) calculatePodResourceRequest(pod *v1.Pod, resourceName v1.ResourceName) int64 {
opts := resourcehelper.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
}
if !r.useRequested {
opts.NonMissingContainerRequests = v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(schedutil.DefaultMilliCPURequest, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(schedutil.DefaultMemoryRequest, resource.DecimalSI),
}
}
requests := resourcehelper.PodRequests(pod, opts)
quantity := requests[resourceName]
if resourceName == v1.ResourceCPU {
return quantity.MilliValue()
}
return quantity.Value()
}
func (r *resourceAllocationScorer) calculatePodResourceRequestList(pod *v1.Pod, resources []config.ResourceSpec) []int64 {
podRequests := make([]int64, len(resources))
for i := range resources {
podRequests[i] = r.calculatePodResourceRequest(pod, v1.ResourceName(resources[i].Name))
}
return podRequests
}
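// exampleRequestList is a hypothetical sketch (not part of the plugin) of the index contract
// used by score: podRequests[i] always corresponds to resources[i], expressed in millicores
// for CPU and in raw integer values (bytes for memory) otherwise. The method name and the
// resource list are illustrative assumptions.
func (r *resourceAllocationScorer) exampleRequestList(pod *v1.Pod) []int64 {
resources := []config.ResourceSpec{
{Name: string(v1.ResourceCPU), Weight: 1},
{Name: string(v1.ResourceMemory), Weight: 1},
}
return r.calculatePodResourceRequestList(pod, resources)
}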

View File

@ -0,0 +1,57 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"github.com/google/go-cmp/cmp/cmpopts"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/validation/field"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
)
var (
ignoreBadValueDetail = cmpopts.IgnoreFields(field.Error{}, "BadValue", "Detail")
defaultResources = []config.ResourceSpec{
{Name: string(v1.ResourceCPU), Weight: 1},
{Name: string(v1.ResourceMemory), Weight: 1},
}
extendedRes = "abc.com/xyz"
extendedResourceSet = []config.ResourceSpec{
{Name: string(v1.ResourceCPU), Weight: 1},
{Name: string(v1.ResourceMemory), Weight: 1},
{Name: extendedRes, Weight: 1},
}
)
func makeNode(node string, milliCPU, memory int64, extendedResource map[string]int64) *v1.Node {
resourceList := make(map[v1.ResourceName]resource.Quantity)
for res, quantity := range extendedResource {
resourceList[v1.ResourceName(res)] = *resource.NewQuantity(quantity, resource.DecimalSI)
}
resourceList[v1.ResourceCPU] = *resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
resourceList[v1.ResourceMemory] = *resource.NewQuantity(memory, resource.BinarySI)
return &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: node},
Status: v1.NodeStatus{
Capacity: resourceList,
Allocatable: resourceList,
},
}
}

View File

@ -0,0 +1,154 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeunschedulable
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
v1helper "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// NodeUnschedulable is a plugin that filters nodes with node.Spec.Unschedulable=true unless
// the pod tolerates the {key=node.kubernetes.io/unschedulable, effect:NoSchedule} taint.
type NodeUnschedulable struct {
enableSchedulingQueueHint bool
}
var _ framework.FilterPlugin = &NodeUnschedulable{}
var _ framework.EnqueueExtensions = &NodeUnschedulable{}
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.NodeUnschedulable
const (
// ErrReasonUnknownCondition is used for NodeUnknownCondition predicate error.
ErrReasonUnknownCondition = "node(s) had unknown conditions"
// ErrReasonUnschedulable is used for NodeUnschedulable predicate error.
ErrReasonUnschedulable = "node(s) were unschedulable"
)
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodeUnschedulable) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
if !pl.enableSchedulingQueueHint {
return []framework.ClusterEventWithHint{
// A note about UpdateNodeLabel event:
// Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
return []framework.ClusterEventWithHint{
// When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeLabel event.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
// When the QueueingHint feature is enabled,
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
}, nil
}
// isSchedulableAfterPodTolerationChange is invoked whenever a pod's toleration changed.
func (pl *NodeUnschedulable) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if pod.UID == modifiedPod.UID {
// Note: we don't need to check whether oldPod's tolerations tolerate the taint because:
// - Tolerations can be added to a Pod, but can't be modified nor removed.
// - If the Pod already had the toleration, it shouldn't have been rejected by this plugin in the first place.
// Meaning, this Pod has been rejected by this plugin, and hence it shouldn't have the toleration yet.
if v1helper.TolerationsTolerateTaint(modifiedPod.Spec.Tolerations, &v1.Taint{
Key: v1.TaintNodeUnschedulable,
Effect: v1.TaintEffectNoSchedule,
}) {
// This update makes the pod tolerate the unschedulable taint.
logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a new toleration is added for the unschedulable Pod, but it's an unrelated toleration", "pod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// isSchedulableAfterNodeChange is invoked for all node events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable.
func (pl *NodeUnschedulable) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
// We queue this Pod when -
// 1. the node is updated from unschedulable to schedulable.
// 2. the node is added and is schedulable.
if (originalNode != nil && originalNode.Spec.Unschedulable && !modifiedNode.Spec.Unschedulable) ||
(originalNode == nil && !modifiedNode.Spec.Unschedulable) {
logger.V(5).Info("node was created or updated, pod may be schedulable now", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
logger.V(5).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *NodeUnschedulable) Name() string {
return Name
}
// Filter invoked at the filter extension point.
func (pl *NodeUnschedulable) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
node := nodeInfo.Node()
if !node.Spec.Unschedulable {
return nil
}
// If the pod tolerates the unschedulable taint, it also tolerates `node.Spec.Unschedulable`.
podToleratesUnschedulable := v1helper.TolerationsTolerateTaint(pod.Spec.Tolerations, &v1.Taint{
Key: v1.TaintNodeUnschedulable,
Effect: v1.TaintEffectNoSchedule,
})
if !podToleratesUnschedulable {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonUnschedulable)
}
return nil
}
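// exampleUnschedulableToleration is a hypothetical sketch (not part of the plugin) of the
// toleration that lets a pod pass this Filter on a cordoned node (node.Spec.Unschedulable=true):
// it tolerates the node.kubernetes.io/unschedulable:NoSchedule taint via an Exists operator.
var exampleUnschedulableToleration = v1.Toleration{
Key: v1.TaintNodeUnschedulable,
Operator: v1.TolerationOpExists,
Effect: v1.TaintEffectNoSchedule,
}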
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &NodeUnschedulable{enableSchedulingQueueHint: fts.EnableSchedulingQueueHint}, nil
}

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-storage-approvers
- cofyc
reviewers:
- sig-storage-reviewers
- cofyc
labels:
- sig/storage

View File

@ -0,0 +1,539 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodevolumelimits
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/rand"
corelisters "k8s.io/client-go/listers/core/v1"
storagelisters "k8s.io/client-go/listers/storage/v1"
ephemeral "k8s.io/component-helpers/storage/ephemeral"
storagehelpers "k8s.io/component-helpers/storage/volume"
csitrans "k8s.io/csi-translation-lib"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
const (
// ErrReasonMaxVolumeCountExceeded is used for MaxVolumeCount predicate error.
ErrReasonMaxVolumeCountExceeded = "node(s) exceed max volume count"
)
// InTreeToCSITranslator contains methods required to check migratable status
// and perform translations from in-tree PVs to CSI.
type InTreeToCSITranslator interface {
IsPVMigratable(pv *v1.PersistentVolume) bool
IsInlineMigratable(vol *v1.Volume) bool
IsMigratableIntreePluginByName(inTreePluginName string) bool
GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
GetCSINameFromInTreeName(pluginName string) (string, error)
TranslateInTreePVToCSI(logger klog.Logger, pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
TranslateInTreeInlineVolumeToCSI(logger klog.Logger, volume *v1.Volume, podNamespace string) (*v1.PersistentVolume, error)
}
// CSILimits is a plugin that checks node volume limits.
type CSILimits struct {
csiNodeLister storagelisters.CSINodeLister
pvLister corelisters.PersistentVolumeLister
pvcLister corelisters.PersistentVolumeClaimLister
scLister storagelisters.StorageClassLister
vaLister storagelisters.VolumeAttachmentLister
randomVolumeIDPrefix string
translator InTreeToCSITranslator
}
var _ framework.PreFilterPlugin = &CSILimits{}
var _ framework.FilterPlugin = &CSILimits{}
var _ framework.EnqueueExtensions = &CSILimits{}
// CSIName is the name of the plugin used in the plugin registry and configurations.
const CSIName = names.NodeVolumeLimits
// Name returns name of the plugin. It is used in logs, etc.
func (pl *CSILimits) Name() string {
return CSIName
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *CSILimits) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
return []framework.ClusterEventWithHint{
// We don't register any `QueueingHintFn` intentionally
// because any new CSINode could make pods that were rejected by CSI volumes schedulable.
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add}},
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}, QueueingHintFn: pl.isSchedulableAfterPVCAdded},
{Event: framework.ClusterEvent{Resource: framework.VolumeAttachment, ActionType: framework.Delete}},
}, nil
}
func (pl *CSILimits) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
deletedPod, _, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPodDeleted: %w", err)
}
if len(deletedPod.Spec.Volumes) == 0 {
return framework.QueueSkip, nil
}
if deletedPod.Spec.NodeName == "" {
return framework.QueueSkip, nil
}
for _, vol := range deletedPod.Spec.Volumes {
if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(&vol) {
return framework.Queue, nil
}
}
logger.V(5).Info("The deleted pod does not impact the scheduling of the unscheduled pod", "deletedPod", klog.KObj(pod), "pod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}
func (pl *CSILimits) isSchedulableAfterPVCAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, addedPvc, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPVCAdded: %w", err)
}
if addedPvc.Namespace != pod.Namespace {
return framework.QueueSkip, nil
}
for _, volumes := range pod.Spec.Volumes {
var pvcName string
switch {
case volumes.PersistentVolumeClaim != nil:
pvcName = volumes.PersistentVolumeClaim.ClaimName
case volumes.Ephemeral != nil:
pvcName = ephemeral.VolumeClaimName(pod, &volumes)
default:
// Volume is not using a PVC, ignore
continue
}
if pvcName == addedPvc.Name {
logger.V(5).Info("PVC that is referred from the pod was created, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(addedPvc))
return framework.Queue, nil
}
}
logger.V(5).Info("PVC irrelevant to the Pod was created, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(addedPvc))
return framework.QueueSkip, nil
}
// PreFilter invoked at the prefilter extension point
//
// If the pod doesn't have any of those volume types, we'll skip the Filter phase
func (pl *CSILimits) PreFilter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
volumes := pod.Spec.Volumes
for i := range volumes {
vol := &volumes[i]
if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(vol) {
return nil, nil
}
}
return nil, framework.NewStatus(framework.Skip)
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *CSILimits) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
// Filter invoked at the filter extension point.
func (pl *CSILimits) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
// If the new pod doesn't have any volume attached to it, the predicate will always be true
if len(pod.Spec.Volumes) == 0 {
return nil
}
node := nodeInfo.Node()
logger := klog.FromContext(ctx)
csiNode, err := pl.csiNodeLister.Get(node.Name)
if err != nil {
// TODO: return the error once CSINode is created by default (2 releases)
logger.V(5).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
}
// Count CSI volumes from the new pod
newVolumes := make(map[string]string)
if err := pl.filterAttachableVolumes(logger, pod, csiNode, true /* new pod */, newVolumes); err != nil {
if apierrors.IsNotFound(err) {
// PVC is not found. This Pod will never be schedulable until PVC is created.
return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
}
return framework.AsStatus(err)
}
// If the pod doesn't have any new CSI volumes, the predicate will always be true
if len(newVolumes) == 0 {
return nil
}
// If the node doesn't have volume limits, the predicate will always be true
nodeVolumeLimits := getVolumeLimits(csiNode)
if len(nodeVolumeLimits) == 0 {
return nil
}
// Count CSI volumes from existing pods
attachedVolumes := make(map[string]string)
for _, existingPod := range nodeInfo.Pods {
if err := pl.filterAttachableVolumes(logger, existingPod.Pod, csiNode, false /* existing pod */, attachedVolumes); err != nil {
return framework.AsStatus(err)
}
}
attachedVolumeCount := map[string]int{}
for volumeUniqueName, driverName := range attachedVolumes {
// Don't count single volume used in multiple pods more than once
delete(newVolumes, volumeUniqueName)
attachedVolumeCount[driverName]++
}
// Count CSI volumes from VolumeAttachments
volumeAttachments, err := pl.getNodeVolumeAttachmentInfo(logger, node.Name)
if err != nil {
return framework.AsStatus(err)
}
for volumeUniqueName, driverName := range volumeAttachments {
// Avoid double-counting volumes already used by existing pods
if _, exists := attachedVolumes[volumeUniqueName]; !exists {
attachedVolumeCount[driverName]++
}
}
// Count the new volumes count per driver
newVolumeCount := map[string]int{}
for _, driverName := range newVolumes {
newVolumeCount[driverName]++
}
for driverName, count := range newVolumeCount {
maxVolumeLimit, ok := nodeVolumeLimits[driverName]
if ok {
currentVolumeCount := attachedVolumeCount[driverName]
logger.V(5).Info("Found plugin volume limits", "node", node.Name, "driverName", driverName,
"maxLimits", maxVolumeLimit, "currentVolumeCount", currentVolumeCount, "newVolumeCount", count,
"pod", klog.KObj(pod))
if currentVolumeCount+count > int(maxVolumeLimit) {
return framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded)
}
}
}
return nil
}
// filterAttachableVolumes filters the attachable volumes from the pod and adds them to the result map.
// The result map is a map of volumeUniqueName to driver name. The volumeUniqueName is a unique name for
// the volume in the format of "driverName/volumeHandle". And driver name is the CSI driver name.
func (pl *CSILimits) filterAttachableVolumes(
logger klog.Logger, pod *v1.Pod, csiNode *storagev1.CSINode, newPod bool, result map[string]string) error {
for _, vol := range pod.Spec.Volumes {
pvcName := ""
isEphemeral := false
switch {
case vol.PersistentVolumeClaim != nil:
// Normal CSI volume can only be used through PVC
pvcName = vol.PersistentVolumeClaim.ClaimName
case vol.Ephemeral != nil:
// Generic ephemeral inline volumes also use a PVC,
// just with a computed name and certain ownership.
// That is checked below once the pvc object is
// retrieved.
pvcName = ephemeral.VolumeClaimName(pod, &vol)
isEphemeral = true
default:
// Inline Volume does not have PVC.
// Need to check if CSI migration is enabled for this inline volume.
// - If the volume is migratable and CSI migration is enabled, need to count it
// as well.
// - If the volume is not migratable, it will be counted by the non-CSI filter.
if err := pl.checkAttachableInlineVolume(logger, &vol, csiNode, pod, result); err != nil {
return err
}
continue
}
if pvcName == "" {
return fmt.Errorf("PersistentVolumeClaim had no name")
}
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
if err != nil {
if newPod {
// The PVC is required to proceed with
// scheduling of a new pod because it cannot
// run without it. Bail out immediately.
return fmt.Errorf("looking up PVC %s/%s: %w", pod.Namespace, pvcName, err)
}
// If the PVC is invalid, we don't count the volume because
// there's no guarantee that it belongs to the running predicate.
logger.V(5).Info("Unable to look up PVC info", "pod", klog.KObj(pod), "PVC", klog.KRef(pod.Namespace, pvcName))
continue
}
// The PVC for an ephemeral volume must be owned by the pod.
if isEphemeral {
if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
return err
}
}
driverName, volumeHandle := pl.getCSIDriverInfo(logger, csiNode, pvc)
if driverName == "" || volumeHandle == "" {
logger.V(5).Info("Could not find a CSI driver name or volume handle, not counting volume")
continue
}
volumeUniqueName := getVolumeUniqueName(driverName, volumeHandle)
result[volumeUniqueName] = driverName
}
return nil
}
// checkAttachableInlineVolume takes an inline volume and adds it to the result map if the
// volume is migratable and CSI migration for this plugin has been enabled.
func (pl *CSILimits) checkAttachableInlineVolume(logger klog.Logger, vol *v1.Volume, csiNode *storagev1.CSINode,
pod *v1.Pod, result map[string]string) error {
if !pl.translator.IsInlineMigratable(vol) {
return nil
}
// Check if CSI migration has been enabled for the in-tree provisioner.
inTreeProvisionerName, err := pl.translator.GetInTreePluginNameFromSpec(nil, vol)
if err != nil {
return fmt.Errorf("looking up provisioner name for volume %s: %w", vol.Name, err)
}
if !isCSIMigrationOn(csiNode, inTreeProvisionerName) {
csiNodeName := ""
if csiNode != nil {
csiNodeName = csiNode.Name
}
logger.V(5).Info("CSI Migration is not enabled for provisioner", "provisioner", inTreeProvisionerName,
"pod", klog.KObj(pod), "csiNode", csiNodeName)
return nil
}
// Do translation for the in-tree volume.
translatedPV, err := pl.translator.TranslateInTreeInlineVolumeToCSI(logger, vol, pod.Namespace)
if err != nil || translatedPV == nil {
return fmt.Errorf("converting volume(%s) from inline to csi: %w", vol.Name, err)
}
driverName, err := pl.translator.GetCSINameFromInTreeName(inTreeProvisionerName)
if err != nil {
return fmt.Errorf("looking up CSI driver name for provisioner %s: %w", inTreeProvisionerName, err)
}
// TranslateInTreeInlineVolumeToCSI should translate inline volume to CSI. If it is not set,
// the volume does not support inline. Skip the count.
if translatedPV.Spec.PersistentVolumeSource.CSI == nil {
return nil
}
volumeUniqueName := getVolumeUniqueName(driverName, translatedPV.Spec.PersistentVolumeSource.CSI.VolumeHandle)
result[volumeUniqueName] = driverName
return nil
}
// getCSIDriverInfo returns the CSI driver name and volume ID of a given PVC.
// If the PVC is from a migrated in-tree plugin, this function will return
// the information of the CSI driver that the plugin has been migrated to.
func (pl *CSILimits) getCSIDriverInfo(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
pvName := pvc.Spec.VolumeName
if pvName == "" {
logger.V(5).Info("Persistent volume had no name for claim", "PVC", klog.KObj(pvc))
return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
}
pv, err := pl.pvLister.Get(pvName)
if err != nil {
logger.V(5).Info("Unable to look up PV info for PVC and PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvName))
// If we can't fetch the PV associated with the PVC, maybe it got deleted
// or the PVC was prebound to a PV that hasn't been created yet.
// Fall back to using the StorageClass for volume counting.
return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
}
csiSource := pv.Spec.PersistentVolumeSource.CSI
if csiSource == nil {
// We make a fast path for non-CSI volumes that aren't migratable
if !pl.translator.IsPVMigratable(pv) {
return "", ""
}
pluginName, err := pl.translator.GetInTreePluginNameFromSpec(pv, nil)
if err != nil {
logger.V(5).Info("Unable to look up plugin name from PV spec", "err", err)
return "", ""
}
if !isCSIMigrationOn(csiNode, pluginName) {
logger.V(5).Info("CSI Migration of plugin is not enabled", "plugin", pluginName)
return "", ""
}
csiPV, err := pl.translator.TranslateInTreePVToCSI(logger, pv)
if err != nil {
logger.V(5).Info("Unable to translate in-tree volume to CSI", "err", err)
return "", ""
}
if csiPV.Spec.PersistentVolumeSource.CSI == nil {
logger.V(5).Info("Unable to get a valid volume source for translated PV", "PV", pvName)
return "", ""
}
csiSource = csiPV.Spec.PersistentVolumeSource.CSI
}
return csiSource.Driver, csiSource.VolumeHandle
}
// getCSIDriverInfoFromSC returns the CSI driver name and a random volume ID of a given PVC's StorageClass.
func (pl *CSILimits) getCSIDriverInfoFromSC(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
namespace := pvc.Namespace
pvcName := pvc.Name
scName := storagehelpers.GetPersistentVolumeClaimClass(pvc)
// If StorageClass is not set or not found, then PVC must be using immediate binding mode
// and hence it must be bound before scheduling. So it is safe to not count it.
if scName == "" {
logger.V(5).Info("PVC has no StorageClass", "PVC", klog.KObj(pvc))
return "", ""
}
storageClass, err := pl.scLister.Get(scName)
if err != nil {
logger.V(5).Info("Could not get StorageClass for PVC", "PVC", klog.KObj(pvc), "err", err)
return "", ""
}
// We use a random prefix to avoid conflict with volume IDs. If the PVC is bound during the execution of the
// predicate and there is another pod on the same node that uses the same volume, then we will overcount
// the volume and consider both volumes as different.
volumeHandle := fmt.Sprintf("%s-%s/%s", pl.randomVolumeIDPrefix, namespace, pvcName)
provisioner := storageClass.Provisioner
if pl.translator.IsMigratableIntreePluginByName(provisioner) {
if !isCSIMigrationOn(csiNode, provisioner) {
logger.V(5).Info("CSI Migration of provisioner is not enabled", "provisioner", provisioner)
return "", ""
}
driverName, err := pl.translator.GetCSINameFromInTreeName(provisioner)
if err != nil {
logger.V(5).Info("Unable to look up driver name from provisioner name", "provisioner", provisioner, "err", err)
return "", ""
}
return driverName, volumeHandle
}
return provisioner, volumeHandle
}
// NewCSI initializes a new plugin and returns it.
func NewCSI(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
informerFactory := handle.SharedInformerFactory()
pvLister := informerFactory.Core().V1().PersistentVolumes().Lister()
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
csiNodesLister := informerFactory.Storage().V1().CSINodes().Lister()
scLister := informerFactory.Storage().V1().StorageClasses().Lister()
vaLister := informerFactory.Storage().V1().VolumeAttachments().Lister()
csiTranslator := csitrans.New()
return &CSILimits{
csiNodeLister: csiNodesLister,
pvLister: pvLister,
pvcLister: pvcLister,
scLister: scLister,
vaLister: vaLister,
randomVolumeIDPrefix: rand.String(32),
translator: csiTranslator,
}, nil
}
// getVolumeLimits reads the volume limits from CSINode object and returns a map of volume limits.
// The key is the driver name and the value is the maximum number of volumes that can be attached to the node.
// If a key is not found in the map, it means there is no limit for the driver on the node.
func getVolumeLimits(csiNode *storagev1.CSINode) map[string]int64 {
nodeVolumeLimits := make(map[string]int64)
if csiNode == nil {
return nodeVolumeLimits
}
for _, d := range csiNode.Spec.Drivers {
if d.Allocatable != nil && d.Allocatable.Count != nil {
nodeVolumeLimits[d.Name] = int64(*d.Allocatable.Count)
}
}
return nodeVolumeLimits
}
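// exampleVolumeLimits is a hypothetical sketch (not part of the plugin) of the map shape
// returned by getVolumeLimits: a CSINode reporting Allocatable.Count=16 for a driver yields
// {"ebs.csi.aws.com": 16}. The driver name and count are illustrative assumptions.
func exampleVolumeLimits() map[string]int64 {
count := int32(16)
csiNode := &storagev1.CSINode{
Spec: storagev1.CSINodeSpec{
Drivers: []storagev1.CSINodeDriver{
{Name: "ebs.csi.aws.com", Allocatable: &storagev1.VolumeNodeResources{Count: &count}},
},
},
}
return getVolumeLimits(csiNode) // map["ebs.csi.aws.com"] == 16
}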
// getNodeVolumeAttachmentInfo returns a map of volumeID to driver name for the given node.
func (pl *CSILimits) getNodeVolumeAttachmentInfo(logger klog.Logger, nodeName string) (map[string]string, error) {
volumeAttachments := make(map[string]string)
vas, err := pl.vaLister.List(labels.Everything())
if err != nil {
return nil, err
}
for _, va := range vas {
if va.Spec.NodeName == nodeName {
if va.Spec.Attacher == "" {
logger.V(5).Info("VolumeAttachment has no attacher", "VolumeAttachment", klog.KObj(va))
continue
}
if va.Spec.Source.PersistentVolumeName == nil {
logger.V(5).Info("VolumeAttachment has no PV name", "VolumeAttachment", klog.KObj(va))
continue
}
pv, err := pl.pvLister.Get(*va.Spec.Source.PersistentVolumeName)
if err != nil {
logger.V(5).Info("Unable to get PV for VolumeAttachment", "VolumeAttachment", klog.KObj(va), "err", err)
continue
}
if pv.Spec.CSI == nil {
logger.V(5).Info("PV is not a CSI volume", "PV", klog.KObj(pv))
continue
}
volumeID := getVolumeUniqueName(va.Spec.Attacher, pv.Spec.CSI.VolumeHandle)
volumeAttachments[volumeID] = va.Spec.Attacher
}
}
return volumeAttachments, nil
}
func getVolumeUniqueName(driverName, volumeHandle string) string {
return fmt.Sprintf("%s/%s", driverName, volumeHandle)
}

View File

@ -0,0 +1,73 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodevolumelimits
import (
"strings"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
csilibplugins "k8s.io/csi-translation-lib/plugins"
"k8s.io/kubernetes/pkg/features"
)
// isCSIMigrationOn returns a boolean value indicating whether
// the CSI migration has been enabled for a particular storage plugin.
func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string) bool {
if csiNode == nil || len(pluginName) == 0 {
return false
}
// In-tree storage to CSI driver migration feature should be enabled,
// along with the plugin-specific one
switch pluginName {
case csilibplugins.AWSEBSInTreePluginName:
return true
case csilibplugins.PortworxVolumePluginName:
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx) {
return false
}
case csilibplugins.GCEPDInTreePluginName:
return true
case csilibplugins.AzureDiskInTreePluginName:
return true
case csilibplugins.CinderInTreePluginName:
return true
default:
return false
}
// The plugin name should be listed in the CSINode object annotation.
// This indicates that the plugin has been migrated to a CSI driver in the node.
csiNodeAnn := csiNode.GetAnnotations()
if csiNodeAnn == nil {
return false
}
var mpaSet sets.Set[string]
mpa := csiNodeAnn[v1.MigratedPluginsAnnotationKey]
if len(mpa) == 0 {
mpaSet = sets.New[string]()
} else {
tok := strings.Split(mpa, ",")
mpaSet = sets.New(tok...)
}
return mpaSet.Has(pluginName)
}

View File

@ -0,0 +1,174 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtopologyspread
import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
v1helper "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/utils/ptr"
)
type topologyPair struct {
key string
value string
}
// topologySpreadConstraint is an internal version for v1.TopologySpreadConstraint
// and where the selector is parsed.
// Fields are exported for comparison during testing.
type topologySpreadConstraint struct {
MaxSkew int32
TopologyKey string
Selector labels.Selector
MinDomains int32
NodeAffinityPolicy v1.NodeInclusionPolicy
NodeTaintsPolicy v1.NodeInclusionPolicy
}
func (tsc *topologySpreadConstraint) matchNodeInclusionPolicies(pod *v1.Pod, node *v1.Node, require nodeaffinity.RequiredNodeAffinity) bool {
if tsc.NodeAffinityPolicy == v1.NodeInclusionPolicyHonor {
// We ignore parsing errors here for backwards compatibility.
if match, _ := require.Match(node); !match {
return false
}
}
if tsc.NodeTaintsPolicy == v1.NodeInclusionPolicyHonor {
if _, untolerated := v1helper.FindMatchingUntoleratedTaint(node.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc()); untolerated {
return false
}
}
return true
}
// buildDefaultConstraints builds the constraints for a pod using
// .DefaultConstraints and the selectors from the services, replication
// controllers, replica sets and stateful sets that match the pod.
func (pl *PodTopologySpread) buildDefaultConstraints(p *v1.Pod, action v1.UnsatisfiableConstraintAction) ([]topologySpreadConstraint, error) {
constraints, err := pl.filterTopologySpreadConstraints(pl.defaultConstraints, p.Labels, action)
if err != nil || len(constraints) == 0 {
return nil, err
}
selector := helper.DefaultSelector(p, pl.services, pl.replicationCtrls, pl.replicaSets, pl.statefulSets)
if selector.Empty() {
return nil, nil
}
for i := range constraints {
constraints[i].Selector = selector
}
return constraints, nil
}
// nodeLabelsMatchSpreadConstraints checks if ALL topology keys in spread Constraints are present in node labels.
func nodeLabelsMatchSpreadConstraints(nodeLabels map[string]string, constraints []topologySpreadConstraint) bool {
for _, c := range constraints {
if _, ok := nodeLabels[c.TopologyKey]; !ok {
return false
}
}
return true
}
func (pl *PodTopologySpread) filterTopologySpreadConstraints(constraints []v1.TopologySpreadConstraint, podLabels map[string]string, action v1.UnsatisfiableConstraintAction) ([]topologySpreadConstraint, error) {
var result []topologySpreadConstraint
for _, c := range constraints {
if c.WhenUnsatisfiable == action {
selector, err := metav1.LabelSelectorAsSelector(c.LabelSelector)
if err != nil {
return nil, err
}
if pl.enableMatchLabelKeysInPodTopologySpread && len(c.MatchLabelKeys) > 0 {
matchLabels := make(labels.Set)
for _, labelKey := range c.MatchLabelKeys {
if value, ok := podLabels[labelKey]; ok {
matchLabels[labelKey] = value
}
}
if len(matchLabels) > 0 {
selector = mergeLabelSetWithSelector(matchLabels, selector)
}
}
tsc := topologySpreadConstraint{
MaxSkew: c.MaxSkew,
TopologyKey: c.TopologyKey,
Selector: selector,
MinDomains: ptr.Deref(c.MinDomains, 1), // If MinDomains is nil, we treat MinDomains as 1.
NodeAffinityPolicy: v1.NodeInclusionPolicyHonor, // If NodeAffinityPolicy is nil, we treat NodeAffinityPolicy as "Honor".
NodeTaintsPolicy: v1.NodeInclusionPolicyIgnore, // If NodeTaintsPolicy is nil, we treat NodeTaintsPolicy as "Ignore".
}
if pl.enableNodeInclusionPolicyInPodTopologySpread {
if c.NodeAffinityPolicy != nil {
tsc.NodeAffinityPolicy = *c.NodeAffinityPolicy
}
if c.NodeTaintsPolicy != nil {
tsc.NodeTaintsPolicy = *c.NodeTaintsPolicy
}
}
result = append(result, tsc)
}
}
return result, nil
}
func mergeLabelSetWithSelector(matchLabels labels.Set, s labels.Selector) labels.Selector {
mergedSelector := labels.SelectorFromSet(matchLabels)
requirements, ok := s.Requirements()
if !ok {
return s
}
for _, r := range requirements {
mergedSelector = mergedSelector.Add(r)
}
return mergedSelector
}
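
// Illustrative sketch (not part of the upstream file): a hypothetical
// walk-through of how labels gathered via matchLabelKeys feed into
// mergeLabelSetWithSelector above. The label keys and values are made up, and
// the sketch assumes it sits in this package so it can call the unexported helper.
func exampleMergeMatchLabelKeys() labels.Selector {
	// Selector parsed from the constraint's labelSelector.
	base, _ := labels.Parse("app=web")
	// Labels copied from the incoming Pod for the keys listed in matchLabelKeys.
	matchLabels := labels.Set{"pod-template-hash": "abc123"}
	// The merged selector now requires app=web AND pod-template-hash=abc123.
	return mergeLabelSetWithSelector(matchLabels, base)
}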
func countPodsMatchSelector(podInfos []*framework.PodInfo, selector labels.Selector, ns string) int {
if selector.Empty() {
return 0
}
count := 0
for _, p := range podInfos {
// Bypass terminating Pod (see #87621).
if p.Pod.DeletionTimestamp != nil || p.Pod.Namespace != ns {
continue
}
if selector.Matches(labels.Set(p.Pod.Labels)) {
count++
}
}
return count
}
// podLabelsMatchSpreadConstraints returns whether the labels match the selector of any of the topologySpreadConstraints.
func podLabelsMatchSpreadConstraints(constraints []topologySpreadConstraint, labels labels.Set) bool {
for _, c := range constraints {
if c.Selector.Matches(labels) {
return true
}
}
return false
}

View File

@ -0,0 +1,371 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtopologyspread
import (
"context"
"fmt"
"math"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
const preFilterStateKey = "PreFilter" + Name
// preFilterState computed at PreFilter and used at Filter.
// It combines TpKeyToCriticalPaths and TpPairToMatchNum to represent:
// (1) critical paths where the least pods are matched on each spread constraint.
// (2) number of pods matched on each spread constraint.
// A nil preFilterState denotes it's not set at all (in PreFilter phase);
// An empty preFilterState object denotes it's a legit state and is set in PreFilter phase.
// Fields are exported for comparison during testing.
type preFilterState struct {
Constraints []topologySpreadConstraint
// We record 2 critical paths instead of all critical paths here.
// criticalPaths[0].MatchNum always holds the minimum matching number.
// criticalPaths[1].MatchNum is always greater or equal to criticalPaths[0].MatchNum, but
// it's not guaranteed to be the 2nd minimum match number.
TpKeyToCriticalPaths map[string]*criticalPaths
// TpKeyToDomainsNum is keyed with topologyKey, and valued with the number of domains.
TpKeyToDomainsNum map[string]int
// TpPairToMatchNum is keyed with topologyPair, and valued with the number of matching pods.
TpPairToMatchNum map[topologyPair]int
}
// minMatchNum returns the global minimum for the calculation of skew while taking MinDomains into account.
func (s *preFilterState) minMatchNum(tpKey string, minDomains int32) (int, error) {
paths, ok := s.TpKeyToCriticalPaths[tpKey]
if !ok {
return 0, fmt.Errorf("failed to retrieve path by topology key")
}
minMatchNum := paths[0].MatchNum
domainsNum, ok := s.TpKeyToDomainsNum[tpKey]
if !ok {
return 0, fmt.Errorf("failed to retrieve the number of domains by topology key")
}
if domainsNum < int(minDomains) {
// When the number of eligible domains with matching topology keys is less than `minDomains`,
// it treats "global minimum" as 0.
minMatchNum = 0
}
return minMatchNum, nil
}
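
// Illustrative sketch (not part of the upstream file): the minDomains rule in
// minMatchNum above restated on plain ints, making the "treat the global
// minimum as 0" branch explicit. The function name is hypothetical.
func exampleMinMatchWithMinDomains(currentMin, domainsNum int, minDomains int32) int {
	if domainsNum < int(minDomains) {
		// Fewer eligible domains than minDomains: the global minimum is treated as 0.
		return 0
	}
	return currentMin
}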
// Clone makes a copy of the given state.
func (s *preFilterState) Clone() framework.StateData {
if s == nil {
return nil
}
copy := preFilterState{
// Constraints are shared because they don't change.
Constraints: s.Constraints,
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(s.TpKeyToCriticalPaths)),
// The number of domains does not change as a result of AddPod/RemovePod methods on PreFilter Extensions
TpKeyToDomainsNum: s.TpKeyToDomainsNum,
TpPairToMatchNum: make(map[topologyPair]int, len(s.TpPairToMatchNum)),
}
for tpKey, paths := range s.TpKeyToCriticalPaths {
copy.TpKeyToCriticalPaths[tpKey] = &criticalPaths{paths[0], paths[1]}
}
for tpPair, matchNum := range s.TpPairToMatchNum {
copy.TpPairToMatchNum[tpPair] = matchNum
}
return &copy
}
// CAVEAT: the reason that `[2]criticalPath` can work is based on the implementation of current
// preemption algorithm, in particular the following 2 facts:
// Fact 1: we only preempt pods on the same node, instead of pods on multiple nodes.
// Fact 2: each node is evaluated on a separate copy of the preFilterState during its preemption cycle.
// If we plan to turn to a more complex algorithm like "arbitrary pods on multiple nodes", this
// structure needs to be revisited.
// Fields are exported for comparison during testing.
type criticalPaths [2]struct {
// TopologyValue denotes the topology value mapping to topology key.
TopologyValue string
// MatchNum denotes the number of matching pods.
MatchNum int
}
func newCriticalPaths() *criticalPaths {
return &criticalPaths{{MatchNum: math.MaxInt32}, {MatchNum: math.MaxInt32}}
}
func (p *criticalPaths) update(tpVal string, num int) {
// first verify if `tpVal` exists or not
i := -1
if tpVal == p[0].TopologyValue {
i = 0
} else if tpVal == p[1].TopologyValue {
i = 1
}
if i >= 0 {
// `tpVal` exists
p[i].MatchNum = num
if p[0].MatchNum > p[1].MatchNum {
// swap paths[0] and paths[1]
p[0], p[1] = p[1], p[0]
}
} else {
// `tpVal` doesn't exist
if num < p[0].MatchNum {
// update paths[1] with paths[0]
p[1] = p[0]
// update paths[0]
p[0].TopologyValue, p[0].MatchNum = tpVal, num
} else if num < p[1].MatchNum {
// update paths[1]
p[1].TopologyValue, p[1].MatchNum = tpVal, num
}
}
}
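
// Illustrative sketch (not part of the upstream file): a hypothetical
// walk-through of update, assuming the sketch lives in this package since
// criticalPaths is unexported. After observing zone-a=3, zone-b=1 and
// zone-c=2, paths[0] holds the global minimum while paths[1] holds some value
// greater than or equal to it (not necessarily the true second minimum).
func exampleCriticalPathsUpdate() *criticalPaths {
	p := newCriticalPaths()
	p.update("zone-a", 3)
	p.update("zone-b", 1)
	p.update("zone-c", 2)
	// p[0] == {TopologyValue: "zone-b", MatchNum: 1}
	// p[1] == {TopologyValue: "zone-c", MatchNum: 2}
	return p
}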
// PreFilter invoked at the prefilter extension point.
func (pl *PodTopologySpread) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
s, err := pl.calPreFilterState(ctx, pod)
if err != nil {
return nil, framework.AsStatus(err)
} else if s != nil && len(s.Constraints) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cycleState.Write(preFilterStateKey, s)
return nil, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *PodTopologySpread) PreFilterExtensions() framework.PreFilterExtensions {
return pl
}
// AddPod from pre-computed data in cycleState.
func (pl *PodTopologySpread) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
pl.updateWithPod(s, podInfoToAdd.Pod, podToSchedule, nodeInfo.Node(), 1)
return nil
}
// RemovePod from pre-computed data in cycleState.
func (pl *PodTopologySpread) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
pl.updateWithPod(s, podInfoToRemove.Pod, podToSchedule, nodeInfo.Node(), -1)
return nil
}
func (pl *PodTopologySpread) updateWithPod(s *preFilterState, updatedPod, preemptorPod *v1.Pod, node *v1.Node, delta int) {
if s == nil || updatedPod.Namespace != preemptorPod.Namespace || node == nil {
return
}
if !nodeLabelsMatchSpreadConstraints(node.Labels, s.Constraints) {
return
}
requiredSchedulingTerm := nodeaffinity.GetRequiredNodeAffinity(preemptorPod)
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
// Spreading is applied only to nodes that pass the incoming Pod's NodeSelector/NodeAffinity.
// Ignore parsing errors for backwards compatibility.
if match, _ := requiredSchedulingTerm.Match(node); !match {
return
}
}
podLabelSet := labels.Set(updatedPod.Labels)
for _, constraint := range s.Constraints {
if !constraint.Selector.Matches(podLabelSet) {
continue
}
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
!constraint.matchNodeInclusionPolicies(preemptorPod, node, requiredSchedulingTerm) {
continue
}
k, v := constraint.TopologyKey, node.Labels[constraint.TopologyKey]
pair := topologyPair{key: k, value: v}
s.TpPairToMatchNum[pair] += delta
s.TpKeyToCriticalPaths[k].update(v, s.TpPairToMatchNum[pair])
}
}
// getPreFilterState fetches a pre-computed preFilterState.
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("reading %q from cycleState: %w", preFilterStateKey, err)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to podtopologyspread.preFilterState error", c)
}
return s, nil
}
// calPreFilterState computes preFilterState describing how pods are spread on topologies.
func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod) (*preFilterState, error) {
constraints, err := pl.getConstraints(pod)
if err != nil {
return nil, fmt.Errorf("get constraints from pod: %w", err)
}
if len(constraints) == 0 {
return &preFilterState{}, nil
}
allNodes, err := pl.sharedLister.NodeInfos().List()
if err != nil {
return nil, fmt.Errorf("listing NodeInfos: %w", err)
}
s := preFilterState{
Constraints: constraints,
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(constraints)),
TpPairToMatchNum: make(map[topologyPair]int, sizeHeuristic(len(allNodes), constraints)),
}
tpCountsByNode := make([]map[topologyPair]int, len(allNodes))
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
processNode := func(i int) {
nodeInfo := allNodes[i]
node := nodeInfo.Node()
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
// Spreading is applied only to nodes that pass the incoming Pod's NodeSelector/NodeAffinity.
// Ignore parsing errors for backwards compatibility.
if match, _ := requiredNodeAffinity.Match(node); !match {
return
}
}
// Ensure current node's labels contains all topologyKeys in 'Constraints'.
if !nodeLabelsMatchSpreadConstraints(node.Labels, constraints) {
return
}
tpCounts := make(map[topologyPair]int, len(constraints))
for _, c := range constraints {
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
continue
}
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
count := countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace)
tpCounts[pair] = count
}
tpCountsByNode[i] = tpCounts
}
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
for _, tpCounts := range tpCountsByNode {
for tp, count := range tpCounts {
s.TpPairToMatchNum[tp] += count
}
}
s.TpKeyToDomainsNum = make(map[string]int, len(constraints))
for tp := range s.TpPairToMatchNum {
s.TpKeyToDomainsNum[tp.key]++
}
// calculate min match for each topology pair
for i := 0; i < len(constraints); i++ {
key := constraints[i].TopologyKey
s.TpKeyToCriticalPaths[key] = newCriticalPaths()
}
for pair, num := range s.TpPairToMatchNum {
s.TpKeyToCriticalPaths[pair.key].update(pair.value, num)
}
return &s, nil
}
// Filter invoked at the filter extension point.
func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
node := nodeInfo.Node()
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
// However, "empty" preFilterState is legit which tolerates every toSchedule Pod.
if len(s.Constraints) == 0 {
return nil
}
logger := klog.FromContext(ctx)
podLabelSet := labels.Set(pod.Labels)
for _, c := range s.Constraints {
tpKey := c.TopologyKey
tpVal, ok := node.Labels[c.TopologyKey]
if !ok {
logger.V(5).Info("Node doesn't have required label", "node", klog.KObj(node), "label", tpKey)
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonNodeLabelNotMatch)
}
// judging criteria:
// 'existing matching num' + 'if self-match (1 or 0)' - 'global minimum' <= 'maxSkew'
minMatchNum, err := s.minMatchNum(tpKey, c.MinDomains)
if err != nil {
logger.Error(err, "Internal error occurred while retrieving value precalculated in PreFilter", "topologyKey", tpKey, "paths", s.TpKeyToCriticalPaths)
continue
}
selfMatchNum := 0
if c.Selector.Matches(podLabelSet) {
selfMatchNum = 1
}
pair := topologyPair{key: tpKey, value: tpVal}
matchNum := 0
if tpCount, ok := s.TpPairToMatchNum[pair]; ok {
matchNum = tpCount
}
skew := matchNum + selfMatchNum - minMatchNum
if skew > int(c.MaxSkew) {
logger.V(5).Info("Node failed spreadConstraint: matchNum + selfMatchNum - minMatchNum > maxSkew", "node", klog.KObj(node), "topologyKey", tpKey, "matchNum", matchNum, "selfMatchNum", selfMatchNum, "minMatchNum", minMatchNum, "maxSkew", c.MaxSkew)
return framework.NewStatus(framework.Unschedulable, ErrReasonConstraintsNotMatch)
}
}
return nil
}
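
// Illustrative sketch (not part of the upstream file): the judging criterion
// applied per constraint in Filter above, restated on plain ints. The
// function name is hypothetical.
func exampleSatisfiesMaxSkew(matchNum, selfMatchNum, minMatchNum int, maxSkew int32) bool {
	// 'existing matching num' + 'if self-match (1 or 0)' - 'global minimum' <= 'maxSkew'
	return matchNum+selfMatchNum-minMatchNum <= int(maxSkew)
}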
func sizeHeuristic(nodes int, constraints []topologySpreadConstraint) int {
for _, c := range constraints {
if c.TopologyKey == v1.LabelHostname {
return nodes
}
}
return 0
}

View File

@ -0,0 +1,351 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtopologyspread
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
appslisters "k8s.io/client-go/listers/apps/v1"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
const (
// ErrReasonConstraintsNotMatch is used for PodTopologySpread filter error.
ErrReasonConstraintsNotMatch = "node(s) didn't match pod topology spread constraints"
// ErrReasonNodeLabelNotMatch is used when the node doesn't hold the required label.
ErrReasonNodeLabelNotMatch = ErrReasonConstraintsNotMatch + " (missing required label)"
)
var systemDefaultConstraints = []v1.TopologySpreadConstraint{
{
TopologyKey: v1.LabelHostname,
WhenUnsatisfiable: v1.ScheduleAnyway,
MaxSkew: 3,
},
{
TopologyKey: v1.LabelTopologyZone,
WhenUnsatisfiable: v1.ScheduleAnyway,
MaxSkew: 5,
},
}
// PodTopologySpread is a plugin that ensures pod's topologySpreadConstraints is satisfied.
type PodTopologySpread struct {
systemDefaulted bool
parallelizer parallelize.Parallelizer
defaultConstraints []v1.TopologySpreadConstraint
sharedLister framework.SharedLister
services corelisters.ServiceLister
replicationCtrls corelisters.ReplicationControllerLister
replicaSets appslisters.ReplicaSetLister
statefulSets appslisters.StatefulSetLister
enableNodeInclusionPolicyInPodTopologySpread bool
enableMatchLabelKeysInPodTopologySpread bool
enableSchedulingQueueHint bool
}
var _ framework.PreFilterPlugin = &PodTopologySpread{}
var _ framework.FilterPlugin = &PodTopologySpread{}
var _ framework.PreScorePlugin = &PodTopologySpread{}
var _ framework.ScorePlugin = &PodTopologySpread{}
var _ framework.EnqueueExtensions = &PodTopologySpread{}
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.PodTopologySpread
// Name returns name of the plugin. It is used in logs, etc.
func (pl *PodTopologySpread) Name() string {
return Name
}
// New initializes a new plugin and returns it.
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
if h.SnapshotSharedLister() == nil {
return nil, fmt.Errorf("SnapshotSharedlister is nil")
}
args, err := getArgs(plArgs)
if err != nil {
return nil, err
}
if err := validation.ValidatePodTopologySpreadArgs(nil, &args); err != nil {
return nil, err
}
pl := &PodTopologySpread{
parallelizer: h.Parallelizer(),
sharedLister: h.SnapshotSharedLister(),
defaultConstraints: args.DefaultConstraints,
enableNodeInclusionPolicyInPodTopologySpread: fts.EnableNodeInclusionPolicyInPodTopologySpread,
enableMatchLabelKeysInPodTopologySpread: fts.EnableMatchLabelKeysInPodTopologySpread,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}
if args.DefaultingType == config.SystemDefaulting {
pl.defaultConstraints = systemDefaultConstraints
pl.systemDefaulted = true
}
if len(pl.defaultConstraints) != 0 {
if h.SharedInformerFactory() == nil {
return nil, fmt.Errorf("SharedInformerFactory is nil")
}
pl.setListers(h.SharedInformerFactory())
}
return pl, nil
}
func getArgs(obj runtime.Object) (config.PodTopologySpreadArgs, error) {
ptr, ok := obj.(*config.PodTopologySpreadArgs)
if !ok {
return config.PodTopologySpreadArgs{}, fmt.Errorf("want args to be of type PodTopologySpreadArgs, got %T", obj)
}
return *ptr, nil
}
func (pl *PodTopologySpread) setListers(factory informers.SharedInformerFactory) {
pl.services = factory.Core().V1().Services().Lister()
pl.replicationCtrls = factory.Core().V1().ReplicationControllers().Lister()
pl.replicaSets = factory.Apps().V1().ReplicaSets().Lister()
pl.statefulSets = factory.Apps().V1().StatefulSets().Lister()
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *PodTopologySpread) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
podActionType := framework.Add | framework.UpdatePodLabel | framework.Delete
if pl.enableSchedulingQueueHint {
// When the QueueingHint feature is enabled, the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
// (If not, the scheduling queue always retries the unschedulable Pods when they're updated.)
//
// The Pod rejected by this plugin can be schedulable when the Pod has a spread constraint with NodeTaintsPolicy:Honor
// and has got a new toleration.
// So, we add UpdatePodTolerations here only when QHint is enabled.
podActionType = framework.Add | framework.UpdatePodLabel | framework.UpdatePodTolerations | framework.Delete
}
return []framework.ClusterEventWithHint{
// All ActionType includes the following events:
// - Add. An unschedulable Pod may fail due to violating topology spread constraints,
// adding an assigned Pod may make it schedulable.
// - UpdatePodLabel. Updating on an existing Pod's labels (e.g., removal) may make
// an unschedulable Pod schedulable.
// - Delete. An unschedulable Pod may fail due to violating an existing Pod's topology spread constraints,
// deleting an existing Pod may make it schedulable.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: podActionType}, QueueingHintFn: pl.isSchedulableAfterPodChange},
// A Node add|delete|update may change a topology key,
// which can make the Pod being scheduled schedulable or unschedulable.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.Delete | framework.UpdateNodeLabel | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
// involvedInTopologySpreading returns true if the incomingPod is involved in the topology spreading of podWithSpreading.
func involvedInTopologySpreading(incomingPod, podWithSpreading *v1.Pod) bool {
return incomingPod.UID == podWithSpreading.UID ||
(incomingPod.Spec.NodeName != "" && incomingPod.Namespace == podWithSpreading.Namespace)
}
// hasConstraintWithNodeTaintsPolicyHonor returns true if any constraint has `NodeTaintsPolicy: Honor`.
func hasConstraintWithNodeTaintsPolicyHonor(constraints []topologySpreadConstraint) bool {
for _, c := range constraints {
if c.NodeTaintsPolicy == v1.NodeInclusionPolicyHonor {
return true
}
}
return false
}
func (pl *PodTopologySpread) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalPod, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if (modifiedPod != nil && !involvedInTopologySpreading(modifiedPod, pod)) || (originalPod != nil && !involvedInTopologySpreading(originalPod, pod)) {
logger.V(5).Info("the added/updated/deleted pod is unscheduled or has different namespace with target pod, so it doesn't make the target pod schedulable",
"pod", klog.KObj(pod), "originalPod", klog.KObj(originalPod))
return framework.QueueSkip, nil
}
constraints, err := pl.getConstraints(pod)
if err != nil {
return framework.Queue, err
}
// Pod is modified. Return Queue when the label(s) matching topologySpread's selector is added, changed, or deleted.
if modifiedPod != nil && originalPod != nil {
if pod.UID == modifiedPod.UID && !equality.Semantic.DeepEqual(modifiedPod.Spec.Tolerations, originalPod.Spec.Tolerations) && hasConstraintWithNodeTaintsPolicyHonor(constraints) {
// If any constraint has `NodeTaintsPolicy: Honor`, we can return Queue when the target Pod has got a new toleration.
logger.V(5).Info("the unschedulable pod has got a new toleration, which could make it schedulable",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
if equality.Semantic.DeepEqual(modifiedPod.Labels, originalPod.Labels) {
logger.V(5).Info("the pod's update doesn't include the label update, which doesn't make the target pod schedulable",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
for _, c := range constraints {
if c.Selector.Matches(labels.Set(originalPod.Labels)) != c.Selector.Matches(labels.Set(modifiedPod.Labels)) {
// This modification makes this Pod match(or not match) with this constraint.
// Maybe now the scheduling result of topology spread gets changed by this change.
logger.V(5).Info("a scheduled pod's label was updated and it makes the updated pod match or unmatch the pod's topology spread constraints",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
}
// This modification of labels doesn't change whether this Pod would match selector or not in any constraints.
logger.V(5).Info("a scheduled pod's label was updated, but it's a change unrelated to the pod's topology spread constraints",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// Pod is added. Return Queue when the added Pod has a label that matches with topologySpread's selector.
if modifiedPod != nil {
if podLabelsMatchSpreadConstraints(constraints, modifiedPod.Labels) {
logger.V(5).Info("a scheduled pod was created and it matches with the pod's topology spread constraints",
"pod", klog.KObj(pod), "createdPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a scheduled pod was created, but it doesn't matches with the pod's topology spread constraints",
"pod", klog.KObj(pod), "createdPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// Pod is deleted. Return Queue when the deleted Pod has a label that matches with topologySpread's selector.
if podLabelsMatchSpreadConstraints(constraints, originalPod.Labels) {
logger.V(5).Info("a scheduled pod which matches with the pod's topology spread constraints was deleted, and the pod may be schedulable now",
"pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.Queue, nil
}
logger.V(5).Info("a scheduled pod was deleted, but it's unrelated to the pod's topology spread constraints",
"pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.QueueSkip, nil
}
// getConstraints extracts topologySpreadConstraint(s) from the Pod spec.
// If the Pod doesn't have any topologySpreadConstraint, it returns default constraints.
func (pl *PodTopologySpread) getConstraints(pod *v1.Pod) ([]topologySpreadConstraint, error) {
var constraints []topologySpreadConstraint
var err error
if len(pod.Spec.TopologySpreadConstraints) > 0 {
// We have feature gating in APIServer to strip the spec
// so don't need to re-check feature gate, just check length of Constraints.
constraints, err = pl.filterTopologySpreadConstraints(
pod.Spec.TopologySpreadConstraints,
pod.Labels,
v1.DoNotSchedule,
)
if err != nil {
return nil, fmt.Errorf("obtaining pod's hard topology spread constraints: %w", err)
}
} else {
constraints, err = pl.buildDefaultConstraints(pod, v1.DoNotSchedule)
if err != nil {
return nil, fmt.Errorf("setting default hard topology spread constraints: %w", err)
}
}
return constraints, nil
}
// isSchedulableAfterNodeChange returns Queue when the node has a topologyKey in its labels; otherwise it returns QueueSkip.
func (pl *PodTopologySpread) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
constraints, err := pl.getConstraints(pod)
if err != nil {
return framework.Queue, err
}
var originalNodeMatching, modifiedNodeMatching bool
if originalNode != nil {
originalNodeMatching = nodeLabelsMatchSpreadConstraints(originalNode.Labels, constraints)
}
if modifiedNode != nil {
modifiedNodeMatching = nodeLabelsMatchSpreadConstraints(modifiedNode.Labels, constraints)
}
// We return Queue in the following cases:
// 1. Node/UpdateNodeLabel:
// - The original node matched the pod's topology spread constraints, but the modified node does not.
// - The modified node matches the pod's topology spread constraints, but the original node does not.
// - The modified node matches the pod's topology spread constraints, and the original node and the modified node have different label values for any topologyKey.
// 2. Node/UpdateNodeTaint:
// - The modified node matches the pod's topology spread constraints, and the original node and the modified node have different taints.
// 3. Node/Add: The created node matches the pod's topology spread constraints.
// 4. Node/Delete: The original node matched the pod's topology spread constraints.
if originalNode != nil && modifiedNode != nil {
if originalNodeMatching != modifiedNodeMatching {
logger.V(5).Info("the node is updated and now pod topology spread constraints has changed, and the pod may be schedulable now",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode), "originalMatching", originalNodeMatching, "newMatching", modifiedNodeMatching)
return framework.Queue, nil
}
if modifiedNodeMatching && (checkTopologyKeyLabelsChanged(originalNode.Labels, modifiedNode.Labels, constraints) || !equality.Semantic.DeepEqual(originalNode.Spec.Taints, modifiedNode.Spec.Taints)) {
logger.V(5).Info("the node is updated and now has different taints or labels, and the pod may be schedulable now",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
return framework.QueueSkip, nil
}
if modifiedNode != nil {
if !modifiedNodeMatching {
logger.V(5).Info("the created node doesn't match pod topology spread constraints",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("the created node matches topology spread constraints, and the pod may be schedulable now",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
if !originalNodeMatching {
logger.V(5).Info("the deleted node doesn't match pod topology spread constraints", "pod", klog.KObj(pod), "node", klog.KObj(originalNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("the deleted node matches topology spread constraints, and the pod may be schedulable now",
"pod", klog.KObj(pod), "node", klog.KObj(originalNode))
return framework.Queue, nil
}
// checkTopologyKeyLabelsChanged checks if any of the labels specified as topologyKey in the constraints have changed.
func checkTopologyKeyLabelsChanged(originalLabels, modifiedLabels map[string]string, constraints []topologySpreadConstraint) bool {
for _, constraint := range constraints {
topologyKey := constraint.TopologyKey
if originalLabels[topologyKey] != modifiedLabels[topologyKey] {
return true
}
}
return false
}
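
// Illustrative sketch (not part of the upstream file): a hypothetical use of
// checkTopologyKeyLabelsChanged, assuming the sketch sits in this package. A
// changed zone value for a constrained topologyKey reports true, which feeds
// the Queue decision in isSchedulableAfterNodeChange above.
func exampleTopologyKeyLabelChanged() bool {
	constraints := []topologySpreadConstraint{{TopologyKey: v1.LabelTopologyZone}}
	original := map[string]string{v1.LabelTopologyZone: "zone-a"}
	modified := map[string]string{v1.LabelTopologyZone: "zone-b"}
	return checkTopologyKeyLabelsChanged(original, modified, constraints) // true
}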

View File

@ -0,0 +1,305 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtopologyspread
import (
"context"
"fmt"
"math"
"sync/atomic"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
const preScoreStateKey = "PreScore" + Name
const invalidScore = -1
// preScoreState computed at PreScore and used at Score.
// Fields are exported for comparison during testing.
type preScoreState struct {
Constraints []topologySpreadConstraint
// IgnoredNodes is a set of node names which miss some Constraints[*].topologyKey.
IgnoredNodes sets.Set[string]
// TopologyPairToPodCounts is keyed with topologyPair, and valued with the number of matching pods.
TopologyPairToPodCounts map[topologyPair]*int64
// TopologyNormalizingWeight is the weight we give to the counts per topology.
// This allows the pod counts of smaller topologies to not be watered down by
// bigger ones.
TopologyNormalizingWeight []float64
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
// initPreScoreState iterates "filteredNodes" to filter out the nodes which
// don't have required topologyKey(s), and initialize:
// 1) s.TopologyPairToPodCounts: keyed with both eligible topology pair and node names.
// 2) s.IgnoredNodes: the set of nodes that shouldn't be scored.
// 3) s.TopologyNormalizingWeight: The weight to be given to each constraint based on the number of values in a topology.
func (pl *PodTopologySpread) initPreScoreState(s *preScoreState, pod *v1.Pod, filteredNodes []*framework.NodeInfo, requireAllTopologies bool) error {
var err error
if len(pod.Spec.TopologySpreadConstraints) > 0 {
s.Constraints, err = pl.filterTopologySpreadConstraints(
pod.Spec.TopologySpreadConstraints,
pod.Labels,
v1.ScheduleAnyway,
)
if err != nil {
return fmt.Errorf("obtaining pod's soft topology spread constraints: %w", err)
}
} else {
s.Constraints, err = pl.buildDefaultConstraints(pod, v1.ScheduleAnyway)
if err != nil {
return fmt.Errorf("setting default soft topology spread constraints: %w", err)
}
}
if len(s.Constraints) == 0 {
return nil
}
topoSize := make([]int, len(s.Constraints))
for _, node := range filteredNodes {
if requireAllTopologies && !nodeLabelsMatchSpreadConstraints(node.Node().Labels, s.Constraints) {
// Nodes which don't have all required topologyKeys present are ignored
// when scoring later.
s.IgnoredNodes.Insert(node.Node().Name)
continue
}
for i, constraint := range s.Constraints {
// per-node counts are calculated during Score.
if constraint.TopologyKey == v1.LabelHostname {
continue
}
pair := topologyPair{key: constraint.TopologyKey, value: node.Node().Labels[constraint.TopologyKey]}
if s.TopologyPairToPodCounts[pair] == nil {
s.TopologyPairToPodCounts[pair] = new(int64)
topoSize[i]++
}
}
}
s.TopologyNormalizingWeight = make([]float64, len(s.Constraints))
for i, c := range s.Constraints {
sz := topoSize[i]
if c.TopologyKey == v1.LabelHostname {
sz = len(filteredNodes) - len(s.IgnoredNodes)
}
s.TopologyNormalizingWeight[i] = topologyNormalizingWeight(sz)
}
return nil
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *PodTopologySpread) PreScore(
ctx context.Context,
cycleState *framework.CycleState,
pod *v1.Pod,
filteredNodes []*framework.NodeInfo,
) *framework.Status {
allNodes, err := pl.sharedLister.NodeInfos().List()
if err != nil {
return framework.AsStatus(fmt.Errorf("getting all nodes: %w", err))
}
if len(allNodes) == 0 {
// No need to score.
return framework.NewStatus(framework.Skip)
}
state := &preScoreState{
IgnoredNodes: sets.New[string](),
TopologyPairToPodCounts: make(map[topologyPair]*int64),
}
// Only require that nodes have all the topology labels if using
// non-system-default spreading rules. This allows nodes that don't have a
// zone label to still have hostname spreading.
requireAllTopologies := len(pod.Spec.TopologySpreadConstraints) > 0 || !pl.systemDefaulted
err = pl.initPreScoreState(state, pod, filteredNodes, requireAllTopologies)
if err != nil {
return framework.AsStatus(fmt.Errorf("calculating preScoreState: %w", err))
}
// return Skip if incoming pod doesn't have soft topology spread Constraints.
if len(state.Constraints) == 0 {
return framework.NewStatus(framework.Skip)
}
// Ignore parsing errors for backwards compatibility.
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
processAllNode := func(i int) {
nodeInfo := allNodes[i]
node := nodeInfo.Node()
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
// `node` should satisfy incoming pod's NodeSelector/NodeAffinity
if match, _ := requiredNodeAffinity.Match(node); !match {
return
}
}
// All topologyKeys need to be present in `node`
if requireAllTopologies && !nodeLabelsMatchSpreadConstraints(node.Labels, state.Constraints) {
return
}
for _, c := range state.Constraints {
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
continue
}
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
// If current topology pair is not associated with any candidate node,
// continue to avoid unnecessary calculation.
// Per-node counts are also skipped, as they are done during Score.
tpCount := state.TopologyPairToPodCounts[pair]
if tpCount == nil {
continue
}
count := countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace)
atomic.AddInt64(tpCount, int64(count))
}
}
pl.parallelizer.Until(ctx, len(allNodes), processAllNode, pl.Name())
cycleState.Write(preScoreStateKey, state)
return nil
}
// Score invoked at the Score extension point.
// The "score" returned in this function is the matching number of pods on the `nodeName`,
// it is normalized later.
func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
node := nodeInfo.Node()
s, err := getPreScoreState(cycleState)
if err != nil {
return 0, framework.AsStatus(err)
}
// Return if the node is not qualified.
if s.IgnoredNodes.Has(node.Name) {
return 0, nil
}
// For each present <pair>, current node gets a credit of <matchSum>.
// And we sum up <matchSum> and return it as this node's score.
var score float64
for i, c := range s.Constraints {
if tpVal, ok := node.Labels[c.TopologyKey]; ok {
var cnt int64
if c.TopologyKey == v1.LabelHostname {
cnt = int64(countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace))
} else {
pair := topologyPair{key: c.TopologyKey, value: tpVal}
cnt = *s.TopologyPairToPodCounts[pair]
}
score += scoreForCount(cnt, c.MaxSkew, s.TopologyNormalizingWeight[i])
}
}
return int64(math.Round(score)), nil
}
// NormalizeScore invoked after scoring all nodes.
func (pl *PodTopologySpread) NormalizeScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
s, err := getPreScoreState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
if s == nil {
return nil
}
// Calculate <minScore> and <maxScore>
var minScore int64 = math.MaxInt64
var maxScore int64
for i, score := range scores {
// it's mandatory to check if <score.Name> is present in s.IgnoredNodes
if s.IgnoredNodes.Has(score.Name) {
scores[i].Score = invalidScore
continue
}
if score.Score < minScore {
minScore = score.Score
}
if score.Score > maxScore {
maxScore = score.Score
}
}
for i := range scores {
if scores[i].Score == invalidScore {
scores[i].Score = 0
continue
}
if maxScore == 0 {
scores[i].Score = framework.MaxNodeScore
continue
}
s := scores[i].Score
scores[i].Score = framework.MaxNodeScore * (maxScore + minScore - s) / maxScore
}
return nil
}
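
// Illustrative sketch (not part of the upstream file): the per-node
// normalization applied above restated on plain int64 values, so lower raw
// scores (fewer matching pods) map to higher normalized scores. The function
// name is hypothetical.
func exampleNormalizeOne(raw, minScore, maxScore int64) int64 {
	if maxScore == 0 {
		return framework.MaxNodeScore
	}
	return framework.MaxNodeScore * (maxScore + minScore - raw) / maxScore
}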
// ScoreExtensions of the Score plugin.
func (pl *PodTopologySpread) ScoreExtensions() framework.ScoreExtensions {
return pl
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("error reading %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("%+v convert to podtopologyspread.preScoreState error", c)
}
return s, nil
}
// topologyNormalizingWeight calculates the weight for the topology, based on
// the number of values that exist for a topology.
// Since <size> is at least 1 (all nodes that passed the Filters are in the
// same topology), and k8s supports 5k nodes, the result is in the interval
// <1.09, 8.52>.
//
// Note: <size> could also be zero when no nodes have the required topologies,
// however we don't care about topology weight in this case as we return a 0
// score for all nodes.
func topologyNormalizingWeight(size int) float64 {
return math.Log(float64(size + 2))
}
// scoreForCount calculates the score based on number of matching pods in a
// topology domain, the constraint's maxSkew and the topology weight.
// `maxSkew-1` is added to the score so that differences between topology
// domains get watered down, controlling the tolerance of the score to skews.
func scoreForCount(cnt int64, maxSkew int32, tpWeight float64) float64 {
return float64(cnt)*tpWeight + float64(maxSkew-1)
}
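
// Illustrative sketch (not part of the upstream file): hypothetical arithmetic
// tying the two helpers above together. With 3 zone values the normalizing
// weight is log(3+2) ≈ 1.61, so a domain holding 4 matching pods under
// maxSkew=1 contributes roughly 4*1.61 + 0 ≈ 6.44 to the raw score.
func exampleZoneScore() float64 {
	w := topologyNormalizingWeight(3)
	return scoreForCount(4, 1, w)
}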

View File

@ -0,0 +1,53 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queuesort
import (
"context"
"k8s.io/apimachinery/pkg/runtime"
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.PrioritySort
// PrioritySort is a plugin that implements Priority based sorting.
type PrioritySort struct{}
var _ framework.QueueSortPlugin = &PrioritySort{}
// Name returns name of the plugin.
func (pl *PrioritySort) Name() string {
return Name
}
// Less is the function used by the activeQ heap algorithm to sort pods.
// It sorts pods based on their priority. When priorities are equal, it uses
// QueuedPodInfo.Timestamp.
func (pl *PrioritySort) Less(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
p1 := corev1helpers.PodPriority(pInfo1.Pod)
p2 := corev1helpers.PodPriority(pInfo2.Pod)
return (p1 > p2) || (p1 == p2 && pInfo1.Timestamp.Before(pInfo2.Timestamp))
}
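
// Illustrative sketch (not part of the upstream file): the ordering rule above
// restated on plain values, assuming "time" were imported here. Higher
// priority wins; on equal priorities the earlier enqueue timestamp wins.
func exampleLess(p1, p2 int32, t1, t2 time.Time) bool {
	return (p1 > p2) || (p1 == p2 && t1.Before(t2))
}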
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, handle framework.Handle) (framework.Plugin, error) {
return &PrioritySort{}, nil
}

View File

@ -0,0 +1,84 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package plugins
import (
"k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources"
plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/imagelocality"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone"
"k8s.io/kubernetes/pkg/scheduler/framework/runtime"
)
// NewInTreeRegistry builds the registry with all the in-tree plugins.
// A scheduler that runs out of tree plugins can register additional plugins
// through the WithFrameworkOutOfTreeRegistry option.
func NewInTreeRegistry() runtime.Registry {
fts := plfeature.Features{
EnableDRAAdminAccess: feature.DefaultFeatureGate.Enabled(features.DRAAdminAccess),
EnableDynamicResourceAllocation: feature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation),
EnableVolumeCapacityPriority: feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
EnableNodeInclusionPolicyInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.NodeInclusionPolicyInPodTopologySpread),
EnableMatchLabelKeysInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.MatchLabelKeysInPodTopologySpread),
EnableInPlacePodVerticalScaling: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
EnableSidecarContainers: feature.DefaultFeatureGate.Enabled(features.SidecarContainers),
EnableSchedulingQueueHint: feature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
EnableAsyncPreemption: feature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption),
EnablePodLevelResources: feature.DefaultFeatureGate.Enabled(features.PodLevelResources),
}
registry := runtime.Registry{
dynamicresources.Name: runtime.FactoryAdapter(fts, dynamicresources.New),
imagelocality.Name: imagelocality.New,
tainttoleration.Name: runtime.FactoryAdapter(fts, tainttoleration.New),
nodename.Name: runtime.FactoryAdapter(fts, nodename.New),
nodeports.Name: runtime.FactoryAdapter(fts, nodeports.New),
nodeaffinity.Name: runtime.FactoryAdapter(fts, nodeaffinity.New),
podtopologyspread.Name: runtime.FactoryAdapter(fts, podtopologyspread.New),
nodeunschedulable.Name: runtime.FactoryAdapter(fts, nodeunschedulable.New),
noderesources.Name: runtime.FactoryAdapter(fts, noderesources.NewFit),
noderesources.BalancedAllocationName: runtime.FactoryAdapter(fts, noderesources.NewBalancedAllocation),
volumebinding.Name: runtime.FactoryAdapter(fts, volumebinding.New),
volumerestrictions.Name: runtime.FactoryAdapter(fts, volumerestrictions.New),
volumezone.Name: runtime.FactoryAdapter(fts, volumezone.New),
nodevolumelimits.CSIName: runtime.FactoryAdapter(fts, nodevolumelimits.NewCSI),
interpodaffinity.Name: runtime.FactoryAdapter(fts, interpodaffinity.New),
queuesort.Name: queuesort.New,
defaultbinder.Name: defaultbinder.New,
defaultpreemption.Name: runtime.FactoryAdapter(fts, defaultpreemption.New),
schedulinggates.Name: runtime.FactoryAdapter(fts, schedulinggates.New),
}
return registry
}
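
// Illustrative sketch (not part of the upstream file): runtime.Registry is a
// map keyed by plugin name (as the literal above shows), so a quick sanity
// check that a factory was registered can look like this hypothetical helper.
func exampleHasPodTopologySpread() bool {
	registry := NewInTreeRegistry()
	_, ok := registry[podtopologyspread.Name]
	return ok
}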

View File

@ -0,0 +1,94 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package schedulinggates
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// Name of the plugin used in the plugin registry and configurations.
const Name = names.SchedulingGates
// SchedulingGates checks if a Pod carries .spec.schedulingGates.
type SchedulingGates struct {
enableSchedulingQueueHint bool
}
var _ framework.PreEnqueuePlugin = &SchedulingGates{}
var _ framework.EnqueueExtensions = &SchedulingGates{}
func (pl *SchedulingGates) Name() string {
return Name
}
func (pl *SchedulingGates) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
if len(p.Spec.SchedulingGates) == 0 {
return nil
}
gates := make([]string, 0, len(p.Spec.SchedulingGates))
for _, gate := range p.Spec.SchedulingGates {
gates = append(gates, gate.Name)
}
return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("waiting for scheduling gates: %v", gates))
}
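
// Illustrative sketch (not part of the upstream file): a hypothetical gated
// Pod is held back by PreEnqueue above with an UnschedulableAndUnresolvable
// status until its gates are removed. The gate name is made up.
func exampleGatedPodIsHeld() *framework.Status {
	pl := &SchedulingGates{}
	gated := &v1.Pod{Spec: v1.PodSpec{
		SchedulingGates: []v1.PodSchedulingGate{{Name: "example.com/provisioning"}},
	}}
	return pl.PreEnqueue(context.Background(), gated)
}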
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *SchedulingGates) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
if !pl.enableSchedulingQueueHint {
return nil, nil
}
// When the QueueingHint feature is enabled,
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
return []framework.ClusterEventWithHint{
// Pods can become schedulable once their gates are removed.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodSchedulingGatesEliminated}, QueueingHintFn: pl.isSchedulableAfterUpdatePodSchedulingGatesEliminated},
}, nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &SchedulingGates{
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}
func (pl *SchedulingGates) isSchedulableAfterUpdatePodSchedulingGatesEliminated(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if modifiedPod.UID != pod.UID {
// If the update event is not for targetPod, it wouldn't make targetPod schedulable.
return framework.QueueSkip, nil
}
return framework.Queue, nil
}

View File

@ -0,0 +1,236 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tainttoleration
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
v1helper "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// TaintToleration is a plugin that checks if a pod tolerates a node's taints.
type TaintToleration struct {
handle framework.Handle
enableSchedulingQueueHint bool
}
var _ framework.FilterPlugin = &TaintToleration{}
var _ framework.PreScorePlugin = &TaintToleration{}
var _ framework.ScorePlugin = &TaintToleration{}
var _ framework.EnqueueExtensions = &TaintToleration{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.TaintToleration
// preScoreStateKey is the key in CycleState to TaintToleration pre-computed data for Scoring.
preScoreStateKey = "PreScore" + Name
// ErrReasonNotMatch is the Filter reason status when not matching.
ErrReasonNotMatch = "node(s) had taints that the pod didn't tolerate"
)
// Name returns name of the plugin. It is used in logs, etc.
func (pl *TaintToleration) Name() string {
return Name
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *TaintToleration) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
if pl.enableSchedulingQueueHint {
return []framework.ClusterEventWithHint{
// When the QueueingHint feature is enabled, preCheck is eliminated and we don't need additional UpdateNodeLabel.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
// When the QueueingHint feature is enabled,
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
}, nil
}
return []framework.ClusterEventWithHint{
// A note about UpdateNodeLabel event:
// Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
// No need to register the Pod event; the update to the unschedulable Pods already triggers the scheduling retry when QHint is disabled.
}, nil
}
// isSchedulableAfterNodeChange is invoked for all node events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable.
func (pl *TaintToleration) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
wasUntolerated := true
if originalNode != nil {
_, wasUntolerated = v1helper.FindMatchingUntoleratedTaint(originalNode.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
}
_, isUntolerated := v1helper.FindMatchingUntoleratedTaint(modifiedNode.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
if wasUntolerated && !isUntolerated {
logger.V(5).Info("node was created or updated, and this may make the Pod rejected by TaintToleration plugin in the previous scheduling cycle schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
logger.V(5).Info("node was created or updated, but it doesn't change the TaintToleration plugin's decision", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// Filter invoked at the filter extension point.
func (pl *TaintToleration) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
node := nodeInfo.Node()
taint, isUntolerated := v1helper.FindMatchingUntoleratedTaint(node.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
if !isUntolerated {
return nil
}
errReason := fmt.Sprintf("node(s) had untolerated taint {%s: %s}", taint.Key, taint.Value)
return framework.NewStatus(framework.UnschedulableAndUnresolvable, errReason)
}
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
tolerationsPreferNoSchedule []v1.Toleration
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
// getAllTolerationPreferNoSchedule gets the list of all Tolerations with Effect PreferNoSchedule or with no effect.
func getAllTolerationPreferNoSchedule(tolerations []v1.Toleration) (tolerationList []v1.Toleration) {
for _, toleration := range tolerations {
// Empty effect means all effects which includes PreferNoSchedule, so we need to collect it as well.
if len(toleration.Effect) == 0 || toleration.Effect == v1.TaintEffectPreferNoSchedule {
tolerationList = append(tolerationList, toleration)
}
}
return
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *TaintToleration) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
if len(nodes) == 0 {
return nil
}
tolerationsPreferNoSchedule := getAllTolerationPreferNoSchedule(pod.Spec.Tolerations)
state := &preScoreState{
tolerationsPreferNoSchedule: tolerationsPreferNoSchedule,
}
cycleState.Write(preScoreStateKey, state)
return nil
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("failed to read %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("%+v convert to tainttoleration.preScoreState error", c)
}
return s, nil
}
// countIntolerableTaintsPreferNoSchedule gives the count of intolerable taints of a pod with effect PreferNoSchedule.
func countIntolerableTaintsPreferNoSchedule(taints []v1.Taint, tolerations []v1.Toleration) (intolerableTaints int) {
for _, taint := range taints {
// check only on taints that have effect PreferNoSchedule
if taint.Effect != v1.TaintEffectPreferNoSchedule {
continue
}
if !v1helper.TolerationsTolerateTaint(tolerations, &taint) {
intolerableTaints++
}
}
return
}
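
// Illustrative sketch (not part of the upstream file): a hypothetical count.
// One PreferNoSchedule taint with no matching toleration yields 1, which then
// becomes the node's raw score in Score below.
func exampleIntolerableCount() int {
	taints := []v1.Taint{{
		Key:    "example.com/degraded",
		Effect: v1.TaintEffectPreferNoSchedule,
	}}
	return countIntolerableTaintsPreferNoSchedule(taints, nil /* no tolerations */)
}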
// Score invoked at the Score extension point.
func (pl *TaintToleration) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
node := nodeInfo.Node()
s, err := getPreScoreState(state)
if err != nil {
return 0, framework.AsStatus(err)
}
score := int64(countIntolerableTaintsPreferNoSchedule(node.Spec.Taints, s.tolerationsPreferNoSchedule))
return score, nil
}
// NormalizeScore invoked after scoring all nodes.
func (pl *TaintToleration) NormalizeScore(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
return helper.DefaultNormalizeScore(framework.MaxNodeScore, true, scores)
}
// ScoreExtensions of the Score plugin.
func (pl *TaintToleration) ScoreExtensions() framework.ScoreExtensions {
return pl
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &TaintToleration{
handle: h,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}
// isSchedulableAfterPodTolerationChange is invoked whenever a pod's tolerations are changed.
func (pl *TaintToleration) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if pod.UID == modifiedPod.UID {
// The updated Pod is the unschedulable Pod.
logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-storage-approvers
- cofyc
reviewers:
- sig-storage-reviewers
- cofyc
labels:
- sig/storage

View File

@ -0,0 +1,131 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"fmt"
v1 "k8s.io/api/core/v1"
storagehelpers "k8s.io/component-helpers/storage/volume"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
)
// PVAssumeCache is an AssumeCache for PersistentVolume objects
type PVAssumeCache struct {
*assumecache.AssumeCache
logger klog.Logger
}
func pvStorageClassIndexFunc(obj interface{}) ([]string, error) {
if pv, ok := obj.(*v1.PersistentVolume); ok {
return []string{storagehelpers.GetPersistentVolumeClass(pv)}, nil
}
return []string{""}, fmt.Errorf("object is not a v1.PersistentVolume: %v", obj)
}
// NewPVAssumeCache creates a PV assume cache.
func NewPVAssumeCache(logger klog.Logger, informer assumecache.Informer) *PVAssumeCache {
logger = klog.LoggerWithName(logger, "PV Cache")
return &PVAssumeCache{
AssumeCache: assumecache.NewAssumeCache(logger, informer, "v1.PersistentVolume", "storageclass", pvStorageClassIndexFunc),
logger: logger,
}
}
func (c *PVAssumeCache) GetPV(pvName string) (*v1.PersistentVolume, error) {
obj, err := c.Get(pvName)
if err != nil {
return nil, err
}
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}
}
return pv, nil
}
func (c *PVAssumeCache) GetAPIPV(pvName string) (*v1.PersistentVolume, error) {
obj, err := c.GetAPIObj(pvName)
if err != nil {
return nil, err
}
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}
}
return pv, nil
}
func (c *PVAssumeCache) ListPVs(storageClassName string) []*v1.PersistentVolume {
objs := c.List(&v1.PersistentVolume{
Spec: v1.PersistentVolumeSpec{
StorageClassName: storageClassName,
},
})
pvs := []*v1.PersistentVolume{}
for _, obj := range objs {
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
c.logger.Error(&assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}, "ListPVs")
continue
}
pvs = append(pvs, pv)
}
return pvs
}
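// Usage sketch (illustrative; the informer wiring and names here are assumptions, not taken from this file):
//
//	cache := NewPVAssumeCache(logger, pvInformer.Informer())
//	fastPVs := cache.ListPVs("fast-ssd") // all cached PVs whose storage class is "fast-ssd"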
// PVCAssumeCache is an AssumeCache for PersistentVolumeClaim objects
type PVCAssumeCache struct {
*assumecache.AssumeCache
logger klog.Logger
}
// NewPVCAssumeCache creates a PVC assume cache.
func NewPVCAssumeCache(logger klog.Logger, informer assumecache.Informer) *PVCAssumeCache {
logger = klog.LoggerWithName(logger, "PVC Cache")
return &PVCAssumeCache{
AssumeCache: assumecache.NewAssumeCache(logger, informer, "v1.PersistentVolumeClaim", "", nil),
logger: logger,
}
}
func (c *PVCAssumeCache) GetPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
obj, err := c.Get(pvcKey)
if err != nil {
return nil, err
}
pvc, ok := obj.(*v1.PersistentVolumeClaim)
if !ok {
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolumeClaim", Object: obj}
}
return pvc, nil
}
func (c *PVCAssumeCache) GetAPIPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
obj, err := c.GetAPIObj(pvcKey)
if err != nil {
return nil, err
}
pvc, ok := obj.(*v1.PersistentVolumeClaim)
if !ok {
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolumeClaim", Object: obj}
}
return pvc, nil
}

File diff suppressed because it is too large

View File

@ -0,0 +1,75 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
)
// FakeVolumeBinderConfig holds configurations for fake volume binder.
type FakeVolumeBinderConfig struct {
AllBound bool
FindReasons ConflictReasons
FindErr error
AssumeErr error
BindErr error
}
// NewFakeVolumeBinder creates a FakeVolumeBinder with the given config for testing.
func NewFakeVolumeBinder(config *FakeVolumeBinderConfig) *FakeVolumeBinder {
return &FakeVolumeBinder{
config: config,
}
}
// FakeVolumeBinder represents a fake volume binder for testing.
type FakeVolumeBinder struct {
config *FakeVolumeBinderConfig
AssumeCalled bool
BindCalled bool
}
var _ SchedulerVolumeBinder = &FakeVolumeBinder{}
// GetPodVolumeClaims implements SchedulerVolumeBinder.GetPodVolumeClaims.
func (b *FakeVolumeBinder) GetPodVolumeClaims(_ klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error) {
return &PodVolumeClaims{}, nil
}
// FindPodVolumes implements SchedulerVolumeBinder.FindPodVolumes.
func (b *FakeVolumeBinder) FindPodVolumes(_ klog.Logger, pod *v1.Pod, _ *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) {
return nil, b.config.FindReasons, b.config.FindErr
}
// AssumePodVolumes implements SchedulerVolumeBinder.AssumePodVolumes.
func (b *FakeVolumeBinder) AssumePodVolumes(_ klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (bool, error) {
b.AssumeCalled = true
return b.config.AllBound, b.config.AssumeErr
}
// RevertAssumedPodVolumes implements SchedulerVolumeBinder.RevertAssumedPodVolumes
func (b *FakeVolumeBinder) RevertAssumedPodVolumes(_ *PodVolumes) {}
// BindPodVolumes implements SchedulerVolumeBinder.BindPodVolumes.
func (b *FakeVolumeBinder) BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) error {
b.BindCalled = true
return b.config.BindErr
}

View File

@ -0,0 +1,55 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
// VolumeSchedulerSubsystem - subsystem name used by scheduler
const VolumeSchedulerSubsystem = "scheduler_volume"
var (
// VolumeBindingRequestSchedulerBinderCache tracks the number of volume binder cache operations.
VolumeBindingRequestSchedulerBinderCache = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "binder_cache_requests_total",
Help: "Total number for request volume binding cache",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
// VolumeSchedulingStageFailed tracks the number of failed volume scheduling operations.
VolumeSchedulingStageFailed = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "scheduling_stage_error_total",
Help: "Volume scheduling stage error count",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
)
// RegisterVolumeSchedulingMetrics is used by the scheduler, because the volume binding cache is a library
// used by the scheduler process.
func RegisterVolumeSchedulingMetrics() {
legacyregistry.MustRegister(VolumeBindingRequestSchedulerBinderCache)
legacyregistry.MustRegister(VolumeSchedulingStageFailed)
}

View File

@ -0,0 +1,54 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"math"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
)
// classResourceMap holds a map of storage class to resource.
type classResourceMap map[string]*StorageResource
// volumeCapacityScorer calculates the score based on class storage resource information.
type volumeCapacityScorer func(classResourceMap) int64
// buildScorerFunction builds volumeCapacityScorer from the scoring function shape.
func buildScorerFunction(scoringFunctionShape helper.FunctionShape) volumeCapacityScorer {
rawScoringFunction := helper.BuildBrokenLinearFunction(scoringFunctionShape)
f := func(requested, capacity int64) int64 {
if capacity == 0 || requested > capacity {
return rawScoringFunction(maxUtilization)
}
return rawScoringFunction(requested * maxUtilization / capacity)
}
return func(classResources classResourceMap) int64 {
var nodeScore int64
// in alpha stage, all classes have the same weight
weightSum := len(classResources)
if weightSum == 0 {
return 0
}
for _, resource := range classResources {
classScore := f(resource.Requested, resource.Capacity)
nodeScore += classScore
}
return int64(math.Round(float64(nodeScore) / float64(weightSum)))
}
}
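// Worked example (illustrative): with maxUtilization = 100, a class with
// Requested = 30 and Capacity = 100 contributes rawScoringFunction(30), while a
// class with Capacity = 0 or Requested > Capacity contributes
// rawScoringFunction(maxUtilization). The node score is the per-class average,
// rounded to the nearest integer.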

View File

@ -0,0 +1,217 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/component-helpers/storage/volume"
"k8s.io/utils/ptr"
)
type nodeBuilder struct {
*v1.Node
}
func makeNode(name string) nodeBuilder {
return nodeBuilder{Node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{
v1.LabelHostname: name,
},
},
}}
}
func (nb nodeBuilder) withLabel(key, value string) nodeBuilder {
if nb.Node.ObjectMeta.Labels == nil {
nb.Node.ObjectMeta.Labels = map[string]string{}
}
nb.Node.ObjectMeta.Labels[key] = value
return nb
}
type pvBuilder struct {
*v1.PersistentVolume
}
func makePV(name, className string) pvBuilder {
return pvBuilder{PersistentVolume: &v1.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
Spec: v1.PersistentVolumeSpec{
StorageClassName: className,
},
}}
}
func (pvb pvBuilder) withNodeAffinity(keyValues map[string][]string) pvBuilder {
matchExpressions := make([]v1.NodeSelectorRequirement, 0)
for key, values := range keyValues {
matchExpressions = append(matchExpressions, v1.NodeSelectorRequirement{
Key: key,
Operator: v1.NodeSelectorOpIn,
Values: values,
})
}
pvb.PersistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: matchExpressions,
},
},
},
}
return pvb
}
func (pvb pvBuilder) withVersion(version string) pvBuilder {
pvb.PersistentVolume.ObjectMeta.ResourceVersion = version
return pvb
}
func (pvb pvBuilder) withCapacity(capacity resource.Quantity) pvBuilder {
pvb.PersistentVolume.Spec.Capacity = v1.ResourceList{
v1.ResourceName(v1.ResourceStorage): capacity,
}
return pvb
}
func (pvb pvBuilder) withPhase(phase v1.PersistentVolumePhase) pvBuilder {
pvb.PersistentVolume.Status = v1.PersistentVolumeStatus{
Phase: phase,
}
return pvb
}
type pvcBuilder struct {
*v1.PersistentVolumeClaim
}
func makePVC(name string, storageClassName string) pvcBuilder {
return pvcBuilder{PersistentVolumeClaim: &v1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: v1.NamespaceDefault,
},
Spec: v1.PersistentVolumeClaimSpec{
StorageClassName: ptr.To(storageClassName),
},
}}
}
func (pvcb pvcBuilder) withBoundPV(pvName string) pvcBuilder {
pvcb.PersistentVolumeClaim.Spec.VolumeName = pvName
metav1.SetMetaDataAnnotation(&pvcb.PersistentVolumeClaim.ObjectMeta, volume.AnnBindCompleted, "true")
return pvcb
}
func (pvcb pvcBuilder) withRequestStorage(request resource.Quantity) pvcBuilder {
pvcb.PersistentVolumeClaim.Spec.Resources = v1.VolumeResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceName(v1.ResourceStorage): request,
},
}
return pvcb
}
func (pvcb pvcBuilder) withPhase(phase v1.PersistentVolumeClaimPhase) pvcBuilder {
pvcb.PersistentVolumeClaim.Status = v1.PersistentVolumeClaimStatus{
Phase: phase,
}
return pvcb
}
type podBuilder struct {
*v1.Pod
}
func makePod(name string) podBuilder {
pb := podBuilder{Pod: &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: v1.NamespaceDefault,
},
}}
pb.Pod.Spec.Volumes = make([]v1.Volume, 0)
return pb
}
func (pb podBuilder) withNodeName(name string) podBuilder {
pb.Pod.Spec.NodeName = name
return pb
}
func (pb podBuilder) withNamespace(name string) podBuilder {
pb.Pod.ObjectMeta.Namespace = name
return pb
}
func (pb podBuilder) withPVCVolume(pvcName, name string) podBuilder {
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
Name: name,
VolumeSource: v1.VolumeSource{
PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
ClaimName: pvcName,
},
},
})
return pb
}
func (pb podBuilder) withPVCSVolume(pvcs []*v1.PersistentVolumeClaim) podBuilder {
for i, pvc := range pvcs {
pb.withPVCVolume(pvc.Name, fmt.Sprintf("vol%v", i))
}
return pb
}
func (pb podBuilder) withEmptyDirVolume() podBuilder {
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{},
},
})
return pb
}
func (pb podBuilder) withGenericEphemeralVolume(name string) podBuilder {
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
Name: name,
VolumeSource: v1.VolumeSource{
Ephemeral: &v1.EphemeralVolumeSource{},
},
})
return pb
}
func (pb podBuilder) withCSI(driver string) podBuilder {
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
VolumeSource: v1.VolumeSource{
CSI: &v1.CSIVolumeSource{
Driver: driver,
},
},
})
return pb
}
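// Usage sketch (illustrative, test-only): the builders are meant to be chained, e.g.
//
//	pvc := makePVC("pvc-a", "fast").withRequestStorage(resource.MustParse("10Gi")).PersistentVolumeClaim
//	pod := makePod("pod-a").withPVCVolume("pvc-a", "vol-a").Pod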

View File

@ -0,0 +1,602 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"context"
"errors"
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/component-helpers/storage/ephemeral"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
const (
stateKey framework.StateKey = Name
maxUtilization = 100
)
// The state is initialized in the PreFilter phase. Because we save the pointer in
// framework.CycleState, later phases don't need to call the Write method to
// update the value.
type stateData struct {
allBound bool
// podVolumesByNode holds the pod's volume information found in the Filter
// phase for each node
// it's initialized in the PreFilter phase
podVolumesByNode map[string]*PodVolumes
podVolumeClaims *PodVolumeClaims
// hasStaticBindings declares whether the pod contains one or more StaticBinding.
// If not, volumeBinding will skip the score extension point.
hasStaticBindings bool
sync.Mutex
}
func (d *stateData) Clone() framework.StateData {
return d
}
// VolumeBinding is a plugin that binds pod volumes in scheduling.
// In the Filter phase, pod binding cache is created for the pod and used in
// Reserve and PreBind phases.
type VolumeBinding struct {
Binder SchedulerVolumeBinder
PVCLister corelisters.PersistentVolumeClaimLister
scorer volumeCapacityScorer
fts feature.Features
}
var _ framework.PreFilterPlugin = &VolumeBinding{}
var _ framework.FilterPlugin = &VolumeBinding{}
var _ framework.ReservePlugin = &VolumeBinding{}
var _ framework.PreBindPlugin = &VolumeBinding{}
var _ framework.PreScorePlugin = &VolumeBinding{}
var _ framework.ScorePlugin = &VolumeBinding{}
var _ framework.EnqueueExtensions = &VolumeBinding{}
// Name is the name of the plugin used in Registry and configurations.
const Name = names.VolumeBinding
// Name returns name of the plugin. It is used in logs, etc.
func (pl *VolumeBinding) Name() string {
return Name
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *VolumeBinding) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// Pods may fail to find available PVs because the node labels do not
// match the storage class's allowed topologies or PV's node affinity.
// A new or updated node may make pods schedulable.
//
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.fts.EnableSchedulingQueueHint {
// When scheduling queue hint is enabled, we don't use the problematic preCheck and don't need to register UpdateNodeTaint event.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
events := []framework.ClusterEventWithHint{
// Pods may fail because of missing or mis-configured storage class
// (e.g., allowedTopologies, volumeBindingMode), and hence may become
// schedulable upon StorageClass Add or Update events.
{Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterStorageClassChange},
// We bind PVCs with PVs, so any changes may make the pods schedulable.
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimChange},
{Event: framework.ClusterEvent{Resource: framework.PersistentVolume, ActionType: framework.Add | framework.Update}},
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
// We rely on CSI node to translate in-tree PV to CSI.
// TODO: kube-scheduler will unregister the CSINode events once all the volume plugins have completed their CSI migration.
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSINodeChange},
// When CSIStorageCapacity is enabled, pods may become schedulable
// on CSI driver & storage capacity changes.
{Event: framework.ClusterEvent{Resource: framework.CSIDriver, ActionType: framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSIDriverChange},
{Event: framework.ClusterEvent{Resource: framework.CSIStorageCapacity, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSIStorageCapacityChange},
}
return events, nil
}
func (pl *VolumeBinding) isSchedulableAfterCSINodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
if oldObj == nil {
logger.V(5).Info("CSINode creation could make the pod schedulable")
return framework.Queue, nil
}
oldCSINode, modifiedCSINode, err := util.As[*storagev1.CSINode](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"CSINode", klog.KObj(modifiedCSINode),
)
if oldCSINode.ObjectMeta.Annotations[v1.MigratedPluginsAnnotationKey] != modifiedCSINode.ObjectMeta.Annotations[v1.MigratedPluginsAnnotationKey] {
logger.V(5).Info("CSINode's migrated plugins annotation is updated and that may make the pod schedulable")
return framework.Queue, nil
}
logger.V(5).Info("CISNode was created or updated but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
func (pl *VolumeBinding) isSchedulableAfterPersistentVolumeClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, newPVC, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"PersistentVolumeClaim", klog.KObj(newPVC),
)
if pod.Namespace != newPVC.Namespace {
logger.V(5).Info("PersistentVolumeClaim was created or updated, but it doesn't make this pod schedulable because the PVC belongs to a different namespace")
return framework.QueueSkip, nil
}
for _, vol := range pod.Spec.Volumes {
var pvcName string
switch {
case vol.PersistentVolumeClaim != nil:
pvcName = vol.PersistentVolumeClaim.ClaimName
case vol.Ephemeral != nil:
pvcName = ephemeral.VolumeClaimName(pod, &vol)
default:
continue
}
if pvcName == newPVC.Name {
// Return Queue because, in this case,
// all PVC creations and almost all PVC updates could make the Pod schedulable.
logger.V(5).Info("PersistentVolumeClaim the pod requires was created or updated, potentially making the target Pod schedulable")
return framework.Queue, nil
}
}
logger.V(5).Info("PersistentVolumeClaim was created or updated, but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
// isSchedulableAfterStorageClassChange checks whether a StorageClass event might make a Pod schedulable or not.
// Any StorageClass addition and a StorageClass update to allowedTopologies
// might make a Pod schedulable.
// Note that an update to the volume binding mode is not allowed, so we don't have to consider it while examining the update event.
func (pl *VolumeBinding) isSchedulableAfterStorageClassChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
oldSC, newSC, err := util.As[*storagev1.StorageClass](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"StorageClass", klog.KObj(newSC),
)
if oldSC == nil {
// No further filtering can be made for a creation event,
// and we just always return Queue.
logger.V(5).Info("A new StorageClass was created, which could make a Pod schedulable")
return framework.Queue, nil
}
if !apiequality.Semantic.DeepEqual(newSC.AllowedTopologies, oldSC.AllowedTopologies) {
logger.V(5).Info("StorageClass got an update in AllowedTopologies", "AllowedTopologies", newSC.AllowedTopologies)
return framework.Queue, nil
}
logger.V(5).Info("StorageClass was updated, but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
// isSchedulableAfterCSIStorageCapacityChange checks whether a CSIStorageCapacity event
// might make a Pod schedulable or not.
// Any CSIStorageCapacity addition and a CSIStorageCapacity update to volume limit
// (calculated based on capacity and maximumVolumeSize) might make a Pod schedulable.
// Note that updates to nodeTopology and storageClassName are not allowed, so
// we don't have to consider them while examining the update event.
func (pl *VolumeBinding) isSchedulableAfterCSIStorageCapacityChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
oldCap, newCap, err := util.As[*storagev1.CSIStorageCapacity](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if oldCap == nil {
logger.V(5).Info(
"A new CSIStorageCapacity was created, which could make a Pod schedulable",
"Pod", klog.KObj(pod),
"CSIStorageCapacity", klog.KObj(newCap),
)
return framework.Queue, nil
}
oldLimit := volumeLimit(oldCap)
newLimit := volumeLimit(newCap)
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"CSIStorageCapacity", klog.KObj(newCap),
"volumeLimit(new)", newLimit,
"volumeLimit(old)", oldLimit,
)
if newLimit != nil && (oldLimit == nil || newLimit.Value() > oldLimit.Value()) {
logger.V(5).Info("VolumeLimit was increased, which could make a Pod schedulable")
return framework.Queue, nil
}
logger.V(5).Info("CSIStorageCapacity was updated, but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
func (pl *VolumeBinding) isSchedulableAfterCSIDriverChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalCSIDriver, modifiedCSIDriver, err := util.As[*storagev1.CSIDriver](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"CSIDriver", klog.KObj(modifiedCSIDriver),
)
for _, vol := range pod.Spec.Volumes {
if vol.CSI == nil || vol.CSI.Driver != modifiedCSIDriver.Name {
continue
}
if (originalCSIDriver.Spec.StorageCapacity != nil && *originalCSIDriver.Spec.StorageCapacity) &&
(modifiedCSIDriver.Spec.StorageCapacity == nil || !*modifiedCSIDriver.Spec.StorageCapacity) {
logger.V(5).Info("CSIDriver was updated and storage capacity got disabled, which may make the pod schedulable")
return framework.Queue, nil
}
}
logger.V(5).Info("CSIDriver was created or updated but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
// podHasPVCs returns 2 values:
// - the first one to denote if the given "pod" has any PVC defined.
// - the second one to return any error if the requested PVC is illegal.
func (pl *VolumeBinding) podHasPVCs(pod *v1.Pod) (bool, error) {
hasPVC := false
for _, vol := range pod.Spec.Volumes {
var pvcName string
isEphemeral := false
switch {
case vol.PersistentVolumeClaim != nil:
pvcName = vol.PersistentVolumeClaim.ClaimName
case vol.Ephemeral != nil:
pvcName = ephemeral.VolumeClaimName(pod, &vol)
isEphemeral = true
default:
// Volume is not using a PVC, ignore
continue
}
hasPVC = true
pvc, err := pl.PVCLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
if err != nil {
// The error usually already has enough context ("persistentvolumeclaim "myclaim" not found"),
// but we can do better for generic ephemeral inline volumes where that situation
// is normal directly after creating a pod.
if isEphemeral && apierrors.IsNotFound(err) {
err = fmt.Errorf("waiting for ephemeral volume controller to create the persistentvolumeclaim %q", pvcName)
}
return hasPVC, err
}
if pvc.Status.Phase == v1.ClaimLost {
return hasPVC, fmt.Errorf("persistentvolumeclaim %q bound to non-existent persistentvolume %q", pvc.Name, pvc.Spec.VolumeName)
}
if pvc.DeletionTimestamp != nil {
return hasPVC, fmt.Errorf("persistentvolumeclaim %q is being deleted", pvc.Name)
}
if isEphemeral {
if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
return hasPVC, err
}
}
}
return hasPVC, nil
}
// PreFilter invoked at the prefilter extension point to check if pod has all
// immediate PVCs bound. If not all immediate PVCs are bound, an
// UnschedulableAndUnresolvable is returned.
func (pl *VolumeBinding) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
logger := klog.FromContext(ctx)
// If pod does not reference any PVC, we don't need to do anything.
if hasPVC, err := pl.podHasPVCs(pod); err != nil {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
} else if !hasPVC {
state.Write(stateKey, &stateData{})
return nil, framework.NewStatus(framework.Skip)
}
podVolumeClaims, err := pl.Binder.GetPodVolumeClaims(logger, pod)
if err != nil {
return nil, framework.AsStatus(err)
}
if len(podVolumeClaims.unboundClaimsImmediate) > 0 {
// Return UnschedulableAndUnresolvable error if immediate claims are
// not bound. Pod will be moved to active/backoff queues once these
// claims are bound by PV controller.
status := framework.NewStatus(framework.UnschedulableAndUnresolvable)
status.AppendReason("pod has unbound immediate PersistentVolumeClaims")
return nil, status
}
state.Write(stateKey, &stateData{
podVolumesByNode: make(map[string]*PodVolumes),
podVolumeClaims: &PodVolumeClaims{
boundClaims: podVolumeClaims.boundClaims,
unboundClaimsDelayBinding: podVolumeClaims.unboundClaimsDelayBinding,
unboundVolumesDelayBinding: podVolumeClaims.unboundVolumesDelayBinding,
},
})
return nil, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove. VolumeBinding does not use them, so it returns nil.
func (pl *VolumeBinding) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
func getStateData(cs *framework.CycleState) (*stateData, error) {
state, err := cs.Read(stateKey)
if err != nil {
return nil, err
}
s, ok := state.(*stateData)
if !ok {
return nil, errors.New("unable to convert state into stateData")
}
return s, nil
}
// Filter invoked at the filter extension point.
// It evaluates if a pod can fit due to the volumes it requests,
// for both bound and unbound PVCs.
//
// For PVCs that are bound, it checks that the corresponding PV's node affinity is
// satisfied by the given node.
//
// For PVCs that are unbound, it tries to find available PVs that can satisfy the PVC requirements
// and whose node affinity is satisfied by the given node.
//
// If storage capacity tracking is enabled, then enough space has to be available
// for the node and volumes that still need to be created.
//
// The predicate returns true if all bound PVCs have PVs compatible with the node, and if all unbound
// PVCs can be matched with an available and node-compatible PV.
func (pl *VolumeBinding) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
logger := klog.FromContext(ctx)
node := nodeInfo.Node()
state, err := getStateData(cs)
if err != nil {
return framework.AsStatus(err)
}
podVolumes, reasons, err := pl.Binder.FindPodVolumes(logger, pod, state.podVolumeClaims, node)
if err != nil {
return framework.AsStatus(err)
}
if len(reasons) > 0 {
status := framework.NewStatus(framework.UnschedulableAndUnresolvable)
for _, reason := range reasons {
status.AppendReason(string(reason))
}
return status
}
// multiple goroutines call `Filter` on different nodes simultaneously and the `CycleState` may be duplicated, so we must use a local lock here
state.Lock()
state.podVolumesByNode[node.Name] = podVolumes
state.hasStaticBindings = state.hasStaticBindings || (podVolumes != nil && len(podVolumes.StaticBindings) > 0)
state.Unlock()
return nil
}
// PreScore invoked at the preScore extension point. It checks whether volumeBinding can skip Score
func (pl *VolumeBinding) PreScore(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
if pl.scorer == nil {
return framework.NewStatus(framework.Skip)
}
state, err := getStateData(cs)
if err != nil {
return framework.AsStatus(err)
}
if state.hasStaticBindings {
return nil
}
return framework.NewStatus(framework.Skip)
}
// Score invoked at the score extension point.
func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
if pl.scorer == nil {
return 0, nil
}
state, err := getStateData(cs)
if err != nil {
return 0, framework.AsStatus(err)
}
podVolumes, ok := state.podVolumesByNode[nodeName]
if !ok {
return 0, nil
}
// group by storage class
classResources := make(classResourceMap)
for _, staticBinding := range podVolumes.StaticBindings {
class := staticBinding.StorageClassName()
storageResource := staticBinding.StorageResource()
if _, ok := classResources[class]; !ok {
classResources[class] = &StorageResource{
Requested: 0,
Capacity: 0,
}
}
classResources[class].Requested += storageResource.Requested
classResources[class].Capacity += storageResource.Capacity
}
return pl.scorer(classResources), nil
}
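// Worked example (illustrative): if a node's static bindings request 20Gi from a
// 100Gi PV of class "fast" and 50Gi from a 100Gi PV of class "slow", classResources
// holds {fast: 20/100, slow: 50/100} and the returned score is whatever the
// configured volumeCapacityScorer computes over those two classes.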
// ScoreExtensions of the Score plugin.
func (pl *VolumeBinding) ScoreExtensions() framework.ScoreExtensions {
return nil
}
// Reserve reserves volumes of pod and saves binding status in cycle state.
func (pl *VolumeBinding) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
state, err := getStateData(cs)
if err != nil {
return framework.AsStatus(err)
}
// we don't need to hold the lock as only one node will be reserved for the given pod
podVolumes, ok := state.podVolumesByNode[nodeName]
if ok {
allBound, err := pl.Binder.AssumePodVolumes(klog.FromContext(ctx), pod, nodeName, podVolumes)
if err != nil {
return framework.AsStatus(err)
}
state.allBound = allBound
} else {
// may not exist if the pod does not reference any PVC
state.allBound = true
}
return nil
}
// PreBind will make the API update with the assumed bindings and wait until
// the PV controller has completely finished the binding operation.
//
// If binding errors, times out or gets undone, then an error will be returned to
// retry scheduling.
func (pl *VolumeBinding) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
s, err := getStateData(cs)
if err != nil {
return framework.AsStatus(err)
}
if s.allBound {
// no need to bind volumes
return nil
}
// we don't need to hold the lock as only one node will be pre-bound for the given pod
podVolumes, ok := s.podVolumesByNode[nodeName]
if !ok {
return framework.AsStatus(fmt.Errorf("no pod volumes found for node %q", nodeName))
}
logger := klog.FromContext(ctx)
logger.V(5).Info("Trying to bind volumes for pod", "pod", klog.KObj(pod))
err = pl.Binder.BindPodVolumes(ctx, pod, podVolumes)
if err != nil {
logger.V(5).Info("Failed to bind volumes for pod", "pod", klog.KObj(pod), "err", err)
return framework.AsStatus(err)
}
logger.V(5).Info("Success binding volumes for pod", "pod", klog.KObj(pod))
return nil
}
// Unreserve clears assumed PV and PVC cache.
// It's idempotent, and does nothing if no cache found for the given pod.
func (pl *VolumeBinding) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
s, err := getStateData(cs)
if err != nil {
return
}
// we don't need to hold the lock as only one node may be unreserved
podVolumes, ok := s.podVolumesByNode[nodeName]
if !ok {
return
}
pl.Binder.RevertAssumedPodVolumes(podVolumes)
}
// New initializes a new plugin and returns it.
func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, ok := plArgs.(*config.VolumeBindingArgs)
if !ok {
return nil, fmt.Errorf("want args to be of type VolumeBindingArgs, got %T", plArgs)
}
if err := validation.ValidateVolumeBindingArgsWithOptions(nil, args, validation.VolumeBindingArgsValidationOptions{
AllowVolumeCapacityPriority: fts.EnableVolumeCapacityPriority,
}); err != nil {
return nil, err
}
podInformer := fh.SharedInformerFactory().Core().V1().Pods()
nodeInformer := fh.SharedInformerFactory().Core().V1().Nodes()
pvcInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumeClaims()
pvInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumes()
storageClassInformer := fh.SharedInformerFactory().Storage().V1().StorageClasses()
csiNodeInformer := fh.SharedInformerFactory().Storage().V1().CSINodes()
capacityCheck := CapacityCheck{
CSIDriverInformer: fh.SharedInformerFactory().Storage().V1().CSIDrivers(),
CSIStorageCapacityInformer: fh.SharedInformerFactory().Storage().V1().CSIStorageCapacities(),
}
binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second)
// build score function
var scorer volumeCapacityScorer
if fts.EnableVolumeCapacityPriority {
shape := make(helper.FunctionShape, 0, len(args.Shape))
for _, point := range args.Shape {
shape = append(shape, helper.FunctionShapePoint{
Utilization: int64(point.Utilization),
Score: int64(point.Score) * (framework.MaxNodeScore / config.MaxCustomPriorityScore),
})
}
scorer = buildScorerFunction(shape)
}
return &VolumeBinding{
Binder: binder,
PVCLister: pvcInformer.Lister(),
scorer: scorer,
fts: fts,
}, nil
}

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-storage-approvers
- cofyc
reviewers:
- sig-storage-reviewers
- cofyc
labels:
- sig/storage

View File

@ -0,0 +1,426 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumerestrictions
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// VolumeRestrictions is a plugin that checks volume restrictions.
type VolumeRestrictions struct {
pvcLister corelisters.PersistentVolumeClaimLister
sharedLister framework.SharedLister
enableSchedulingQueueHint bool
}
var _ framework.PreFilterPlugin = &VolumeRestrictions{}
var _ framework.FilterPlugin = &VolumeRestrictions{}
var _ framework.EnqueueExtensions = &VolumeRestrictions{}
var _ framework.StateData = &preFilterState{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.VolumeRestrictions
// preFilterStateKey is the key in CycleState to VolumeRestrictions pre-computed data for Filtering.
// Using the name of the plugin will likely help us avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + Name
// ErrReasonDiskConflict is used for NoDiskConflict predicate error.
ErrReasonDiskConflict = "node(s) had no available disk"
// ErrReasonReadWriteOncePodConflict is used when a pod is found using the same PVC with the ReadWriteOncePod access mode.
ErrReasonReadWriteOncePodConflict = "node has pod using PersistentVolumeClaim with the same name and ReadWriteOncePod access mode"
)
// preFilterState computed at PreFilter and used at Filter.
type preFilterState struct {
// Names of the pod's volumes using the ReadWriteOncePod access mode.
readWriteOncePodPVCs sets.Set[string]
// The number of references to these ReadWriteOncePod volumes by scheduled pods.
conflictingPVCRefCount int
}
func (s *preFilterState) updateWithPod(podInfo *framework.PodInfo, multiplier int) {
s.conflictingPVCRefCount += multiplier * s.conflictingPVCRefCountForPod(podInfo)
}
func (s *preFilterState) conflictingPVCRefCountForPod(podInfo *framework.PodInfo) int {
conflicts := 0
for _, volume := range podInfo.Pod.Spec.Volumes {
if volume.PersistentVolumeClaim == nil {
continue
}
if s.readWriteOncePodPVCs.Has(volume.PersistentVolumeClaim.ClaimName) {
conflicts += 1
}
}
return conflicts
}
// Clone the prefilter state.
func (s *preFilterState) Clone() framework.StateData {
if s == nil {
return nil
}
return &preFilterState{
readWriteOncePodPVCs: s.readWriteOncePodPVCs,
conflictingPVCRefCount: s.conflictingPVCRefCount,
}
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *VolumeRestrictions) Name() string {
return Name
}
func isVolumeConflict(volume *v1.Volume, pod *v1.Pod) bool {
for _, existingVolume := range pod.Spec.Volumes {
// Same GCE disk mounted by multiple pods conflicts unless all pods mount it read-only.
if volume.GCEPersistentDisk != nil && existingVolume.GCEPersistentDisk != nil {
disk, existingDisk := volume.GCEPersistentDisk, existingVolume.GCEPersistentDisk
if disk.PDName == existingDisk.PDName && !(disk.ReadOnly && existingDisk.ReadOnly) {
return true
}
}
if volume.AWSElasticBlockStore != nil && existingVolume.AWSElasticBlockStore != nil {
if volume.AWSElasticBlockStore.VolumeID == existingVolume.AWSElasticBlockStore.VolumeID {
return true
}
}
if volume.ISCSI != nil && existingVolume.ISCSI != nil {
iqn := volume.ISCSI.IQN
eiqn := existingVolume.ISCSI.IQN
// two iSCSI volumes are the same if they share the same IQN. As iSCSI volumes are of type
// RWO or ROX, only one read-write mount is permitted. The same iSCSI volume mounted by multiple
// Pods conflicts unless all other Pods mount it read-only.
if iqn == eiqn && !(volume.ISCSI.ReadOnly && existingVolume.ISCSI.ReadOnly) {
return true
}
}
if volume.RBD != nil && existingVolume.RBD != nil {
mon, pool, image := volume.RBD.CephMonitors, volume.RBD.RBDPool, volume.RBD.RBDImage
emon, epool, eimage := existingVolume.RBD.CephMonitors, existingVolume.RBD.RBDPool, existingVolume.RBD.RBDImage
// two RBD images are the same if they share the same Ceph monitor, are in the same RADOS Pool, and have the same image name
// only one read-write mount is permitted for the same RBD image.
// same RBD image mounted by multiple Pods conflicts unless all Pods mount the image read-only
if haveOverlap(mon, emon) && pool == epool && image == eimage && !(volume.RBD.ReadOnly && existingVolume.RBD.ReadOnly) {
return true
}
}
}
return false
}
// haveOverlap searches two arrays and returns true if they have at least one common element; returns false otherwise.
func haveOverlap(a1, a2 []string) bool {
if len(a1) > len(a2) {
a1, a2 = a2, a1
}
m := sets.New(a1...)
for _, val := range a2 {
if _, ok := m[val]; ok {
return true
}
}
return false
}
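// For example (illustrative), haveOverlap([]string{"mon-a", "mon-b"}, []string{"mon-b", "mon-c"})
// returns true because "mon-b" appears in both monitor lists, while fully disjoint
// lists return false.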
// needsRestrictionsCheck returns true if the volume is of a type that requires a conflict check.
func needsRestrictionsCheck(v v1.Volume) bool {
return v.GCEPersistentDisk != nil || v.AWSElasticBlockStore != nil || v.RBD != nil || v.ISCSI != nil
}
// PreFilter computes and stores cycleState containing details for enforcing ReadWriteOncePod.
func (pl *VolumeRestrictions) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
needsCheck := false
for i := range pod.Spec.Volumes {
if needsRestrictionsCheck(pod.Spec.Volumes[i]) {
needsCheck = true
break
}
}
pvcs, err := pl.readWriteOncePodPVCsForPod(ctx, pod)
if err != nil {
if apierrors.IsNotFound(err) {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
}
return nil, framework.AsStatus(err)
}
s, err := pl.calPreFilterState(ctx, pod, pvcs)
if err != nil {
return nil, framework.AsStatus(err)
}
if !needsCheck && s.conflictingPVCRefCount == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cycleState.Write(preFilterStateKey, s)
return nil, nil
}
// AddPod from pre-computed data in cycleState.
func (pl *VolumeRestrictions) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
state.updateWithPod(podInfoToAdd, 1)
return nil
}
// RemovePod from pre-computed data in cycleState.
func (pl *VolumeRestrictions) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
state.updateWithPod(podInfoToRemove, -1)
return nil
}
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("cannot read %q from cycleState", preFilterStateKey)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to volumerestrictions.state error", c)
}
return s, nil
}
// calPreFilterState computes preFilterState describing which PVCs use ReadWriteOncePod
// and which pods in the cluster are in conflict.
func (pl *VolumeRestrictions) calPreFilterState(ctx context.Context, pod *v1.Pod, pvcs sets.Set[string]) (*preFilterState, error) {
conflictingPVCRefCount := 0
for pvc := range pvcs {
key := framework.GetNamespacedName(pod.Namespace, pvc)
if pl.sharedLister.StorageInfos().IsPVCUsedByPods(key) {
// There can only be at most one pod using the ReadWriteOncePod PVC.
conflictingPVCRefCount += 1
}
}
return &preFilterState{
readWriteOncePodPVCs: pvcs,
conflictingPVCRefCount: conflictingPVCRefCount,
}, nil
}
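// Worked example (illustrative): if the pod being scheduled references a
// ReadWriteOncePod PVC named "data" and the cluster snapshot already records a
// scheduled pod using "data", conflictingPVCRefCount starts at 1 and Filter
// returns Unschedulable; AddPod/RemovePod adjust the count when preemption is
// simulated.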
func (pl *VolumeRestrictions) readWriteOncePodPVCsForPod(ctx context.Context, pod *v1.Pod) (sets.Set[string], error) {
pvcs := sets.New[string]()
for _, volume := range pod.Spec.Volumes {
if volume.PersistentVolumeClaim == nil {
continue
}
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(volume.PersistentVolumeClaim.ClaimName)
if err != nil {
return nil, err
}
if !v1helper.ContainsAccessMode(pvc.Spec.AccessModes, v1.ReadWriteOncePod) {
continue
}
pvcs.Insert(pvc.Name)
}
return pvcs, nil
}
// Checks if scheduling the pod onto this node would cause any conflicts with
// existing volumes.
func satisfyVolumeConflicts(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
for i := range pod.Spec.Volumes {
v := pod.Spec.Volumes[i]
if !needsRestrictionsCheck(v) {
continue
}
for _, ev := range nodeInfo.Pods {
if isVolumeConflict(&v, ev.Pod) {
return false
}
}
}
return true
}
// Checks if scheduling the pod would cause any ReadWriteOncePod PVC access mode conflicts.
func satisfyReadWriteOncePod(ctx context.Context, state *preFilterState) *framework.Status {
if state == nil {
return nil
}
if state.conflictingPVCRefCount > 0 {
return framework.NewStatus(framework.Unschedulable, ErrReasonReadWriteOncePodConflict)
}
return nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *VolumeRestrictions) PreFilterExtensions() framework.PreFilterExtensions {
return pl
}
// Filter invoked at the filter extension point.
// It evaluates if a pod can fit due to the volumes it requests, and those that
// are already mounted. If there is already a volume mounted on that node, another pod that uses the same volume
// can't be scheduled there.
// This is GCE, Amazon EBS, ISCSI and Ceph RBD specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods mounting the same volume ID
// - Ceph RBD forbids if any two pods share at least same monitor, and match pool and image, and the image is read-only
// - ISCSI forbids if any two pods share at least same IQN and ISCSI volume is read-only
// If the pod uses PVCs with the ReadWriteOncePod access mode, it evaluates if
// these PVCs are already in-use and if preemption will help.
func (pl *VolumeRestrictions) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
if !satisfyVolumeConflicts(pod, nodeInfo) {
return framework.NewStatus(framework.Unschedulable, ErrReasonDiskConflict)
}
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
return satisfyReadWriteOncePod(ctx, state)
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *VolumeRestrictions) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint/UpdateNodeLabel event:
// Ideally, it's supposed to register only Add because any Node update event will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
nodeActionType = framework.Add
}
return []framework.ClusterEventWithHint{
// Pods may fail to schedule because of volumes conflicting with other pods on same node.
// Once running pods are deleted and volumes have been released, the unschedulable pod will be schedulable.
// Due to immutable fields `spec.volumes`, pod update events are ignored.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
// A new Node may make a pod schedulable.
// We intentionally don't set QueueingHint since all Node/Add events could make Pods schedulable.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
// Pods may fail to schedule because the PVC it uses has not yet been created.
// This PVC is required to exist to check its access modes.
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add},
QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimAdded},
}, nil
}
// isSchedulableAfterPersistentVolumeClaimAdded is invoked whenever a PersistentVolumeClaim is added or changed. It checks whether
// that change made a previously unschedulable pod schedulable.
func (pl *VolumeRestrictions) isSchedulableAfterPersistentVolumeClaimAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, newPersistentVolumeClaim, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeClaimChange: %w", err)
}
if newPersistentVolumeClaim.Namespace != pod.Namespace {
return framework.QueueSkip, nil
}
for _, volume := range pod.Spec.Volumes {
if volume.PersistentVolumeClaim == nil {
continue
}
if volume.PersistentVolumeClaim.ClaimName == newPersistentVolumeClaim.Name {
logger.V(5).Info("PVC that is referred from the pod was created, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(newPersistentVolumeClaim))
return framework.Queue, nil
}
}
logger.V(5).Info("PVC irrelevant to the Pod was created, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(newPersistentVolumeClaim))
return framework.QueueSkip, nil
}
// isSchedulableAfterPodDeleted is invoked whenever a pod is deleted.
// It checks whether the deleted pod's volumes or PVCs conflicted with those of the target pod.
func (pl *VolumeRestrictions) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
deletedPod, _, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPodDeleted: %w", err)
}
if deletedPod.Namespace != pod.Namespace {
return framework.QueueSkip, nil
}
nodeInfo := framework.NewNodeInfo(deletedPod)
if !satisfyVolumeConflicts(pod, nodeInfo) {
logger.V(5).Info("Pod with the volume that the target pod requires was deleted, which might make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.Queue, nil
}
// Return Queue if a deleted pod uses the same PVC since the pod may be unschedulable due to the ReadWriteOncePod access mode of the PVC.
//
// For now, we don't actually fetch PVC and check the access mode because that operation could be expensive.
// Once the observability around QHint is established,
// we may want to do that depending on how much the operation would impact the QHint latency negatively.
// https://github.com/kubernetes/kubernetes/issues/124566
claims := sets.New[string]()
for _, volume := range pod.Spec.Volumes {
if volume.PersistentVolumeClaim != nil {
claims.Insert(volume.PersistentVolumeClaim.ClaimName)
}
}
for _, volume := range deletedPod.Spec.Volumes {
if volume.PersistentVolumeClaim != nil && claims.Has(volume.PersistentVolumeClaim.ClaimName) {
logger.V(5).Info("Pod with the same PVC that the target pod requires was deleted, which might make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.Queue, nil
}
}
logger.V(5).Info("An irrelevant Pod was deleted, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
informerFactory := handle.SharedInformerFactory()
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
sharedLister := handle.SnapshotSharedLister()
return &VolumeRestrictions{
pvcLister: pvcLister,
sharedLister: sharedLister,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-storage-approvers
- cofyc
reviewers:
- sig-storage-reviewers
- cofyc
labels:
- sig/storage

View File

@ -0,0 +1,410 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumezone
import (
"context"
"errors"
"fmt"
"reflect"
v1 "k8s.io/api/core/v1"
storage "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
corelisters "k8s.io/client-go/listers/core/v1"
storagelisters "k8s.io/client-go/listers/storage/v1"
volumehelpers "k8s.io/cloud-provider/volume/helpers"
storagehelpers "k8s.io/component-helpers/storage/volume"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// VolumeZone is a plugin that checks volume zone.
type VolumeZone struct {
pvLister corelisters.PersistentVolumeLister
pvcLister corelisters.PersistentVolumeClaimLister
scLister storagelisters.StorageClassLister
enableSchedulingQueueHint bool
}
var _ framework.FilterPlugin = &VolumeZone{}
var _ framework.PreFilterPlugin = &VolumeZone{}
var _ framework.EnqueueExtensions = &VolumeZone{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.VolumeZone
preFilterStateKey framework.StateKey = "PreFilter" + Name
// ErrReasonConflict is used for NoVolumeZoneConflict predicate error.
ErrReasonConflict = "node(s) had no available volume zone"
)
// pvTopology holds the value of a pv's topologyLabel
type pvTopology struct {
pvName string
key string
values sets.Set[string]
}
// stateData is initialized in the PreFilter phase. Because we save a pointer in
// framework.CycleState, later phases do not need to call the Write method
// to update the value.
type stateData struct {
// podPVTopologies holds the pv information we need
// it's initialized in the PreFilter phase
podPVTopologies []pvTopology
}
func (d *stateData) Clone() framework.StateData {
return d
}
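// Illustrative sketch (not part of the upstream file): because Clone returns the same
// pointer, later phases can mutate the stored slice through the pointer read from
// CycleState without calling Write again. The helper name appendPVTopology is
// hypothetical and only demonstrates the pattern.
func appendPVTopology(cs *framework.CycleState, t pvTopology) error {
	s, err := getStateData(cs)
	if err != nil {
		return err
	}
	// Mutation through the shared pointer; no second cs.Write is needed.
	s.podPVTopologies = append(s.podPVTopologies, t)
	return nil
}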
var topologyLabels = []string{
v1.LabelFailureDomainBetaZone,
v1.LabelFailureDomainBetaRegion,
v1.LabelTopologyZone,
v1.LabelTopologyRegion,
}
func translateToGALabel(label string) string {
if label == v1.LabelFailureDomainBetaRegion {
return v1.LabelTopologyRegion
}
if label == v1.LabelFailureDomainBetaZone {
return v1.LabelTopologyZone
}
return label
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *VolumeZone) Name() string {
return Name
}
// PreFilter is invoked at the PreFilter extension point.
//
// It finds the topology of the PersistentVolumes corresponding to the volumes a pod requests.
//
// Currently, this is only supported with PersistentVolumeClaims,
// and only looks for the bound PersistentVolume.
func (pl *VolumeZone) PreFilter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
logger := klog.FromContext(ctx)
podPVTopologies, status := pl.getPVbyPod(logger, pod)
if !status.IsSuccess() {
return nil, status
}
if len(podPVTopologies) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cs.Write(preFilterStateKey, &stateData{podPVTopologies: podPVTopologies})
return nil, nil
}
// getPVbyPod gets the pvTopology entries for the PVs referenced by the pod.
func (pl *VolumeZone) getPVbyPod(logger klog.Logger, pod *v1.Pod) ([]pvTopology, *framework.Status) {
podPVTopologies := make([]pvTopology, 0)
pvcNames := pl.getPersistentVolumeClaimNameFromPod(pod)
for _, pvcName := range pvcNames {
if pvcName == "" {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolumeClaim had no name")
}
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
if s := getErrorAsStatus(err); !s.IsSuccess() {
return nil, s
}
pvName := pvc.Spec.VolumeName
if pvName == "" {
scName := storagehelpers.GetPersistentVolumeClaimClass(pvc)
if len(scName) == 0 {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolumeClaim had no pv name and storageClass name")
}
class, err := pl.scLister.Get(scName)
if s := getErrorAsStatus(err); !s.IsSuccess() {
return nil, s
}
if class.VolumeBindingMode == nil {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("VolumeBindingMode not set for StorageClass %q", scName))
}
if *class.VolumeBindingMode == storage.VolumeBindingWaitForFirstConsumer {
// Skip unbound volumes
continue
}
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolume had no name")
}
pv, err := pl.pvLister.Get(pvName)
if s := getErrorAsStatus(err); !s.IsSuccess() {
return nil, s
}
podPVTopologies = append(podPVTopologies, pl.getPVTopologies(logger, pv)...)
}
return podPVTopologies, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *VolumeZone) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
// Filter is invoked at the Filter extension point.
//
// It evaluates if a pod can fit due to the volumes it requests, given
// that some volumes may have zone scheduling constraints. The requirement is that any
// volume zone-labels must match the equivalent zone-labels on the node. It is OK for
// the node to have more zone-label constraints (for example, a hypothetical replicated
// volume might allow region-wide access)
//
// Currently this is only supported with PersistentVolumeClaims, and looks only at the
// labels on the bound PersistentVolume.
//
// Working with volumes declared inline in the pod specification (i.e. not
// using a PersistentVolume) is likely to be harder, as it would require
// determining the zone of a volume during scheduling, and that is likely to
// require calling out to the cloud provider. It seems that we are moving away
// from inline volume declarations anyway.
func (pl *VolumeZone) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
logger := klog.FromContext(ctx)
// If a pod doesn't have any volume attached to it, the predicate will always be true.
// Thus we make a fast path for it, to avoid unnecessary computations in this case.
if len(pod.Spec.Volumes) == 0 {
return nil
}
var podPVTopologies []pvTopology
state, err := getStateData(cs)
if err != nil {
// Fallback to calculate pv list here
var status *framework.Status
podPVTopologies, status = pl.getPVbyPod(logger, pod)
if !status.IsSuccess() {
return status
}
} else {
podPVTopologies = state.podPVTopologies
}
node := nodeInfo.Node()
hasAnyNodeConstraint := false
for _, topologyLabel := range topologyLabels {
if _, ok := node.Labels[topologyLabel]; ok {
hasAnyNodeConstraint = true
break
}
}
if !hasAnyNodeConstraint {
// The node has no zone constraints, so we're OK to schedule.
// This is to handle a single-zone cluster scenario where the node may not have any topology labels.
return nil
}
for _, pvTopology := range podPVTopologies {
v, ok := node.Labels[pvTopology.key]
if !ok {
// if we can't match the beta label, try to match pv's beta label with node's ga label
v, ok = node.Labels[translateToGALabel(pvTopology.key)]
}
if !ok || !pvTopology.values.Has(v) {
logger.V(10).Info("Won't schedule pod onto node due to volume (mismatch on label key)", "pod", klog.KObj(pod), "node", klog.KObj(node), "PV", klog.KRef("", pvTopology.pvName), "PVLabelKey", pvTopology.key)
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonConflict)
}
}
return nil
}
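// Illustrative sketch (not part of the upstream file): the per-PV match rule used in the
// loop above, in isolation. A PV whose zone label values are {"zone-a"} fits a node
// labeled with zone-a but not one labeled with zone-b; nodes without any topology labels
// are accepted earlier via the hasAnyNodeConstraint fast path. The helper name
// pvFitsNode is hypothetical.
func pvFitsNode(pt pvTopology, nodeLabels map[string]string) bool {
	v, ok := nodeLabels[pt.key]
	if !ok {
		// Fall back to the GA label when the PV still carries a beta topology label.
		v, ok = nodeLabels[translateToGALabel(pt.key)]
	}
	return ok && pt.values.Has(v)
}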
func getStateData(cs *framework.CycleState) (*stateData, error) {
state, err := cs.Read(preFilterStateKey)
if err != nil {
return nil, err
}
s, ok := state.(*stateData)
if !ok {
return nil, errors.New("unable to convert state into stateData")
}
return s, nil
}
func getErrorAsStatus(err error) *framework.Status {
if err != nil {
if apierrors.IsNotFound(err) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
}
return framework.AsStatus(err)
}
return nil
}
// EventsToRegister returns the possible events that may make a Pod
// previously rejected by this plugin schedulable.
func (pl *VolumeZone) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A new node or updating a node's volume zone labels may make a pod schedulable.
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
return []framework.ClusterEventWithHint{
// New storageClass with bind mode `VolumeBindingWaitForFirstConsumer` will make a pod schedulable.
// Due to immutable field `storageClass.volumeBindingMode`, storageClass update events are ignored.
{Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add}, QueueingHintFn: pl.isSchedulableAfterStorageClassAdded},
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
// A new pvc may make a pod schedulable.
// Also, if pvc's VolumeName is filled, that also could make a pod schedulable.
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimChange},
// A new pv or updating a pv's volume zone labels may make a pod schedulable.
{Event: framework.ClusterEvent{Resource: framework.PersistentVolume, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeChange},
}, nil
}
// getPersistentVolumeClaimNameFromPod gets pvc names bound to a pod.
func (pl *VolumeZone) getPersistentVolumeClaimNameFromPod(pod *v1.Pod) []string {
var pvcNames []string
for i := range pod.Spec.Volumes {
volume := pod.Spec.Volumes[i]
if volume.PersistentVolumeClaim == nil {
continue
}
pvcName := volume.PersistentVolumeClaim.ClaimName
pvcNames = append(pvcNames, pvcName)
}
return pvcNames
}
// isSchedulableAfterPersistentVolumeClaimChange is invoked whenever a PersistentVolumeClaim is added or updated.
// It checks whether the change of PVC has made a previously unschedulable pod schedulable.
func (pl *VolumeZone) isSchedulableAfterPersistentVolumeClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPVC, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeClaimChange: %w", err)
}
if pl.isPVCRequestedFromPod(logger, modifiedPVC, pod) {
logger.V(5).Info("PVC that is referred from the pod was created or updated, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(modifiedPVC))
return framework.Queue, nil
}
logger.V(5).Info("PVC irrelevant to the Pod was created or updated, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(modifiedPVC))
return framework.QueueSkip, nil
}
// isPVCRequestedFromPod verifies whether the PVC is requested by a given Pod.
func (pl *VolumeZone) isPVCRequestedFromPod(logger klog.Logger, pvc *v1.PersistentVolumeClaim, pod *v1.Pod) bool {
if (pvc == nil) || (pod.Namespace != pvc.Namespace) {
return false
}
pvcNames := pl.getPersistentVolumeClaimNameFromPod(pod)
for _, pvcName := range pvcNames {
if pvc.Name == pvcName {
logger.V(5).Info("PVC is referred from the pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc))
return true
}
}
logger.V(5).Info("PVC is not referred from the pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc))
return false
}
// isSchedulableAfterStorageClassAdded is invoked whenever a StorageClass is added.
// It checks whether the addition of StorageClass has made a previously unschedulable pod schedulable.
// Only a new StorageClass with WaitForFirstConsumer will cause a pod to become schedulable.
func (pl *VolumeZone) isSchedulableAfterStorageClassAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, addedStorageClass, err := util.As[*storage.StorageClass](nil, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterStorageClassAdded: %w", err)
}
if (addedStorageClass.VolumeBindingMode == nil) || (*addedStorageClass.VolumeBindingMode != storage.VolumeBindingWaitForFirstConsumer) {
logger.V(5).Info("StorageClass is created, but its VolumeBindingMode is not waitForFirstConsumer, which doesn't make the pod schedulable", "storageClass", klog.KObj(addedStorageClass), "pod", klog.KObj(pod))
return framework.QueueSkip, nil
}
logger.V(5).Info("StorageClass with waitForFirstConsumer mode was created and it might make this pod schedulable", "pod", klog.KObj(pod), "StorageClass", klog.KObj(addedStorageClass))
return framework.Queue, nil
}
// isSchedulableAfterPersistentVolumeChange is invoked whenever a PersistentVolume is added or updated.
// It checks whether the change of PV has made a previously unschedulable pod schedulable.
// Changing the PV topology labels could cause the pod to become schedulable.
func (pl *VolumeZone) isSchedulableAfterPersistentVolumeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalPV, modifiedPV, err := util.As[*v1.PersistentVolume](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeChange: %w", err)
}
if originalPV == nil {
logger.V(5).Info("PV is newly created, which might make the pod schedulable")
return framework.Queue, nil
}
originalPVTopologies := pl.getPVTopologies(logger, originalPV)
modifiedPVTopologies := pl.getPVTopologies(logger, modifiedPV)
if !reflect.DeepEqual(originalPVTopologies, modifiedPVTopologies) {
logger.V(5).Info("PV's topology was updated, which might make the pod schedulable.", "pod", klog.KObj(pod), "PV", klog.KObj(modifiedPV))
return framework.Queue, nil
}
logger.V(5).Info("PV was updated, but the topology is unchanged, which it doesn't make the pod schedulable", "pod", klog.KObj(pod), "PV", klog.KObj(modifiedPV))
return framework.QueueSkip, nil
}
// getPVTopologies retrieves the pvTopology entries from a given PV and returns them as a slice.
// This function doesn't check spec.nodeAffinity because it is read-only after creation and thus
// cannot be updated, and nodeAffinity is handled by the node affinity plugin.
func (pl *VolumeZone) getPVTopologies(logger klog.Logger, pv *v1.PersistentVolume) []pvTopology {
podPVTopologies := make([]pvTopology, 0)
for _, key := range topologyLabels {
if value, ok := pv.ObjectMeta.Labels[key]; ok {
labelZonesSet, err := volumehelpers.LabelZonesToSet(value)
if err != nil {
logger.V(5).Info("failed to parse PV's topology label, ignoring the label", "label", fmt.Sprintf("%s:%s", key, value), "err", err)
continue
}
podPVTopologies = append(podPVTopologies, pvTopology{
pvName: pv.Name,
key: key,
values: sets.Set[string](labelZonesSet),
})
}
}
return podPVTopologies
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
informerFactory := handle.SharedInformerFactory()
pvLister := informerFactory.Core().V1().PersistentVolumes().Lister()
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
scLister := informerFactory.Storage().V1().StorageClasses().Lister()
return &VolumeZone{
pvLister: pvLister,
pvcLister: pvcLister,
scLister: scLister,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}

View File

@ -0,0 +1,738 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package preemption
import (
"context"
"errors"
"fmt"
"math"
"sync"
"sync/atomic"
"time"
v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
corelisters "k8s.io/client-go/listers/core/v1"
policylisters "k8s.io/client-go/listers/policy/v1"
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
apipod "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// Candidate represents a nominated node on which the preemptor can be scheduled,
// along with the list of victims that should be evicted for the preemptor to fit the node.
type Candidate interface {
// Victims wraps a list of to-be-preempted Pods and the number of PDB violation.
Victims() *extenderv1.Victims
// Name returns the target node name where the preemptor gets nominated to run.
Name() string
}
type candidate struct {
victims *extenderv1.Victims
name string
}
// Victims returns s.victims.
func (s *candidate) Victims() *extenderv1.Victims {
return s.victims
}
// Name returns s.name.
func (s *candidate) Name() string {
return s.name
}
type candidateList struct {
idx int32
items []Candidate
}
func newCandidateList(size int32) *candidateList {
return &candidateList{idx: -1, items: make([]Candidate, size)}
}
// add adds a new candidate to the internal array atomically.
func (cl *candidateList) add(c *candidate) {
if idx := atomic.AddInt32(&cl.idx, 1); idx < int32(len(cl.items)) {
cl.items[idx] = c
}
}
// size returns the number of candidates stored. Note that some add() operations
// might still be executing when this is called, so care must be taken to
// ensure that all add() operations complete before accessing the elements of
// the list.
func (cl *candidateList) size() int32 {
n := atomic.LoadInt32(&cl.idx) + 1
if n >= int32(len(cl.items)) {
n = int32(len(cl.items))
}
return n
}
// get returns the internal candidate array. This function is NOT atomic and
// assumes that all add() operations have been completed.
func (cl *candidateList) get() []Candidate {
return cl.items[:cl.size()]
}
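// Illustrative sketch (not part of the upstream file): the intended usage is to let every
// add() finish (DryRunPreemption does this via Parallelizer().Until, which blocks until
// all workers return) before reading the list with get(). The function below shows the
// same ordering with a plain sync.WaitGroup; the name collectCandidates is hypothetical.
func collectCandidates(cs []*candidate) []Candidate {
	cl := newCandidateList(int32(len(cs)))
	var wg sync.WaitGroup
	for _, c := range cs {
		wg.Add(1)
		go func(c *candidate) {
			defer wg.Done()
			cl.add(c)
		}(c)
	}
	// All add() calls have completed here, so get() is safe to call.
	wg.Wait()
	return cl.get()
}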
// Interface is expected to be implemented by different preemption plugins as all those member
// methods might have different behavior compared with the default preemption.
type Interface interface {
// GetOffsetAndNumCandidates chooses a random offset and calculates the number of candidates that should be
// shortlisted for dry running preemption.
GetOffsetAndNumCandidates(nodes int32) (int32, int32)
// CandidatesToVictimsMap builds a map from the target node to a list of to-be-preempted Pods and the number of PDB violation.
CandidatesToVictimsMap(candidates []Candidate) map[string]*extenderv1.Victims
// PodEligibleToPreemptOthers returns one bool and one string. The bool indicates whether this pod should be considered for
// preempting other pods or not. The string includes the reason if this pod isn't eligible.
PodEligibleToPreemptOthers(ctx context.Context, pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string)
// SelectVictimsOnNode finds minimum set of pods on the given node that should be preempted in order to make enough room
// for "pod" to be scheduled.
// Note that both `state` and `nodeInfo` are deep copied.
SelectVictimsOnNode(ctx context.Context, state *framework.CycleState,
pod *v1.Pod, nodeInfo *framework.NodeInfo, pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status)
// OrderedScoreFuncs returns a list of ordered score functions to select preferable node where victims will be preempted.
// The ordered score functions will be processed one by one iff we find more than one node with the highest score.
// Default score functions will be processed if nil is returned here, for backwards compatibility.
OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64
}
type Evaluator struct {
PluginName string
Handler framework.Handle
PodLister corelisters.PodLister
PdbLister policylisters.PodDisruptionBudgetLister
enableAsyncPreemption bool
mu sync.RWMutex
// preempting is a set that records the pods that are currently triggering preemption asynchronously,
// which is used to prevent the pods from entering the scheduling cycle meanwhile.
preempting sets.Set[types.UID]
// PreemptPod is a function that actually makes API calls to preempt a specific Pod.
// This is exposed to be replaced during tests.
PreemptPod func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error
Interface
}
func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsyncPreemption bool) *Evaluator {
podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()
ev := &Evaluator{
PluginName: names.DefaultPreemption,
Handler: fh,
PodLister: podLister,
PdbLister: pdbLister,
Interface: i,
enableAsyncPreemption: enableAsyncPreemption,
preempting: sets.New[types.UID](),
}
// PreemptPod actually makes API calls to preempt a specific Pod.
//
// We implement it here directly, rather than creating a separate method like ev.preemptPod(...)
// to prevent the misuse of the PreemptPod function.
ev.PreemptPod = func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error {
logger := klog.FromContext(ctx)
// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
// Otherwise we should delete the victim.
if waitingPod := ev.Handler.GetWaitingPod(victim.UID); waitingPod != nil {
waitingPod.Reject(pluginName, "preempted")
logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(preemptor), "waitingPod", klog.KObj(victim), "node", c.Name())
} else {
condition := &v1.PodCondition{
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
Reason: v1.PodReasonPreemptionByScheduler,
Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
}
newStatus := victim.Status.DeepCopy()
updated := apipod.UpdatePodCondition(newStatus, condition)
if updated {
if err := util.PatchPodStatus(ctx, ev.Handler.ClientSet(), victim, newStatus); err != nil {
logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
return err
}
}
if err := util.DeletePod(ctx, ev.Handler.ClientSet(), victim); err != nil {
if !apierrors.IsNotFound(err) {
logger.Error(err, "Tried to preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
return err
}
logger.V(2).Info("Victim Pod is already deleted", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
return nil
}
logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
}
ev.Handler.EventRecorder().Eventf(victim, preemptor, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", preemptor.UID, c.Name())
return nil
}
return ev
}
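// Illustrative sketch (not part of the upstream file): because PreemptPod is a struct
// field, tests can replace it after NewEvaluator to record which victims would be
// evicted without issuing real API calls. withFakePreemptPod is a hypothetical test
// helper, not scheduler code.
func withFakePreemptPod(ev *Evaluator, evicted *[]string) *Evaluator {
	ev.PreemptPod = func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error {
		// Record the would-be victim instead of patching and deleting it.
		*evicted = append(*evicted, victim.Name)
		return nil
	}
	return ev
}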
// IsPodRunningPreemption returns true if the pod is currently triggering preemption asynchronously.
func (ev *Evaluator) IsPodRunningPreemption(podUID types.UID) bool {
ev.mu.RLock()
defer ev.mu.RUnlock()
return ev.preempting.Has(podUID)
}
// Preempt returns a PostFilterResult carrying suggested nominatedNodeName, along with a Status.
// The semantics of returned <PostFilterResult, Status> varies on different scenarios:
//
// - <nil, Error>. This denotes it's a transient/rare error that may be self-healed in future cycles.
//
// - <nil, Unschedulable>. This status is mostly as expected like the preemptor is waiting for the
// victims to be fully terminated.
//
// - In both cases above, a nil PostFilterResult is returned to keep the pod's nominatedNodeName unchanged.
//
// - <non-nil PostFilterResult, Unschedulable>. It indicates the pod cannot be scheduled even with preemption.
// In this case, a non-nil PostFilterResult is returned and result.NominatingMode instructs how to deal with
// the nominatedNodeName.
//
// - <non-nil PostFilterResult, Success>. It's the regular happy path
// and the non-empty nominatedNodeName will be applied to the preemptor pod.
func (ev *Evaluator) Preempt(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
logger := klog.FromContext(ctx)
// 0) Fetch the latest version of <pod>.
// It's safe to directly fetch the pod here because the informer cache has already been
// initialized when creating the Scheduler object.
// However, tests may need to manually initialize the shared pod informer.
podNamespace, podName := pod.Namespace, pod.Name
pod, err := ev.PodLister.Pods(pod.Namespace).Get(pod.Name)
if err != nil {
logger.Error(err, "Could not get the updated preemptor pod object", "pod", klog.KRef(podNamespace, podName))
return nil, framework.AsStatus(err)
}
// 1) Ensure the preemptor is eligible to preempt other pods.
nominatedNodeStatus := m.Get(pod.Status.NominatedNodeName)
if ok, msg := ev.PodEligibleToPreemptOthers(ctx, pod, nominatedNodeStatus); !ok {
logger.V(5).Info("Pod is not eligible for preemption", "pod", klog.KObj(pod), "reason", msg)
return nil, framework.NewStatus(framework.Unschedulable, msg)
}
// 2) Find all preemption candidates.
allNodes, err := ev.Handler.SnapshotSharedLister().NodeInfos().List()
if err != nil {
return nil, framework.AsStatus(err)
}
candidates, nodeToStatusMap, err := ev.findCandidates(ctx, state, allNodes, pod, m)
if err != nil && len(candidates) == 0 {
return nil, framework.AsStatus(err)
}
// Return a FitError only when there are no candidates that fit the pod.
if len(candidates) == 0 {
fitError := &framework.FitError{
Pod: pod,
NumAllNodes: len(allNodes),
Diagnosis: framework.Diagnosis{
NodeToStatus: nodeToStatusMap,
// Leave UnschedulablePlugins or PendingPlugins as nil as it won't be used on moving Pods.
},
}
fitError.Diagnosis.NodeToStatus.SetAbsentNodesStatus(framework.NewStatus(framework.UnschedulableAndUnresolvable, "Preemption is not helpful for scheduling"))
// Specify nominatedNodeName to clear the pod's nominatedNodeName status, if applicable.
return framework.NewPostFilterResultWithNominatedNode(""), framework.NewStatus(framework.Unschedulable, fitError.Error())
}
// 3) Interact with registered Extenders to filter out some candidates if needed.
candidates, status := ev.callExtenders(logger, pod, candidates)
if !status.IsSuccess() {
return nil, status
}
// 4) Find the best candidate.
bestCandidate := ev.SelectCandidate(ctx, candidates)
if bestCandidate == nil || len(bestCandidate.Name()) == 0 {
return nil, framework.NewStatus(framework.Unschedulable, "no candidate node for preemption")
}
logger.V(2).Info("the target node for the preemption is determined", "node", bestCandidate.Name(), "pod", klog.KObj(pod))
// 5) Perform preparation work before nominating the selected candidate.
if ev.enableAsyncPreemption {
ev.prepareCandidateAsync(bestCandidate, pod, ev.PluginName)
} else {
if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
return nil, status
}
}
return framework.NewPostFilterResultWithNominatedNode(bestCandidate.Name()), framework.NewStatus(framework.Success)
}
// findCandidates calculates a slice of preemption candidates.
// Each candidate is executable to make the given <pod> schedulable.
func (ev *Evaluator) findCandidates(ctx context.Context, state *framework.CycleState, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
if len(allNodes) == 0 {
return nil, nil, errors.New("no nodes available")
}
logger := klog.FromContext(ctx)
// Get a list of nodes with failed predicates (Unschedulable) that may be satisfied by removing pods from the node.
potentialNodes, err := m.NodesForStatusCode(ev.Handler.SnapshotSharedLister().NodeInfos(), framework.Unschedulable)
if err != nil {
return nil, nil, err
}
if len(potentialNodes) == 0 {
logger.V(3).Info("Preemption will not help schedule pod on any node", "pod", klog.KObj(pod))
// In this case, we should clean-up any existing nominated node name of the pod.
if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), pod); err != nil {
logger.Error(err, "Could not clear the nominatedNodeName field of pod", "pod", klog.KObj(pod))
// We do not return as this error is not critical.
}
return nil, framework.NewDefaultNodeToStatus(), nil
}
pdbs, err := getPodDisruptionBudgets(ev.PdbLister)
if err != nil {
return nil, nil, err
}
offset, candidatesNum := ev.GetOffsetAndNumCandidates(int32(len(potentialNodes)))
return ev.DryRunPreemption(ctx, state, pod, potentialNodes, pdbs, offset, candidatesNum)
}
// callExtenders calls given <extenders> to select the list of feasible candidates.
// We will only check <candidates> with extenders that support preemption.
// Extenders which do not support preemption may later prevent preemptor from being scheduled on the nominated
// node. In that case, scheduler will find a different host for the preemptor in subsequent scheduling cycles.
func (ev *Evaluator) callExtenders(logger klog.Logger, pod *v1.Pod, candidates []Candidate) ([]Candidate, *framework.Status) {
extenders := ev.Handler.Extenders()
nodeLister := ev.Handler.SnapshotSharedLister().NodeInfos()
if len(extenders) == 0 {
return candidates, nil
}
// Migrate candidate slice to victimsMap to adapt to the Extender interface.
// It's only applicable to candidate slices that have a unique nominated node name.
victimsMap := ev.CandidatesToVictimsMap(candidates)
if len(victimsMap) == 0 {
return candidates, nil
}
for _, extender := range extenders {
if !extender.SupportsPreemption() || !extender.IsInterested(pod) {
continue
}
nodeNameToVictims, err := extender.ProcessPreemption(pod, victimsMap, nodeLister)
if err != nil {
if extender.IsIgnorable() {
logger.Info("Skipped extender as it returned error and has ignorable flag set",
"extender", extender.Name(), "err", err)
continue
}
return nil, framework.AsStatus(err)
}
// Check if the returned victims are valid.
for nodeName, victims := range nodeNameToVictims {
if victims == nil || len(victims.Pods) == 0 {
if extender.IsIgnorable() {
delete(nodeNameToVictims, nodeName)
logger.Info("Ignored node for which the extender didn't report victims", "node", klog.KRef("", nodeName), "extender", extender.Name())
continue
}
return nil, framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeName))
}
}
// Replace victimsMap with the new result after preemption so that the rest of the
// extenders can continue to use it as a parameter.
victimsMap = nodeNameToVictims
// If node list becomes empty, no preemption can happen regardless of other extenders.
if len(victimsMap) == 0 {
break
}
}
var newCandidates []Candidate
for nodeName := range victimsMap {
newCandidates = append(newCandidates, &candidate{
victims: victimsMap[nodeName],
name: nodeName,
})
}
return newCandidates, nil
}
// SelectCandidate chooses the best-fit candidate from the given <candidates> and returns it.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) SelectCandidate(ctx context.Context, candidates []Candidate) Candidate {
logger := klog.FromContext(ctx)
if len(candidates) == 0 {
return nil
}
if len(candidates) == 1 {
return candidates[0]
}
victimsMap := ev.CandidatesToVictimsMap(candidates)
scoreFuncs := ev.OrderedScoreFuncs(ctx, victimsMap)
candidateNode := pickOneNodeForPreemption(logger, victimsMap, scoreFuncs)
// Same as candidatesToVictimsMap, this logic is not applicable for out-of-tree
// preemption plugins that exercise different candidates on the same nominated node.
if victims := victimsMap[candidateNode]; victims != nil {
return &candidate{
victims: victims,
name: candidateNode,
}
}
// We shouldn't reach here.
logger.Error(errors.New("no candidate selected"), "Should not reach here", "candidates", candidates)
// To not break the whole flow, return the first candidate.
return candidates[0]
}
// prepareCandidate does some preparation work before nominating the selected candidate:
// - Evict the victim pods
// - Reject the victim pods if they are in waitingPod map
// - Clear the low-priority pods' nominatedNodeName status if needed
func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.Pod, pluginName string) *framework.Status {
fh := ev.Handler
cs := ev.Handler.ClientSet()
ctx, cancel := context.WithCancel(ctx)
defer cancel()
logger := klog.FromContext(ctx)
errCh := parallelize.NewErrorChannel()
fh.Parallelizer().Until(ctx, len(c.Victims().Pods), func(index int) {
if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[index], pluginName); err != nil {
errCh.SendErrorWithCancel(err, cancel)
}
}, ev.PluginName)
if err := errCh.ReceiveError(); err != nil {
return framework.AsStatus(err)
}
metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))
// Lower priority pods nominated to run on this node, may no longer fit on
// this node. So, we should remove their nomination. Removing their
// nomination updates these pods and moves them to the active queue. It
// lets scheduler find another place for them.
nominatedPods := getLowerPriorityNominatedPods(logger, fh, pod, c.Name())
if err := util.ClearNominatedNodeName(ctx, cs, nominatedPods...); err != nil {
logger.Error(err, "Cannot clear 'NominatedNodeName' field")
// We do not return as this error is not critical.
}
return nil
}
// prepareCandidateAsync triggers a goroutine for some preparation work:
// - Evict the victim pods
// - Reject the victim pods if they are in waitingPod map
// - Clear the low-priority pods' nominatedNodeName status if needed
// The Pod won't be retried until the goroutine triggered here completes.
//
// See http://kep.k8s.io/4832 for how the async preemption works.
func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName string) {
metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))
// Intentionally create a new context rather than using the ctx from the scheduling cycle,
// because this process could continue even after this scheduling cycle finishes.
ctx, cancel := context.WithCancel(context.Background())
errCh := parallelize.NewErrorChannel()
preemptPod := func(index int) {
victim := c.Victims().Pods[index]
if err := ev.PreemptPod(ctx, c, pod, victim, pluginName); err != nil {
errCh.SendErrorWithCancel(err, cancel)
}
}
ev.mu.Lock()
ev.preempting.Insert(pod.UID)
ev.mu.Unlock()
logger := klog.FromContext(ctx)
go func() {
startTime := time.Now()
result := metrics.GoroutineResultSuccess
defer metrics.PreemptionGoroutinesDuration.WithLabelValues(result).Observe(metrics.SinceInSeconds(startTime))
defer metrics.PreemptionGoroutinesExecutionTotal.WithLabelValues(result).Inc()
defer func() {
if result == metrics.GoroutineResultError {
// When API call isn't successful, the Pod may get stuck in the unschedulable pod pool in the worst case.
// So, we should move the Pod to the activeQ.
ev.Handler.Activate(logger, map[string]*v1.Pod{pod.Name: pod})
}
}()
defer cancel()
logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods))
// Lower priority pods nominated to run on this node, may no longer fit on
// this node. So, we should remove their nomination. Removing their
// nomination updates these pods and moves them to the active queue. It
// lets scheduler find another place for them.
nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name())
if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil {
logger.Error(err, "Cannot clear 'NominatedNodeName' field from lower priority pods on the same target node", "node", c.Name())
result = metrics.GoroutineResultError
// We do not return as this error is not critical.
}
if len(c.Victims().Pods) == 0 {
ev.mu.Lock()
delete(ev.preempting, pod.UID)
ev.mu.Unlock()
return
}
// We can evict all victims in parallel except the last one.
// We have to remove the pod from the preempting map before the last victim is evicted
// because otherwise the victim's removal might be notified to the scheduling queue before
// we remove this pod from the preempting map,
// and the pod could end up stuck in the unschedulable pod pool
// because all of the pod removal events would be ignored.
ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName)
if err := errCh.ReceiveError(); err != nil {
logger.Error(err, "Error occurred during async preemption")
result = metrics.GoroutineResultError
}
ev.mu.Lock()
delete(ev.preempting, pod.UID)
ev.mu.Unlock()
if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil {
logger.Error(err, "Error occurred during async preemption")
result = metrics.GoroutineResultError
}
logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name(), "result", result)
}()
}
func getPodDisruptionBudgets(pdbLister policylisters.PodDisruptionBudgetLister) ([]*policy.PodDisruptionBudget, error) {
if pdbLister != nil {
return pdbLister.List(labels.Everything())
}
return nil, nil
}
// pickOneNodeForPreemption chooses one node among the given nodes.
// It assumes pods in each map entry are ordered by decreasing priority.
// If scoreFuncs is not empty, it picks a node based on the score scoreFuncs returns.
// If scoreFuncs is empty,
// It picks a node based on the following criteria:
// 1. A node with minimum number of PDB violations.
// 2. A node with minimum highest priority victim is picked.
// 3. Ties are broken by sum of priorities of all victims.
// 4. If there are still ties, node with the minimum number of victims is picked.
// 5. If there are still ties, node with the latest start time of all highest priority victims is picked.
// 6. If there are still ties, the first such node is picked (sort of randomly).
// Candidates are narrowed round by round: each score function keeps only the nodes with the
// highest score, and the next function runs only while a tie remains.
func pickOneNodeForPreemption(logger klog.Logger, nodesToVictims map[string]*extenderv1.Victims, scoreFuncs []func(node string) int64) string {
if len(nodesToVictims) == 0 {
return ""
}
allCandidates := make([]string, 0, len(nodesToVictims))
for node := range nodesToVictims {
allCandidates = append(allCandidates, node)
}
if len(scoreFuncs) == 0 {
minNumPDBViolatingScoreFunc := func(node string) int64 {
// The smaller the NumPDBViolations, the higher the score.
return -nodesToVictims[node].NumPDBViolations
}
minHighestPriorityScoreFunc := func(node string) int64 {
// highestPodPriority is the highest priority among the victims on this node.
highestPodPriority := corev1helpers.PodPriority(nodesToVictims[node].Pods[0])
// The smaller the highestPodPriority, the higher the score.
return -int64(highestPodPriority)
}
minSumPrioritiesScoreFunc := func(node string) int64 {
var sumPriorities int64
for _, pod := range nodesToVictims[node].Pods {
// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
// needed so that a node with a few pods with negative priority is not
// picked over a node with a smaller number of pods with the same negative
// priority (and similar scenarios).
sumPriorities += int64(corev1helpers.PodPriority(pod)) + int64(math.MaxInt32+1)
}
// The smaller the sumPriorities, the higher the score.
return -sumPriorities
}
minNumPodsScoreFunc := func(node string) int64 {
// The smaller the length of pods, the higher the score.
return -int64(len(nodesToVictims[node].Pods))
}
latestStartTimeScoreFunc := func(node string) int64 {
// Get the earliest start time of all pods on the current node.
earliestStartTimeOnNode := util.GetEarliestPodStartTime(nodesToVictims[node])
if earliestStartTimeOnNode == nil {
logger.Error(errors.New("earliestStartTime is nil for node"), "Should not reach here", "node", node)
return int64(math.MinInt64)
}
// The bigger the earliestStartTimeOnNode, the higher the score.
return earliestStartTimeOnNode.UnixNano()
}
// Each scoreFunc scores the nodes according to specific rules and keeps the name of the node
// with the highest score. If and only if the scoreFunc has more than one node with the highest
// score, we will execute the other scoreFunc in order of precedence.
scoreFuncs = []func(string) int64{
// A node with a minimum number of PDB violations is preferable.
minNumPDBViolatingScoreFunc,
// A node with a minimum highest priority victim is preferable.
minHighestPriorityScoreFunc,
// A node with the smallest sum of priorities is preferable.
minSumPrioritiesScoreFunc,
// A node with the minimum number of pods is preferable.
minNumPodsScoreFunc,
// A node with the latest start time of all highest priority victims is preferable.
latestStartTimeScoreFunc,
// If there are still ties, then the first Node in the list is selected.
}
}
for _, f := range scoreFuncs {
selectedNodes := []string{}
maxScore := int64(math.MinInt64)
for _, node := range allCandidates {
score := f(node)
if score > maxScore {
maxScore = score
selectedNodes = []string{}
}
if score == maxScore {
selectedNodes = append(selectedNodes, node)
}
}
if len(selectedNodes) == 1 {
return selectedNodes[0]
}
allCandidates = selectedNodes
}
return allCandidates[0]
}
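// Illustrative sketch (not part of the upstream file): a worked example of the
// tie-breaking loop above. With three nodes and two score functions
//   f1: a=0, b=0, c=-1  (c is filtered out, a and b tie)
//   f2: a=-2, b=-1      (b wins)
// the loop returns "b". The generic helper below mirrors the same logic; the name
// selectByScores is hypothetical.
func selectByScores(candidates []string, scoreFuncs []func(string) int64) string {
	for _, f := range scoreFuncs {
		best := []string{}
		maxScore := int64(math.MinInt64)
		for _, c := range candidates {
			switch s := f(c); {
			case s > maxScore:
				maxScore, best = s, []string{c}
			case s == maxScore:
				best = append(best, c)
			}
		}
		if len(best) == 1 {
			return best[0]
		}
		candidates = best
	}
	if len(candidates) == 0 {
		return ""
	}
	return candidates[0]
}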
// getLowerPriorityNominatedPods returns pods whose priority is smaller than the
// priority of the given "pod" and are nominated to run on the given node.
// Note: We could possibly check if the nominated lower priority pods still fit
// and return those that no longer fit, but that would require lots of
// manipulation of NodeInfo and PreFilter state per nominated pod. It may not be
// worth the complexity, especially because we generally expect to have a very
// small number of nominated pods per node.
func getLowerPriorityNominatedPods(logger klog.Logger, pn framework.PodNominator, pod *v1.Pod, nodeName string) []*v1.Pod {
podInfos := pn.NominatedPodsForNode(nodeName)
if len(podInfos) == 0 {
return nil
}
var lowerPriorityPods []*v1.Pod
podPriority := corev1helpers.PodPriority(pod)
for _, pi := range podInfos {
if corev1helpers.PodPriority(pi.Pod) < podPriority {
lowerPriorityPods = append(lowerPriorityPods, pi.Pod)
}
}
return lowerPriorityPods
}
// DryRunPreemption simulates the preemption logic on <potentialNodes> in parallel, and
// returns preemption candidates and a map indicating the filtered nodes' statuses.
// The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
// candidates, ones that do not violate PDB are preferred over ones that do.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) DryRunPreemption(ctx context.Context, state *framework.CycleState, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
pdbs []*policy.PodDisruptionBudget, offset int32, candidatesNum int32) ([]Candidate, *framework.NodeToStatus, error) {
fh := ev.Handler
nonViolatingCandidates := newCandidateList(candidatesNum)
violatingCandidates := newCandidateList(candidatesNum)
ctx, cancel := context.WithCancel(ctx)
defer cancel()
nodeStatuses := framework.NewDefaultNodeToStatus()
logger := klog.FromContext(ctx)
logger.V(5).Info("Dry run the preemption", "potentialNodesNumber", len(potentialNodes), "pdbsNumber", len(pdbs), "offset", offset, "candidatesNumber", candidatesNum)
var statusesLock sync.Mutex
var errs []error
checkNode := func(i int) {
nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Snapshot()
logger.V(5).Info("Check the potential node for preemption", "node", nodeInfoCopy.Node().Name)
stateCopy := state.Clone()
pods, numPDBViolations, status := ev.SelectVictimsOnNode(ctx, stateCopy, pod, nodeInfoCopy, pdbs)
if status.IsSuccess() && len(pods) != 0 {
victims := extenderv1.Victims{
Pods: pods,
NumPDBViolations: int64(numPDBViolations),
}
c := &candidate{
victims: &victims,
name: nodeInfoCopy.Node().Name,
}
if numPDBViolations == 0 {
nonViolatingCandidates.add(c)
} else {
violatingCandidates.add(c)
}
nvcSize, vcSize := nonViolatingCandidates.size(), violatingCandidates.size()
if nvcSize > 0 && nvcSize+vcSize >= candidatesNum {
cancel()
}
return
}
if status.IsSuccess() && len(pods) == 0 {
status = framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeInfoCopy.Node().Name))
}
statusesLock.Lock()
if status.Code() == framework.Error {
errs = append(errs, status.AsError())
}
nodeStatuses.Set(nodeInfoCopy.Node().Name, status)
statusesLock.Unlock()
}
fh.Parallelizer().Until(ctx, len(potentialNodes), checkNode, ev.PluginName)
return append(nonViolatingCandidates.get(), violatingCandidates.get()...), nodeStatuses, utilerrors.NewAggregate(errs)
}

File diff suppressed because it is too large

View File

@ -0,0 +1,83 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"context"
v1 "k8s.io/api/core/v1"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
type instrumentedFilterPlugin struct {
framework.FilterPlugin
metric compbasemetrics.CounterMetric
}
var _ framework.FilterPlugin = &instrumentedFilterPlugin{}
func (p *instrumentedFilterPlugin) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
p.metric.Inc()
return p.FilterPlugin.Filter(ctx, state, pod, nodeInfo)
}
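// Illustrative sketch (not part of the upstream file): each wrapper is a plain decorator,
// so constructing one is just embedding the real plugin next to a counter. The function
// name wrapFilterWithMetric is hypothetical; the framework performs this wrapping itself
// when profiles are built.
func wrapFilterWithMetric(p framework.FilterPlugin, m compbasemetrics.CounterMetric) framework.FilterPlugin {
	return &instrumentedFilterPlugin{FilterPlugin: p, metric: m}
}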
type instrumentedPreFilterPlugin struct {
framework.PreFilterPlugin
metric compbasemetrics.CounterMetric
}
var _ framework.PreFilterPlugin = &instrumentedPreFilterPlugin{}
func (p *instrumentedPreFilterPlugin) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
result, status := p.PreFilterPlugin.PreFilter(ctx, state, pod)
if !status.IsSkip() {
p.metric.Inc()
}
return result, status
}
type instrumentedPreScorePlugin struct {
framework.PreScorePlugin
metric compbasemetrics.CounterMetric
}
var _ framework.PreScorePlugin = &instrumentedPreScorePlugin{}
func (p *instrumentedPreScorePlugin) PreScore(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
status := p.PreScorePlugin.PreScore(ctx, state, pod, nodes)
if !status.IsSkip() {
p.metric.Inc()
}
return status
}
type instrumentedScorePlugin struct {
framework.ScorePlugin
metric compbasemetrics.CounterMetric
}
var _ framework.ScorePlugin = &instrumentedScorePlugin{}
func (p *instrumentedScorePlugin) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
p.metric.Inc()
return p.ScorePlugin.Score(ctx, state, pod, nodeName)
}

View File

@ -0,0 +1,101 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/json"
"k8s.io/kubernetes/pkg/scheduler/framework"
plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"sigs.k8s.io/yaml"
)
// PluginFactory is a function that builds a plugin.
type PluginFactory = func(ctx context.Context, configuration runtime.Object, f framework.Handle) (framework.Plugin, error)
// PluginFactoryWithFts is a function that builds a plugin with certain feature gates.
type PluginFactoryWithFts func(context.Context, runtime.Object, framework.Handle, plfeature.Features) (framework.Plugin, error)
// FactoryAdapter can be used to inject feature gates for a plugin that needs
// them when the caller expects the older PluginFactory method.
func FactoryAdapter(fts plfeature.Features, withFts PluginFactoryWithFts) PluginFactory {
return func(ctx context.Context, plArgs runtime.Object, fh framework.Handle) (framework.Plugin, error) {
return withFts(ctx, plArgs, fh, fts)
}
}
// DecodeInto decodes configuration whose type is *runtime.Unknown into the provided into value.
func DecodeInto(obj runtime.Object, into interface{}) error {
if obj == nil {
return nil
}
configuration, ok := obj.(*runtime.Unknown)
if !ok {
return fmt.Errorf("want args of type runtime.Unknown, got %T", obj)
}
if configuration.Raw == nil {
return nil
}
switch configuration.ContentType {
// If ContentType is empty, it means ContentTypeJSON by default.
case runtime.ContentTypeJSON, "":
return json.Unmarshal(configuration.Raw, into)
case runtime.ContentTypeYAML:
return yaml.Unmarshal(configuration.Raw, into)
default:
return fmt.Errorf("not supported content type %s", configuration.ContentType)
}
}
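// Illustrative sketch (not part of the upstream file): how a plugin factory typically
// uses DecodeInto to read its arguments. myPluginArgs and exampleDecode are hypothetical;
// real plugins define their own args types.
type myPluginArgs struct {
	HardLimit int `json:"hardLimit"`
}

func exampleDecode(obj runtime.Object) (myPluginArgs, error) {
	// Defaults survive when obj is nil or the field is absent from the raw config.
	args := myPluginArgs{HardLimit: 10}
	if err := DecodeInto(obj, &args); err != nil {
		return myPluginArgs{}, err
	}
	return args, nil
}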
// Registry is a collection of all available plugins. The framework uses a
// registry to enable and initialize configured plugins.
// All plugins must be in the registry before initializing the framework.
type Registry map[string]PluginFactory
// Register adds a new plugin to the registry. If a plugin with the same name
// exists, it returns an error.
func (r Registry) Register(name string, factory PluginFactory) error {
if _, ok := r[name]; ok {
return fmt.Errorf("a plugin named %v already exists", name)
}
r[name] = factory
return nil
}
// Unregister removes an existing plugin from the registry. If no plugin with
// the provided name exists, it returns an error.
func (r Registry) Unregister(name string) error {
if _, ok := r[name]; !ok {
return fmt.Errorf("no plugin named %v exists", name)
}
delete(r, name)
return nil
}
// Merge merges the provided registry to the current one.
func (r Registry) Merge(in Registry) error {
for name, factory := range in {
if err := r.Register(name, factory); err != nil {
return err
}
}
return nil
}
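// Illustrative sketch (not part of the upstream file): typical out-of-tree usage of
// Registry with FactoryAdapter. The plugin name "MyPlugin" and the myPluginNew
// constructor are hypothetical.
func exampleRegistry(fts plfeature.Features, myPluginNew PluginFactoryWithFts) (Registry, error) {
	r := Registry{}
	if err := r.Register("MyPlugin", FactoryAdapter(fts, myPluginNew)); err != nil {
		return nil, err
	}
	// Merging re-registers every entry, so duplicate names surface as errors here too.
	if err := r.Merge(Registry{}); err != nil {
		return nil, err
	}
	return r, nil
}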

View File

@ -0,0 +1,165 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// waitingPodsMap is a thread-safe map used to maintain pods waiting in the permit phase.
type waitingPodsMap struct {
pods map[types.UID]*waitingPod
mu sync.RWMutex
}
// NewWaitingPodsMap returns a new waitingPodsMap.
func NewWaitingPodsMap() *waitingPodsMap {
return &waitingPodsMap{
pods: make(map[types.UID]*waitingPod),
}
}
// add a new WaitingPod to the map.
func (m *waitingPodsMap) add(wp *waitingPod) {
m.mu.Lock()
defer m.mu.Unlock()
m.pods[wp.GetPod().UID] = wp
}
// remove a WaitingPod from the map.
func (m *waitingPodsMap) remove(uid types.UID) {
m.mu.Lock()
defer m.mu.Unlock()
delete(m.pods, uid)
}
// get a WaitingPod from the map.
func (m *waitingPodsMap) get(uid types.UID) *waitingPod {
m.mu.RLock()
defer m.mu.RUnlock()
return m.pods[uid]
}
// iterate acquires a read lock and iterates over the WaitingPods map.
func (m *waitingPodsMap) iterate(callback func(framework.WaitingPod)) {
m.mu.RLock()
defer m.mu.RUnlock()
for _, v := range m.pods {
callback(v)
}
}
// waitingPod represents a pod waiting in the permit phase.
type waitingPod struct {
pod *v1.Pod
pendingPlugins map[string]*time.Timer
s chan *framework.Status
mu sync.RWMutex
}
var _ framework.WaitingPod = &waitingPod{}
// newWaitingPod returns a new waitingPod instance.
func newWaitingPod(pod *v1.Pod, pluginsMaxWaitTime map[string]time.Duration) *waitingPod {
wp := &waitingPod{
pod: pod,
// Allow() and Reject() calls are non-blocking. This property is guaranteed
// by using non-blocking send to this channel. This channel has a buffer of size 1
// to ensure that non-blocking send will not be ignored - possible situation when
// receiving from this channel happens after non-blocking send.
s: make(chan *framework.Status, 1),
}
wp.pendingPlugins = make(map[string]*time.Timer, len(pluginsMaxWaitTime))
// The time.AfterFunc calls wp.Reject which iterates through pendingPlugins map. Acquire the
// lock here so that time.AfterFunc can only execute after newWaitingPod finishes.
wp.mu.Lock()
defer wp.mu.Unlock()
for k, v := range pluginsMaxWaitTime {
plugin, waitTime := k, v
wp.pendingPlugins[plugin] = time.AfterFunc(waitTime, func() {
msg := fmt.Sprintf("rejected due to timeout after waiting %v at plugin %v",
waitTime, plugin)
wp.Reject(plugin, msg)
})
}
return wp
}
// GetPod returns a reference to the waiting pod.
func (w *waitingPod) GetPod() *v1.Pod {
return w.pod
}
// GetPendingPlugins returns the names of the pending permit plugins.
func (w *waitingPod) GetPendingPlugins() []string {
w.mu.RLock()
defer w.mu.RUnlock()
plugins := make([]string, 0, len(w.pendingPlugins))
for p := range w.pendingPlugins {
plugins = append(plugins, p)
}
return plugins
}
// Allow declares the waiting pod is allowed to be scheduled by plugin pluginName.
// If this is the last remaining plugin to allow, then a success signal is delivered
// to unblock the pod.
func (w *waitingPod) Allow(pluginName string) {
w.mu.Lock()
defer w.mu.Unlock()
if timer, exist := w.pendingPlugins[pluginName]; exist {
timer.Stop()
delete(w.pendingPlugins, pluginName)
}
// Only signal success status after all plugins have allowed
if len(w.pendingPlugins) != 0 {
return
}
// The select clause works as a non-blocking send.
// If there is no receiver, it's a no-op (default case).
select {
case w.s <- framework.NewStatus(framework.Success, ""):
default:
}
}
// Reject declares the waiting pod unschedulable.
func (w *waitingPod) Reject(pluginName, msg string) {
w.mu.RLock()
defer w.mu.RUnlock()
for _, timer := range w.pendingPlugins {
timer.Stop()
}
// The select clause works as a non-blocking send.
// If there is no receiver, it's a no-op (default case).
select {
case w.s <- framework.NewStatus(framework.Unschedulable, msg).WithPlugin(pluginName):
default:
}
}
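// Illustrative sketch (not part of the upstream file): the size-1 buffer makes Allow and
// Reject non-blocking even before the permit phase starts receiving, and the first
// resolution wins. exampleWaitingPod is a hypothetical test-style helper.
func exampleWaitingPod(pod *v1.Pod) *framework.Status {
	wp := newWaitingPod(pod, map[string]time.Duration{"PluginA": time.Minute, "PluginB": time.Minute})
	wp.Reject("PluginA", "example rejection") // buffered send succeeds immediately
	wp.Allow("PluginB")                       // PluginA is still pending, so no success status is sent
	return <-wp.s                             // the permit phase receives the Unschedulable status
}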

vendor/k8s.io/kubernetes/pkg/scheduler/framework/types.go (generated, vendored, 1302 lines)

File diff suppressed because it is too large