Mirror of https://github.com/ceph/ceph-csi.git, synced 2025-06-14 18:53:35 +00:00
rebase: update K8s packages to v0.32.1
Update K8s packages in go.mod to v0.32.1
Signed-off-by: Praveen M <m.praveen@ibm.com>
vendor/k8s.io/kubernetes/pkg/scheduler/framework/cycle_state.go (generated, vendored, new file, 123 lines)
@@ -0,0 +1,123 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
	"errors"
	"sync"

	"k8s.io/apimachinery/pkg/util/sets"
)

var (
	// ErrNotFound is the not found error message.
	ErrNotFound = errors.New("not found")
)

// StateData is a generic type for arbitrary data stored in CycleState.
type StateData interface {
	// Clone is an interface to make a copy of StateData. For performance reasons,
	// clone should make shallow copies for members (e.g., slices or maps) that are not
	// impacted by PreFilter's optional AddPod/RemovePod methods.
	Clone() StateData
}

// StateKey is the type of keys stored in CycleState.
type StateKey string

// CycleState provides a mechanism for plugins to store and retrieve arbitrary data.
// StateData stored by one plugin can be read, altered, or deleted by another plugin.
// CycleState does not provide any data protection, as all plugins are assumed to be
// trusted.
// Note: CycleState uses a sync.Map to back the storage, because it is thread safe. It's aimed to optimize for the "write once and read many times" scenarios.
// It is the recommended pattern used in all in-tree plugins - plugin-specific state is written once in PreFilter/PreScore and afterward read many times in Filter/Score.
type CycleState struct {
	// storage is keyed with StateKey, and valued with StateData.
	storage sync.Map
	// if recordPluginMetrics is true, metrics.PluginExecutionDuration will be recorded for this cycle.
	recordPluginMetrics bool
	// SkipFilterPlugins are plugins that will be skipped in the Filter extension point.
	SkipFilterPlugins sets.Set[string]
	// SkipScorePlugins are plugins that will be skipped in the Score extension point.
	SkipScorePlugins sets.Set[string]
}

// NewCycleState initializes a new CycleState and returns its pointer.
func NewCycleState() *CycleState {
	return &CycleState{}
}

// ShouldRecordPluginMetrics returns whether metrics.PluginExecutionDuration metrics should be recorded.
func (c *CycleState) ShouldRecordPluginMetrics() bool {
	if c == nil {
		return false
	}
	return c.recordPluginMetrics
}

// SetRecordPluginMetrics sets recordPluginMetrics to the given value.
func (c *CycleState) SetRecordPluginMetrics(flag bool) {
	if c == nil {
		return
	}
	c.recordPluginMetrics = flag
}

// Clone creates a copy of CycleState and returns its pointer. Clone returns
// nil if the context being cloned is nil.
func (c *CycleState) Clone() *CycleState {
	if c == nil {
		return nil
	}
	copy := NewCycleState()
	// Safe copy storage in case of overwriting.
	c.storage.Range(func(k, v interface{}) bool {
		copy.storage.Store(k, v.(StateData).Clone())
		return true
	})
	// The below are not mutated, so we don't have to safe copy.
	copy.recordPluginMetrics = c.recordPluginMetrics
	copy.SkipFilterPlugins = c.SkipFilterPlugins
	copy.SkipScorePlugins = c.SkipScorePlugins

	return copy
}

// Read retrieves data with the given "key" from CycleState. If the key is not
// present, ErrNotFound is returned.
//
// See CycleState for notes on concurrency.
func (c *CycleState) Read(key StateKey) (StateData, error) {
	if v, ok := c.storage.Load(key); ok {
		return v.(StateData), nil
	}
	return nil, ErrNotFound
}

// Write stores the given "val" in CycleState with the given "key".
//
// See CycleState for notes on concurrency.
func (c *CycleState) Write(key StateKey, val StateData) {
	c.storage.Store(key, val)
}

// Delete deletes data with the given key from CycleState.
//
// See CycleState for notes on concurrency.
func (c *CycleState) Delete(key StateKey) {
	c.storage.Delete(key)
}
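Editor's note: a minimal sketch (not part of the vendored file or of ceph-csi) of how a hypothetical plugin could use the CycleState API above; the names exampleState and exampleStateKey are invented for illustration.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// exampleState is a hypothetical StateData; Clone may copy shallowly because
// the value is never mutated after it is written in PreFilter.
type exampleState struct {
	matchingNodes int
}

func (s *exampleState) Clone() framework.StateData {
	return &exampleState{matchingNodes: s.matchingNodes}
}

const exampleStateKey framework.StateKey = "example.com/matching-nodes"

func main() {
	cs := framework.NewCycleState()
	// Typically written once in PreFilter ...
	cs.Write(exampleStateKey, &exampleState{matchingNodes: 3})
	// ... and read many times in Filter/Score.
	if data, err := cs.Read(exampleStateKey); err == nil {
		fmt.Println(data.(*exampleState).matchingNodes)
	}
}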
vendor/k8s.io/kubernetes/pkg/scheduler/framework/events.go (generated, vendored, new file, 229 lines)
@@ -0,0 +1,229 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/equality"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/component-helpers/resource"
	"k8s.io/dynamic-resource-allocation/resourceclaim"
	"k8s.io/kubernetes/pkg/features"
)

// Special event labels.
const (
	// ScheduleAttemptFailure is the event when a schedule attempt fails.
	ScheduleAttemptFailure = "ScheduleAttemptFailure"
	// BackoffComplete is the event when a pod finishes backoff.
	BackoffComplete = "BackoffComplete"
	// ForceActivate is the event when a pod is moved from unschedulablePods/backoffQ
	// to activeQ. Usually it's triggered by plugin implementations.
	ForceActivate = "ForceActivate"
	// UnschedulableTimeout is the event when a pod is moved from unschedulablePods
	// due to the timeout specified at pod-max-in-unschedulable-pods-duration.
	UnschedulableTimeout = "UnschedulableTimeout"
)

var (
	// EventAssignedPodAdd is the event when an assigned pod is added.
	EventAssignedPodAdd = ClusterEvent{Resource: assignedPod, ActionType: Add}
	// EventAssignedPodUpdate is the event when an assigned pod is updated.
	EventAssignedPodUpdate = ClusterEvent{Resource: assignedPod, ActionType: Update}
	// EventAssignedPodDelete is the event when an assigned pod is deleted.
	EventAssignedPodDelete = ClusterEvent{Resource: assignedPod, ActionType: Delete}
	// EventUnscheduledPodAdd is the event when an unscheduled pod is added.
	EventUnscheduledPodAdd = ClusterEvent{Resource: unschedulablePod, ActionType: Add}
	// EventUnscheduledPodUpdate is the event when an unscheduled pod is updated.
	EventUnscheduledPodUpdate = ClusterEvent{Resource: unschedulablePod, ActionType: Update}
	// EventUnscheduledPodDelete is the event when an unscheduled pod is deleted.
	EventUnscheduledPodDelete = ClusterEvent{Resource: unschedulablePod, ActionType: Delete}
	// EventUnschedulableTimeout is the event when a pod stays in unschedulable for longer than timeout.
	EventUnschedulableTimeout = ClusterEvent{Resource: WildCard, ActionType: All, label: UnschedulableTimeout}
	// EventForceActivate is the event when a pod is moved from unschedulablePods/backoffQ to activeQ.
	EventForceActivate = ClusterEvent{Resource: WildCard, ActionType: All, label: ForceActivate}
)

// PodSchedulingPropertiesChange interprets the update of a pod and returns corresponding UpdatePodXYZ event(s).
// Once we have other pod update events, we should update here as well.
func PodSchedulingPropertiesChange(newPod *v1.Pod, oldPod *v1.Pod) (events []ClusterEvent) {
	r := assignedPod
	if newPod.Spec.NodeName == "" {
		r = unschedulablePod
	}

	podChangeExtracters := []podChangeExtractor{
		extractPodLabelsChange,
		extractPodScaleDown,
		extractPodSchedulingGateEliminatedChange,
		extractPodTolerationChange,
	}
	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
		podChangeExtracters = append(podChangeExtracters, extractPodGeneratedResourceClaimChange)
	}

	for _, fn := range podChangeExtracters {
		if event := fn(newPod, oldPod); event != none {
			events = append(events, ClusterEvent{Resource: r, ActionType: event})
		}
	}

	if len(events) == 0 {
		// When no specific event is found, we use AssignedPodOtherUpdate,
		// which should only trigger plugins registering a general Pod/Update event.
		events = append(events, ClusterEvent{Resource: r, ActionType: updatePodOther})
	}

	return
}

type podChangeExtractor func(newPod *v1.Pod, oldPod *v1.Pod) ActionType

// extractPodScaleDown interprets the update of a pod and returns PodRequestScaledDown event if any pod's resource request(s) is scaled down.
func extractPodScaleDown(newPod, oldPod *v1.Pod) ActionType {
	opt := resource.PodResourcesOptions{
		UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
	}
	newPodRequests := resource.PodRequests(newPod, opt)
	oldPodRequests := resource.PodRequests(oldPod, opt)

	for rName, oldReq := range oldPodRequests {
		newReq, ok := newPodRequests[rName]
		if !ok {
			// The resource request of rName is removed.
			return UpdatePodScaleDown
		}

		if oldReq.MilliValue() > newReq.MilliValue() {
			// The resource request of rName is scaled down.
			return UpdatePodScaleDown
		}
	}

	return none
}

func extractPodLabelsChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
	if isLabelChanged(newPod.GetLabels(), oldPod.GetLabels()) {
		return UpdatePodLabel
	}
	return none
}

func extractPodTolerationChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
	if len(newPod.Spec.Tolerations) != len(oldPod.Spec.Tolerations) {
		// A Pod got a new toleration.
		// Due to API validation, the user can add, but cannot modify or remove tolerations.
		// So, it's enough to just check the length of tolerations to notice the update.
		// And, any updates in tolerations could make Pod schedulable.
		return UpdatePodTolerations
	}

	return none
}

func extractPodSchedulingGateEliminatedChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
	if len(newPod.Spec.SchedulingGates) == 0 && len(oldPod.Spec.SchedulingGates) != 0 {
		// A scheduling gate on the pod is completely removed.
		return UpdatePodSchedulingGatesEliminated
	}

	return none
}

func extractPodGeneratedResourceClaimChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
	if !resourceclaim.PodStatusEqual(newPod.Status.ResourceClaimStatuses, oldPod.Status.ResourceClaimStatuses) {
		return UpdatePodGeneratedResourceClaim
	}

	return none
}

// NodeSchedulingPropertiesChange interprets the update of a node and returns corresponding UpdateNodeXYZ event(s).
func NodeSchedulingPropertiesChange(newNode *v1.Node, oldNode *v1.Node) (events []ClusterEvent) {
	nodeChangeExtracters := []nodeChangeExtractor{
		extractNodeSpecUnschedulableChange,
		extractNodeAllocatableChange,
		extractNodeLabelsChange,
		extractNodeTaintsChange,
		extractNodeConditionsChange,
		extractNodeAnnotationsChange,
	}

	for _, fn := range nodeChangeExtracters {
		if event := fn(newNode, oldNode); event != none {
			events = append(events, ClusterEvent{Resource: Node, ActionType: event})
		}
	}
	return
}

type nodeChangeExtractor func(newNode *v1.Node, oldNode *v1.Node) ActionType

func extractNodeAllocatableChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
	if !equality.Semantic.DeepEqual(oldNode.Status.Allocatable, newNode.Status.Allocatable) {
		return UpdateNodeAllocatable
	}
	return none
}

func extractNodeLabelsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
	if isLabelChanged(newNode.GetLabels(), oldNode.GetLabels()) {
		return UpdateNodeLabel
	}
	return none
}

func isLabelChanged(newLabels map[string]string, oldLabels map[string]string) bool {
	return !equality.Semantic.DeepEqual(newLabels, oldLabels)
}

func extractNodeTaintsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
	if !equality.Semantic.DeepEqual(newNode.Spec.Taints, oldNode.Spec.Taints) {
		return UpdateNodeTaint
	}
	return none
}

func extractNodeConditionsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
	strip := func(conditions []v1.NodeCondition) map[v1.NodeConditionType]v1.ConditionStatus {
		conditionStatuses := make(map[v1.NodeConditionType]v1.ConditionStatus, len(conditions))
		for i := range conditions {
			conditionStatuses[conditions[i].Type] = conditions[i].Status
		}
		return conditionStatuses
	}
	if !equality.Semantic.DeepEqual(strip(oldNode.Status.Conditions), strip(newNode.Status.Conditions)) {
		return UpdateNodeCondition
	}
	return none
}

func extractNodeSpecUnschedulableChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
	if newNode.Spec.Unschedulable != oldNode.Spec.Unschedulable && !newNode.Spec.Unschedulable {
		// TODO: create UpdateNodeSpecUnschedulable ActionType
		return UpdateNodeTaint
	}
	return none
}

func extractNodeAnnotationsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
	if !equality.Semantic.DeepEqual(oldNode.GetAnnotations(), newNode.GetAnnotations()) {
		return UpdateNodeAnnotation
	}
	return none
}
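Editor's note: a hedged sketch (not part of the vendored file) of how the helpers above behave. Given an old Pod and an updated copy that only changed its labels, PodSchedulingPropertiesChange reports the scheduling-relevant change so the queue knows which plugins may now succeed; the pod objects here are illustrative.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

func main() {
	oldPod := &v1.Pod{}
	newPod := oldPod.DeepCopy()
	newPod.Labels = map[string]string{"tier": "gold"} // only the labels changed

	// Each returned ClusterEvent carries the resource and the action type
	// (here: an unscheduled Pod with a label update).
	for _, ev := range framework.PodSchedulingPropertiesChange(newPod, oldPod) {
		fmt.Println(ev.Resource, ev.ActionType)
	}
}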
vendor/k8s.io/kubernetes/pkg/scheduler/framework/extender.go (generated, vendored, new file, 79 lines)
@@ -0,0 +1,79 @@
/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
	v1 "k8s.io/api/core/v1"
	extenderv1 "k8s.io/kube-scheduler/extender/v1"
)

// Extender is an interface for external processes to influence scheduling
// decisions made by Kubernetes. This is typically needed for resources not directly
// managed by Kubernetes.
type Extender interface {
	// Name returns a unique name that identifies the extender.
	Name() string

	// Filter based on extender-implemented predicate functions. The filtered list is
	// expected to be a subset of the supplied list.
	// The failedNodes and failedAndUnresolvableNodes optionally contains the list
	// of failed nodes and failure reasons, except nodes in the latter are
	// unresolvable.
	Filter(pod *v1.Pod, nodes []*NodeInfo) (filteredNodes []*NodeInfo, failedNodesMap extenderv1.FailedNodesMap, failedAndUnresolvable extenderv1.FailedNodesMap, err error)

	// Prioritize based on extender-implemented priority functions. The returned scores & weight
	// are used to compute the weighted score for an extender. The weighted scores are added to
	// the scores computed by Kubernetes scheduler. The total scores are used to do the host selection.
	Prioritize(pod *v1.Pod, nodes []*NodeInfo) (hostPriorities *extenderv1.HostPriorityList, weight int64, err error)

	// Bind delegates the action of binding a pod to a node to the extender.
	Bind(binding *v1.Binding) error

	// IsBinder returns whether this extender is configured for the Bind method.
	IsBinder() bool

	// IsInterested returns true if at least one extended resource requested by
	// this pod is managed by this extender.
	IsInterested(pod *v1.Pod) bool

	// IsPrioritizer returns whether this extender is configured for the Prioritize method.
	IsPrioritizer() bool

	// IsFilter returns whether this extender is configured for the Filter method.
	IsFilter() bool

	// ProcessPreemption returns nodes with their victim pods processed by extender based on
	// given:
	//   1. Pod to schedule
	//   2. Candidate nodes and victim pods (nodeNameToVictims) generated by previous scheduling process.
	// The possible changes made by extender may include:
	//   1. Subset of given candidate nodes after preemption phase of extender.
	//   2. A different set of victim pod for every given candidate node after preemption phase of extender.
	ProcessPreemption(
		pod *v1.Pod,
		nodeNameToVictims map[string]*extenderv1.Victims,
		nodeInfos NodeInfoLister,
	) (map[string]*extenderv1.Victims, error)

	// SupportsPreemption returns if the scheduler extender support preemption or not.
	SupportsPreemption() bool

	// IsIgnorable returns true indicates scheduling should not fail when this extender
	// is unavailable. This gives scheduler ability to fail fast and tolerate non-critical extenders as well.
	// Both Filter and Bind actions are supported.
	IsIgnorable() bool
}
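Editor's note: for orientation only, a sketch (not part of the vendored file) of how Filter-capable extenders might be consulted; the real scheduler also aggregates the failed-node maps and drives Prioritize/Bind, which are omitted here. The function name filterWithExtenders is invented.

package example

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// filterWithExtenders narrows the feasible node list by asking each
// Filter-capable, interested extender; errors from ignorable extenders are
// tolerated so a non-critical extender cannot block scheduling.
func filterWithExtenders(pod *v1.Pod, nodes []*framework.NodeInfo, extenders []framework.Extender) ([]*framework.NodeInfo, error) {
	feasible := nodes
	for _, ex := range extenders {
		if !ex.IsFilter() || !ex.IsInterested(pod) {
			continue
		}
		filtered, _, _, err := ex.Filter(pod, feasible)
		if err != nil {
			if ex.IsIgnorable() {
				continue // tolerate non-critical extenders
			}
			return nil, err
		}
		feasible = filtered
	}
	return feasible, nil
}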
vendor/k8s.io/kubernetes/pkg/scheduler/framework/interface.go (generated, vendored, new file, 954 lines)
@@ -0,0 +1,954 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This file defines the scheduling framework plugin interfaces.

package framework

import (
	"context"
	"errors"
	"math"
	"strings"
	"sync"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	restclient "k8s.io/client-go/rest"
	"k8s.io/client-go/tools/events"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
)

// NodeScoreList declares a list of nodes and their scores.
type NodeScoreList []NodeScore

// NodeScore is a struct with node name and score.
type NodeScore struct {
	Name  string
	Score int64
}

// NodeToStatusReader is a read-only interface of NodeToStatus passed to each PostFilter plugin.
type NodeToStatusReader interface {
	// Get returns the status for given nodeName.
	// If the node is not in the map, the AbsentNodesStatus is returned.
	Get(nodeName string) *Status
	// NodesForStatusCode returns a list of NodeInfos for the nodes that have a given status code.
	// It returns the NodeInfos for all matching nodes denoted by AbsentNodesStatus as well.
	NodesForStatusCode(nodeLister NodeInfoLister, code Code) ([]*NodeInfo, error)
}

// NodeToStatusMap is an alias for NodeToStatusReader to keep partial backwards compatibility.
// NodeToStatusReader should be used if possible.
type NodeToStatusMap = NodeToStatusReader

// NodeToStatus contains the statuses of the Nodes where the incoming Pod was not schedulable.
type NodeToStatus struct {
	// nodeToStatus contains specific statuses of the nodes.
	nodeToStatus map[string]*Status
	// absentNodesStatus defines a status for all nodes that are absent in nodeToStatus map.
	// By default, all absent nodes are UnschedulableAndUnresolvable.
	absentNodesStatus *Status
}

// NewDefaultNodeToStatus creates NodeToStatus without any node in the map.
// The absentNodesStatus is set by default to UnschedulableAndUnresolvable.
func NewDefaultNodeToStatus() *NodeToStatus {
	return NewNodeToStatus(make(map[string]*Status), NewStatus(UnschedulableAndUnresolvable))
}

// NewNodeToStatus creates NodeToStatus initialized with given nodeToStatus and absentNodesStatus.
func NewNodeToStatus(nodeToStatus map[string]*Status, absentNodesStatus *Status) *NodeToStatus {
	return &NodeToStatus{
		nodeToStatus:      nodeToStatus,
		absentNodesStatus: absentNodesStatus,
	}
}

// Get returns the status for given nodeName. If the node is not in the map, the absentNodesStatus is returned.
func (m *NodeToStatus) Get(nodeName string) *Status {
	if status, ok := m.nodeToStatus[nodeName]; ok {
		return status
	}
	return m.absentNodesStatus
}

// Set sets status for given nodeName.
func (m *NodeToStatus) Set(nodeName string, status *Status) {
	m.nodeToStatus[nodeName] = status
}

// Len returns length of nodeToStatus map. It is not aware of number of absent nodes.
func (m *NodeToStatus) Len() int {
	return len(m.nodeToStatus)
}

// AbsentNodesStatus returns absentNodesStatus value.
func (m *NodeToStatus) AbsentNodesStatus() *Status {
	return m.absentNodesStatus
}

// SetAbsentNodesStatus sets absentNodesStatus value.
func (m *NodeToStatus) SetAbsentNodesStatus(status *Status) {
	m.absentNodesStatus = status
}

// ForEachExplicitNode runs fn for each node which status is explicitly set.
// Important note: it runs the fn only for nodes with a status explicitly registered,
// and hence may not run the fn for all existing nodes.
// For example, if PreFilter rejects all Nodes, the scheduler would NOT set a failure status to every Node,
// but set a failure status as AbsentNodesStatus.
// You're supposed to get a status from AbsentNodesStatus(), and consider all other nodes that are rejected by them.
func (m *NodeToStatus) ForEachExplicitNode(fn func(nodeName string, status *Status)) {
	for nodeName, status := range m.nodeToStatus {
		fn(nodeName, status)
	}
}

// NodesForStatusCode returns a list of NodeInfos for the nodes that matches a given status code.
// If the absentNodesStatus matches the code, all existing nodes are fetched using nodeLister
// and filtered using NodeToStatus.Get.
// If the absentNodesStatus doesn't match the code, nodeToStatus map is used to create a list of nodes
// and nodeLister.Get is used to obtain NodeInfo for each.
func (m *NodeToStatus) NodesForStatusCode(nodeLister NodeInfoLister, code Code) ([]*NodeInfo, error) {
	var resultNodes []*NodeInfo

	if m.AbsentNodesStatus().Code() == code {
		allNodes, err := nodeLister.List()
		if err != nil {
			return nil, err
		}
		if m.Len() == 0 {
			// All nodes are absent and status code is matching, so can return all nodes.
			return allNodes, nil
		}
		// Need to find all the nodes that are absent or have a matching code using the allNodes.
		for _, node := range allNodes {
			nodeName := node.Node().Name
			if status := m.Get(nodeName); status.Code() == code {
				resultNodes = append(resultNodes, node)
			}
		}
		return resultNodes, nil
	}

	m.ForEachExplicitNode(func(nodeName string, status *Status) {
		if status.Code() == code {
			if nodeInfo, err := nodeLister.Get(nodeName); err == nil {
				resultNodes = append(resultNodes, nodeInfo)
			}
		}
	})

	return resultNodes, nil
}

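// NOTE (editor's addition, not part of the vendored file): a small sketch of
// how NodeToStatus is typically consumed by a PostFilter/preemption plugin,
// assuming some nodeLister value that implements NodeInfoLister:
//
//	statuses := NewDefaultNodeToStatus()
//	statuses.Set("node-a", NewStatus(Unschedulable, "not enough cpu"))
//	// Any node never Set() explicitly inherits AbsentNodesStatus()
//	// (UnschedulableAndUnresolvable by default).
//	candidates, err := statuses.NodesForStatusCode(nodeLister, Unschedulable)
//	// candidates now holds only the NodeInfos whose status code is Unschedulable.
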
// NodePluginScores is a struct with node name and scores for that node.
type NodePluginScores struct {
	// Name is node name.
	Name string
	// Scores is scores from plugins and extenders.
	Scores []PluginScore
	// TotalScore is the total score in Scores.
	TotalScore int64
}

// PluginScore is a struct with plugin/extender name and score.
type PluginScore struct {
	// Name is the name of plugin or extender.
	Name  string
	Score int64
}

// Code is the Status code/type which is returned from plugins.
type Code int

// These are predefined codes used in a Status.
// Note: when you add a new status, you have to add it in `codes` slice below.
const (
	// Success means that plugin ran correctly and found pod schedulable.
	// NOTE: A nil status is also considered as "Success".
	Success Code = iota
	// Error is one of the failures, used for internal plugin errors, unexpected input, etc.
	// Plugin shouldn't return this code for expected failures, like Unschedulable.
	// Since it's the unexpected failure, the scheduling queue registers the pod without unschedulable plugins.
	// Meaning, the Pod will be requeued to activeQ/backoffQ soon.
	Error
	// Unschedulable is one of the failures, used when a plugin finds a pod unschedulable.
	// If it's returned from PreFilter or Filter, the scheduler might attempt to
	// run other postFilter plugins like preemption to get this pod scheduled.
	// Use UnschedulableAndUnresolvable to make the scheduler skipping other postFilter plugins.
	// The accompanying status message should explain why the pod is unschedulable.
	//
	// We regard the backoff as a penalty of wasting the scheduling cycle.
	// When the scheduling queue requeues Pods, which was rejected with Unschedulable in the last scheduling,
	// the Pod goes through backoff.
	Unschedulable
	// UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and
	// other postFilter plugins like preemption would not change anything.
	// See the comment on PostFilter interface for more details about how PostFilter should handle this status.
	// Plugins should return Unschedulable if it is possible that the pod can get scheduled
	// after running other postFilter plugins.
	// The accompanying status message should explain why the pod is unschedulable.
	//
	// We regard the backoff as a penalty of wasting the scheduling cycle.
	// When the scheduling queue requeues Pods, which was rejected with UnschedulableAndUnresolvable in the last scheduling,
	// the Pod goes through backoff.
	UnschedulableAndUnresolvable
	// Wait is used when a Permit plugin finds a pod scheduling should wait.
	Wait
	// Skip is used in the following scenarios:
	// - when a Bind plugin chooses to skip binding.
	// - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped.
	// - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped.
	Skip
	// Pending means that the scheduling process is finished successfully,
	// but the plugin wants to stop the scheduling cycle/binding cycle here.
	//
	// For example, the DRA plugin sometimes needs to wait for the external device driver
	// to provision the resource for the Pod.
	// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
	// because in this case, the scheduler decides where the Pod can go successfully,
	// but we need to wait for the external component to do something based on that scheduling result.
	//
	// We regard the backoff as a penalty of wasting the scheduling cycle.
	// In the case of returning Pending, we cannot say the scheduling cycle is wasted
	// because the scheduling result is used to proceed the Pod's scheduling forward,
	// that particular scheduling cycle is failed though.
	// So, Pods rejected by such reasons don't need to suffer a penalty (backoff).
	// When the scheduling queue requeues Pods, which was rejected with Pending in the last scheduling,
	// the Pod goes to activeQ directly ignoring backoff.
	Pending
)

// This list should be exactly the same as the codes iota defined above in the same order.
var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip", "Pending"}

func (c Code) String() string {
	return codes[c]
}

const (
	// MaxNodeScore is the maximum score a Score plugin is expected to return.
	MaxNodeScore int64 = 100

	// MinNodeScore is the minimum score a Score plugin is expected to return.
	MinNodeScore int64 = 0

	// MaxTotalScore is the maximum total score.
	MaxTotalScore int64 = math.MaxInt64
)

// PodsToActivateKey is a reserved state key for stashing pods.
// If the stashed pods are present in unschedulablePods or backoffQ, they will be
// activated (i.e., moved to activeQ) in two phases:
// - end of a scheduling cycle if it succeeds (will be cleared from `PodsToActivate` if activated)
// - end of a binding cycle if it succeeds
var PodsToActivateKey StateKey = "kubernetes.io/pods-to-activate"

// PodsToActivate stores pods to be activated.
type PodsToActivate struct {
	sync.Mutex
	// Map is keyed with namespaced pod name, and valued with the pod.
	Map map[string]*v1.Pod
}

// Clone just returns the same state.
func (s *PodsToActivate) Clone() StateData {
	return s
}

// NewPodsToActivate instantiates a PodsToActivate object.
func NewPodsToActivate() *PodsToActivate {
	return &PodsToActivate{Map: make(map[string]*v1.Pod)}
}

// Status indicates the result of running a plugin. It consists of a code, a
// message, (optionally) an error, and a plugin name it fails by.
// When the status code is not Success, the reasons should explain why.
// And, when code is Success, all the other fields should be empty.
// NOTE: A nil Status is also considered as Success.
type Status struct {
	code    Code
	reasons []string
	err     error
	// plugin is an optional field that records the plugin name causes this status.
	// It's set by the framework when code is Unschedulable, UnschedulableAndUnresolvable or Pending.
	plugin string
}

func (s *Status) WithError(err error) *Status {
	s.err = err
	return s
}

// Code returns code of the Status.
func (s *Status) Code() Code {
	if s == nil {
		return Success
	}
	return s.code
}

// Message returns a concatenated message on reasons of the Status.
func (s *Status) Message() string {
	if s == nil {
		return ""
	}
	return strings.Join(s.Reasons(), ", ")
}

// SetPlugin sets the given plugin name to s.plugin.
func (s *Status) SetPlugin(plugin string) {
	s.plugin = plugin
}

// WithPlugin sets the given plugin name to s.plugin,
// and returns the given status object.
func (s *Status) WithPlugin(plugin string) *Status {
	s.SetPlugin(plugin)
	return s
}

// Plugin returns the plugin name which caused this status.
func (s *Status) Plugin() string {
	return s.plugin
}

// Reasons returns reasons of the Status.
func (s *Status) Reasons() []string {
	if s.err != nil {
		return append([]string{s.err.Error()}, s.reasons...)
	}
	return s.reasons
}

// AppendReason appends given reason to the Status.
func (s *Status) AppendReason(reason string) {
	s.reasons = append(s.reasons, reason)
}

// IsSuccess returns true if and only if "Status" is nil or Code is "Success".
func (s *Status) IsSuccess() bool {
	return s.Code() == Success
}

// IsWait returns true if and only if "Status" is non-nil and its Code is "Wait".
func (s *Status) IsWait() bool {
	return s.Code() == Wait
}

// IsSkip returns true if and only if "Status" is non-nil and its Code is "Skip".
func (s *Status) IsSkip() bool {
	return s.Code() == Skip
}

// IsRejected returns true if "Status" is Unschedulable (Unschedulable, UnschedulableAndUnresolvable, or Pending).
func (s *Status) IsRejected() bool {
	code := s.Code()
	return code == Unschedulable || code == UnschedulableAndUnresolvable || code == Pending
}

// AsError returns nil if the status is a success, a wait or a skip; otherwise returns an "error" object
// with a concatenated message on reasons of the Status.
func (s *Status) AsError() error {
	if s.IsSuccess() || s.IsWait() || s.IsSkip() {
		return nil
	}
	if s.err != nil {
		return s.err
	}
	return errors.New(s.Message())
}

// Equal checks equality of two statuses. This is useful for testing with
// cmp.Equal.
func (s *Status) Equal(x *Status) bool {
	if s == nil || x == nil {
		return s.IsSuccess() && x.IsSuccess()
	}
	if s.code != x.code {
		return false
	}
	if !cmp.Equal(s.err, x.err, cmpopts.EquateErrors()) {
		return false
	}
	if !cmp.Equal(s.reasons, x.reasons) {
		return false
	}
	return cmp.Equal(s.plugin, x.plugin)
}

func (s *Status) String() string {
	return s.Message()
}

// NewStatus makes a Status out of the given arguments and returns its pointer.
func NewStatus(code Code, reasons ...string) *Status {
	s := &Status{
		code:    code,
		reasons: reasons,
	}
	return s
}

// AsStatus wraps an error in a Status.
func AsStatus(err error) *Status {
	if err == nil {
		return nil
	}
	return &Status{
		code: Error,
		err:  err,
	}
}

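// NOTE (editor's addition, not part of the vendored file): typical Status
// handling in plugin code, shown as a sketch:
//
//	st := NewStatus(Unschedulable, "node(s) missing required label").WithPlugin("MyPlugin")
//	_ = st.IsRejected()              // true; Unschedulable counts as a rejection
//	_ = AsStatus(errors.New("boom")) // wraps unexpected failures with code Error
//	var ok *Status                   // a nil *Status ...
//	_ = ok.IsSuccess()               // ... is treated as Success
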
// WaitingPod represents a pod currently waiting in the permit phase.
type WaitingPod interface {
	// GetPod returns a reference to the waiting pod.
	GetPod() *v1.Pod
	// GetPendingPlugins returns a list of pending Permit plugin's name.
	GetPendingPlugins() []string
	// Allow declares the waiting pod is allowed to be scheduled by the plugin named as "pluginName".
	// If this is the last remaining plugin to allow, then a success signal is delivered
	// to unblock the pod.
	Allow(pluginName string)
	// Reject declares the waiting pod unschedulable.
	Reject(pluginName, msg string)
}

// Plugin is the parent type for all the scheduling framework plugins.
type Plugin interface {
	Name() string
}

// PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins.
// These plugins are called prior to adding Pods to activeQ.
// Note: a PreEnqueue plugin is expected to be lightweight and efficient, so it's not expected to
// involve expensive calls like accessing external endpoints; otherwise it'd block other
// Pods' enqueuing in event handlers.
type PreEnqueuePlugin interface {
	Plugin
	// PreEnqueue is called prior to adding Pods to activeQ.
	PreEnqueue(ctx context.Context, p *v1.Pod) *Status
}

// LessFunc is the function to sort pod info.
type LessFunc func(podInfo1, podInfo2 *QueuedPodInfo) bool

// QueueSortPlugin is an interface that must be implemented by "QueueSort" plugins.
// These plugins are used to sort pods in the scheduling queue. Only one queue sort
// plugin may be enabled at a time.
type QueueSortPlugin interface {
	Plugin
	// Less are used to sort pods in the scheduling queue.
	Less(*QueuedPodInfo, *QueuedPodInfo) bool
}

// EnqueueExtensions is an optional interface that plugins can implement to efficiently
// move unschedulable Pods in internal scheduling queues.
// In the scheduler, Pods can be unschedulable by PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins,
// and Pods rejected by these plugins are requeued based on this extension point.
// Failures from other extension points are regarded as temporal errors (e.g., network failure),
// and the scheduler requeue Pods without this extension point - always requeue Pods to activeQ after backoff.
// This is because such temporal errors cannot be resolved by specific cluster events,
// and we have no choice but to keep retrying scheduling until the failure is resolved.
//
// Plugins that make pod unschedulable (PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins) should implement this interface,
// otherwise the default implementation will be used, which is less efficient in requeueing Pods rejected by the plugin.
// And, if plugins other than above extension points support this interface, they are just ignored.
type EnqueueExtensions interface {
	Plugin
	// EventsToRegister returns a series of possible events that may cause a Pod
	// failed by this plugin schedulable. Each event has a callback function that
	// filters out events to reduce useless retry of Pod's scheduling.
	// The events will be registered when instantiating the internal scheduling queue,
	// and leveraged to build event handlers dynamically.
	// When it returns an error, the scheduler fails to start.
	// Note: the returned list needs to be determined at a startup,
	// and the scheduler only evaluates it once during start up.
	// Do not change the result during runtime, for example, based on the cluster's state etc.
	//
	// Appropriate implementation of this function will make Pod's re-scheduling accurate and performant.
	EventsToRegister(context.Context) ([]ClusterEventWithHint, error)
}

// PreFilterExtensions is an interface that is included in plugins that allow specifying
// callbacks to make incremental updates to its supposedly pre-calculated
// state.
type PreFilterExtensions interface {
	// AddPod is called by the framework while trying to evaluate the impact
	// of adding podToAdd to the node while scheduling podToSchedule.
	AddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
	// RemovePod is called by the framework while trying to evaluate the impact
	// of removing podToRemove from the node while scheduling podToSchedule.
	RemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}

// PreFilterPlugin is an interface that must be implemented by "PreFilter" plugins.
// These plugins are called at the beginning of the scheduling cycle.
type PreFilterPlugin interface {
	Plugin
	// PreFilter is called at the beginning of the scheduling cycle. All PreFilter
	// plugins must return success or the pod will be rejected. PreFilter could optionally
	// return a PreFilterResult to influence which nodes to evaluate downstream. This is useful
	// for cases where it is possible to determine the subset of nodes to process in O(1) time.
	// When PreFilterResult filters out some Nodes, the framework considers Nodes that are filtered out as getting "UnschedulableAndUnresolvable".
	// i.e., those Nodes will be out of the candidates of the preemption.
	//
	// When it returns Skip status, returned PreFilterResult and other fields in status are just ignored,
	// and coupled Filter plugin/PreFilterExtensions() will be skipped in this scheduling cycle.
	PreFilter(ctx context.Context, state *CycleState, p *v1.Pod) (*PreFilterResult, *Status)
	// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one,
	// or nil if it does not. A Pre-filter plugin can provide extensions to incrementally
	// modify its pre-processed info. The framework guarantees that the extensions
	// AddPod/RemovePod will only be called after PreFilter, possibly on a cloned
	// CycleState, and may call those functions more than once before calling
	// Filter again on a specific node.
	PreFilterExtensions() PreFilterExtensions
}

// FilterPlugin is an interface for Filter plugins. These plugins are called at the
// filter extension point for filtering out hosts that cannot run a pod.
// This concept used to be called 'predicate' in the original scheduler.
// These plugins should return "Success", "Unschedulable" or "Error" in Status.code.
// However, the scheduler accepts other valid codes as well.
// Anything other than "Success" will lead to exclusion of the given host from
// running the pod.
type FilterPlugin interface {
	Plugin
	// Filter is called by the scheduling framework.
	// All FilterPlugins should return "Success" to declare that
	// the given node fits the pod. If Filter doesn't return "Success",
	// it will return "Unschedulable", "UnschedulableAndUnresolvable" or "Error".
	//
	// "Error" aborts pod scheduling and puts the pod into the backoff queue.
	//
	// For the node being evaluated, Filter plugins should look at the passed
	// nodeInfo reference for this particular node's information (e.g., pods
	// considered to be running on the node) instead of looking it up in the
	// NodeInfoSnapshot because we don't guarantee that they will be the same.
	// For example, during preemption, we may pass a copy of the original
	// nodeInfo object that has some pods removed from it to evaluate the
	// possibility of preempting them to schedule the target pod.
	Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
}

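// NOTE (editor's addition, not part of the vendored file): the smallest useful
// shape of a FilterPlugin, as a sketch; "RequireLabel" and the label key are
// invented for illustration:
//
//	type RequireLabel struct{}
//
//	func (pl *RequireLabel) Name() string { return "RequireLabel" }
//
//	func (pl *RequireLabel) Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status {
//		if _, ok := nodeInfo.Node().Labels["example.com/ready"]; !ok {
//			return NewStatus(Unschedulable, "node is missing the example.com/ready label")
//		}
//		return nil // a nil Status is treated as Success
//	}
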
// PostFilterPlugin is an interface for "PostFilter" plugins. These plugins are called
// after a pod cannot be scheduled.
type PostFilterPlugin interface {
	Plugin
	// PostFilter is called by the scheduling framework
	// when the scheduling cycle failed at PreFilter or Filter by Unschedulable or UnschedulableAndUnresolvable.
	// NodeToStatusReader has statuses that each Node got in PreFilter or Filter phase.
	//
	// If you're implementing a custom preemption with PostFilter, ignoring Nodes with UnschedulableAndUnresolvable is the responsibility of your plugin,
	// meaning NodeToStatusReader could have Nodes with UnschedulableAndUnresolvable
	// and the scheduling framework does call PostFilter plugins even when all Nodes in NodeToStatusReader are UnschedulableAndUnresolvable.
	//
	// A PostFilter plugin should return one of the following statuses:
	// - Unschedulable: the plugin gets executed successfully but the pod cannot be made schedulable.
	// - Success: the plugin gets executed successfully and the pod can be made schedulable.
	// - Error: the plugin aborts due to some internal error.
	//
	// Informational plugins should be configured ahead of other ones, and always return Unschedulable status.
	// Optionally, a non-nil PostFilterResult may be returned along with a Success status. For example,
	// a preemption plugin may choose to return nominatedNodeName, so that framework can reuse that to update the
	// preemptor pod's .spec.status.nominatedNodeName field.
	PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusReader) (*PostFilterResult, *Status)
}

// PreScorePlugin is an interface for "PreScore" plugin. PreScore is an
// informational extension point. Plugins will be called with a list of nodes
// that passed the filtering phase. A plugin may use this data to update internal
// state or to generate logs/metrics.
type PreScorePlugin interface {
	Plugin
	// PreScore is called by the scheduling framework after a list of nodes
	// passed the filtering phase. All prescore plugins must return success or
	// the pod will be rejected.
	// When it returns Skip status, other fields in status are just ignored,
	// and coupled Score plugin will be skipped in this scheduling cycle.
	PreScore(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*NodeInfo) *Status
}

// ScoreExtensions is an interface for Score extended functionality.
type ScoreExtensions interface {
	// NormalizeScore is called for all node scores produced by the same plugin's "Score"
	// method. A successful run of NormalizeScore will update the scores list and return
	// a success status.
	NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status
}

// ScorePlugin is an interface that must be implemented by "Score" plugins to rank
// nodes that passed the filtering phase.
type ScorePlugin interface {
	Plugin
	// Score is called on each filtered node. It must return success and an integer
	// indicating the rank of the node. All scoring plugins must return success or
	// the pod will be rejected.
	Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)

	// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if does not.
	ScoreExtensions() ScoreExtensions
}

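// NOTE (editor's addition, not part of the vendored file): a Score plugin
// sketch that prefers nodes running fewer pods; "LeastPods" and its handle
// field (a framework Handle stored at construction time) are invented for
// illustration:
//
//	func (pl *LeastPods) Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status) {
//		nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
//		if err != nil {
//			return 0, AsStatus(err)
//		}
//		score := MaxNodeScore - int64(len(nodeInfo.Pods))
//		if score < MinNodeScore {
//			score = MinNodeScore
//		}
//		return score, nil
//	}
//
//	// ScoreExtensions can return nil here because the score above is already
//	// within [MinNodeScore, MaxNodeScore]; a plugin producing raw values would
//	// instead return a ScoreExtensions whose NormalizeScore rescales them.
//	func (pl *LeastPods) ScoreExtensions() ScoreExtensions { return nil }
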
// ReservePlugin is an interface for plugins with Reserve and Unreserve
|
||||
// methods. These are meant to update the state of the plugin. This concept
|
||||
// used to be called 'assume' in the original scheduler. These plugins should
|
||||
// return only Success or Error in Status.code. However, the scheduler accepts
|
||||
// other valid codes as well. Anything other than Success will lead to
|
||||
// rejection of the pod.
|
||||
type ReservePlugin interface {
|
||||
Plugin
|
||||
// Reserve is called by the scheduling framework when the scheduler cache is
|
||||
// updated. If this method returns a failed Status, the scheduler will call
|
||||
// the Unreserve method for all enabled ReservePlugins.
|
||||
Reserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
|
||||
// Unreserve is called by the scheduling framework when a reserved pod was
|
||||
// rejected, an error occurred during reservation of subsequent plugins, or
|
||||
// in a later phase. The Unreserve method implementation must be idempotent
|
||||
// and may be called by the scheduler even if the corresponding Reserve
|
||||
// method for the same plugin was not called.
|
||||
Unreserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
|
||||
}
|
||||
|
||||
// PreBindPlugin is an interface that must be implemented by "PreBind" plugins.
|
||||
// These plugins are called before a pod being scheduled.
|
||||
type PreBindPlugin interface {
|
||||
Plugin
|
||||
// PreBind is called before binding a pod. All prebind plugins must return
|
||||
// success or the pod will be rejected and won't be sent for binding.
|
||||
PreBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
|
||||
}
|
||||
|
||||
// PostBindPlugin is an interface that must be implemented by "PostBind" plugins.
|
||||
// These plugins are called after a pod is successfully bound to a node.
|
||||
type PostBindPlugin interface {
|
||||
Plugin
|
||||
// PostBind is called after a pod is successfully bound. These plugins are
|
||||
// informational. A common application of this extension point is for cleaning
|
||||
// up. If a plugin needs to clean-up its state after a pod is scheduled and
|
||||
// bound, PostBind is the extension point that it should register.
|
||||
PostBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
|
||||
}
|
||||
|
||||
// PermitPlugin is an interface that must be implemented by "Permit" plugins.
|
||||
// These plugins are called before a pod is bound to a node.
|
||||
type PermitPlugin interface {
|
||||
Plugin
|
||||
// Permit is called before binding a pod (and before prebind plugins). Permit
|
||||
// plugins are used to prevent or delay the binding of a Pod. A permit plugin
|
||||
// must return success or wait with timeout duration, or the pod will be rejected.
|
||||
// The pod will also be rejected if the wait timeout or the pod is rejected while
|
||||
// waiting. Note that if the plugin returns "wait", the framework will wait only
|
||||
// after running the remaining plugins given that no other plugin rejects the pod.
|
||||
Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration)
|
||||
}
|
||||
|
||||
// BindPlugin is an interface that must be implemented by "Bind" plugins. Bind
|
||||
// plugins are used to bind a pod to a Node.
|
||||
type BindPlugin interface {
|
||||
Plugin
|
||||
// Bind plugins will not be called until all pre-bind plugins have completed. Each
|
||||
// bind plugin is called in the configured order. A bind plugin may choose whether
|
||||
// or not to handle the given Pod. If a bind plugin chooses to handle a Pod, the
|
||||
// remaining bind plugins are skipped. When a bind plugin does not handle a pod,
|
||||
// it must return Skip in its Status code. If a bind plugin returns an Error, the
|
||||
// pod is rejected and will not be bound.
|
||||
Bind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
|
||||
}
|
||||
|
||||
// Framework manages the set of plugins in use by the scheduling framework.
|
||||
// Configured plugins are called at specified points in a scheduling context.
|
||||
type Framework interface {
|
||||
Handle
|
||||
|
||||
// PreEnqueuePlugins returns the registered preEnqueue plugins.
|
||||
PreEnqueuePlugins() []PreEnqueuePlugin
|
||||
|
||||
// EnqueueExtensions returns the registered Enqueue extensions.
|
||||
EnqueueExtensions() []EnqueueExtensions
|
||||
|
||||
// QueueSortFunc returns the function to sort pods in scheduling queue
|
||||
QueueSortFunc() LessFunc
|
||||
|
||||
// RunPreFilterPlugins runs the set of configured PreFilter plugins. It returns
|
||||
// *Status and its code is set to non-success if any of the plugins returns
|
||||
// anything but Success. If a non-success status is returned, then the scheduling
|
||||
// cycle is aborted.
|
||||
// It also returns a PreFilterResult, which may influence what or how many nodes to
|
||||
// evaluate downstream.
|
||||
// The third returns value contains PreFilter plugin that rejected some or all Nodes with PreFilterResult.
|
||||
// But, note that it doesn't contain any plugin when a plugin rejects this Pod with non-success status,
|
||||
// not with PreFilterResult.
|
||||
RunPreFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod) (*PreFilterResult, *Status, sets.Set[string])
|
||||
|
||||
// RunPostFilterPlugins runs the set of configured PostFilter plugins.
|
||||
// PostFilter plugins can either be informational, in which case should be configured
|
||||
// to execute first and return Unschedulable status, or ones that try to change the
|
||||
// cluster state to make the pod potentially schedulable in a future scheduling cycle.
|
||||
RunPostFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusReader) (*PostFilterResult, *Status)
|
||||
|
||||
// RunPreBindPlugins runs the set of configured PreBind plugins. It returns
|
||||
// *Status and its code is set to non-success if any of the plugins returns
|
||||
// anything but Success. If the Status code is "Unschedulable", it is
|
||||
// considered as a scheduling check failure, otherwise, it is considered as an
|
||||
// internal error. In either case the pod is not going to be bound.
|
||||
RunPreBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
|
||||
|
||||
// RunPostBindPlugins runs the set of configured PostBind plugins.
|
||||
RunPostBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)
|
||||
|
||||
// RunReservePluginsReserve runs the Reserve method of the set of
|
||||
// configured Reserve plugins. If any of these calls returns an error, it
|
||||
// does not continue running the remaining ones and returns the error. In
|
||||
// such case, pod will not be scheduled.
|
||||
RunReservePluginsReserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
|
||||
|
||||
// RunReservePluginsUnreserve runs the Unreserve method of the set of
|
||||
// configured Reserve plugins.
|
||||
RunReservePluginsUnreserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)
|
||||
|
||||
// RunPermitPlugins runs the set of configured Permit plugins. If any of these
|
||||
// plugins returns a status other than "Success" or "Wait", it does not continue
|
||||
// running the remaining plugins and returns an error. Otherwise, if any of the
|
||||
// plugins returns "Wait", then this function will create and add waiting pod
|
||||
// to a map of currently waiting pods and return status with "Wait" code.
|
||||
// Pod will remain waiting pod for the minimum duration returned by the Permit plugins.
|
||||
RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
|
||||
|
||||
// WaitOnPermit will block, if the pod is a waiting pod, until the waiting pod is rejected or allowed.
|
||||
WaitOnPermit(ctx context.Context, pod *v1.Pod) *Status
|
||||
|
||||
// RunBindPlugins runs the set of configured Bind plugins. A Bind plugin may choose
|
||||
// whether or not to handle the given Pod. If a Bind plugin chooses to skip the
|
||||
// binding, it should return code=5("skip") status. Otherwise, it should return "Error"
|
||||
// or "Success". If none of the plugins handled binding, RunBindPlugins returns
|
||||
// code=5("skip") status.
|
||||
RunBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
|
||||
|
||||
// HasFilterPlugins returns true if at least one Filter plugin is defined.
|
||||
HasFilterPlugins() bool
|
||||
|
||||
// HasPostFilterPlugins returns true if at least one PostFilter plugin is defined.
|
||||
HasPostFilterPlugins() bool
|
||||
|
||||
// HasScorePlugins returns true if at least one Score plugin is defined.
|
||||
HasScorePlugins() bool
|
||||
|
||||
// ListPlugins returns a map of extension point name to list of configured Plugins.
|
||||
ListPlugins() *config.Plugins
|
||||
|
||||
// ProfileName returns the profile name associated with the profile.
|
||||
ProfileName() string
|
||||
|
||||
// PercentageOfNodesToScore returns the percentageOfNodesToScore associated with the profile.
|
||||
PercentageOfNodesToScore() *int32
|
||||
|
||||
// SetPodNominator sets the PodNominator.
|
||||
SetPodNominator(nominator PodNominator)
|
||||
// SetPodActivator sets the PodActivator.
|
||||
SetPodActivator(activator PodActivator)
|
||||
|
||||
// Close calls Close method of each plugin.
|
||||
Close() error
|
||||
}
|
||||
|
||||
// Handle provides data and some tools that plugins can use. It is
|
||||
// passed to the plugin factories at the time of plugin initialization. Plugins
|
||||
// must store and use this handle to call framework functions.
|
||||
type Handle interface {
|
||||
// PodNominator abstracts operations to maintain nominated Pods.
|
||||
PodNominator
|
||||
// PluginsRunner abstracts operations to run some plugins.
|
||||
PluginsRunner
|
||||
// PodActivator abstracts operations in the scheduling queue.
|
||||
PodActivator
|
||||
// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
|
||||
// is taken at the beginning of a scheduling cycle and remains unchanged until
|
||||
// a pod finishes the "Permit" point.
|
||||
//
|
||||
// It should be used only during the scheduling cycle:
|
||||
// - There is no guarantee that the information remains unchanged in the binding phase of scheduling.
|
||||
// So, plugins shouldn't use it in the binding cycle (pre-bind/bind/post-bind/un-reserve plugins);
|
||||
// otherwise, a concurrent read/write error might occur.
|
||||
// - There is no guarantee that the information is always up-to-date.
|
||||
// So, plugins shouldn't use it in QueueingHint and PreEnqueue;
|
||||
// otherwise, they might make a decision based on stale information.
|
||||
//
|
||||
// Instead, they should use the resources obtained from Informers created from SharedInformerFactory().
|
||||
SnapshotSharedLister() SharedLister
|
||||
|
||||
// IterateOverWaitingPods acquires a read lock and iterates over the WaitingPods map.
|
||||
IterateOverWaitingPods(callback func(WaitingPod))
|
||||
|
||||
// GetWaitingPod returns a waiting pod given its UID.
|
||||
GetWaitingPod(uid types.UID) WaitingPod
|
||||
|
||||
// RejectWaitingPod rejects a waiting pod given its UID.
|
||||
// The return value indicates if the pod is waiting or not.
|
||||
RejectWaitingPod(uid types.UID) bool
|
||||
|
||||
// ClientSet returns a kubernetes clientSet.
|
||||
ClientSet() clientset.Interface
|
||||
|
||||
// KubeConfig returns the raw kube config.
|
||||
KubeConfig() *restclient.Config
|
||||
|
||||
// EventRecorder returns an event recorder.
|
||||
EventRecorder() events.EventRecorder
|
||||
|
||||
SharedInformerFactory() informers.SharedInformerFactory
|
||||
|
||||
// SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.
|
||||
// A non-default implementation can be plugged into the framework to simulate the state of DRA objects.
|
||||
SharedDRAManager() SharedDRAManager
|
||||
|
||||
// RunFilterPluginsWithNominatedPods runs the set of configured filter plugins for the nominated pod on the given node.
|
||||
RunFilterPluginsWithNominatedPods(ctx context.Context, state *CycleState, pod *v1.Pod, info *NodeInfo) *Status
|
||||
|
||||
// Extenders returns registered scheduler extenders.
|
||||
Extenders() []Extender
|
||||
|
||||
// Parallelizer returns a parallelizer holding the parallelism for the scheduler.
|
||||
Parallelizer() parallelize.Parallelizer
|
||||
}
|
||||
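// Illustrative sketch (hypothetical names): a plugin typically stores the Handle
// it receives from its factory and uses it later to call framework functions.
//
//	type examplePlugin struct {
//		handle Handle
//	}
//
//	func newExamplePlugin(h Handle) *examplePlugin {
//		// e.g. h.SnapshotSharedLister() or h.EventRecorder() can be used later.
//		return &examplePlugin{handle: h}
//	}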
|
||||
// PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase.
|
||||
type PreFilterResult struct {
|
||||
// The set of nodes that should be considered downstream; if nil then
|
||||
// all nodes are eligible.
|
||||
NodeNames sets.Set[string]
|
||||
}
|
||||
|
||||
func (p *PreFilterResult) AllNodes() bool {
|
||||
return p == nil || p.NodeNames == nil
|
||||
}
|
||||
|
||||
func (p *PreFilterResult) Merge(in *PreFilterResult) *PreFilterResult {
|
||||
if p.AllNodes() && in.AllNodes() {
|
||||
return nil
|
||||
}
|
||||
|
||||
r := PreFilterResult{}
|
||||
if p.AllNodes() {
|
||||
r.NodeNames = in.NodeNames.Clone()
|
||||
return &r
|
||||
}
|
||||
if in.AllNodes() {
|
||||
r.NodeNames = p.NodeNames.Clone()
|
||||
return &r
|
||||
}
|
||||
|
||||
r.NodeNames = p.NodeNames.Intersection(in.NodeNames)
|
||||
return &r
|
||||
}
|
||||
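// Illustrative sketch of Merge semantics: a nil receiver or nil NodeNames means
// "all nodes are eligible", and merging two restricted results intersects them.
//
//	a := &PreFilterResult{NodeNames: sets.New("node1", "node2")}
//	b := &PreFilterResult{NodeNames: sets.New("node2", "node3")}
//	merged := a.Merge(b) // merged.NodeNames contains only "node2"
//
//	var all *PreFilterResult // treated as "all nodes"
//	_ = all.Merge(a)         // yields a copy restricted to a's nodes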
|
||||
type NominatingMode int
|
||||
|
||||
const (
|
||||
ModeNoop NominatingMode = iota
|
||||
ModeOverride
|
||||
)
|
||||
|
||||
type NominatingInfo struct {
|
||||
NominatedNodeName string
|
||||
NominatingMode NominatingMode
|
||||
}
|
||||
|
||||
// PostFilterResult wraps needed info for scheduler framework to act upon PostFilter phase.
|
||||
type PostFilterResult struct {
|
||||
*NominatingInfo
|
||||
}
|
||||
|
||||
func NewPostFilterResultWithNominatedNode(name string) *PostFilterResult {
|
||||
return &PostFilterResult{
|
||||
NominatingInfo: &NominatingInfo{
|
||||
NominatedNodeName: name,
|
||||
NominatingMode: ModeOverride,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (ni *NominatingInfo) Mode() NominatingMode {
|
||||
if ni == nil {
|
||||
return ModeNoop
|
||||
}
|
||||
return ni.NominatingMode
|
||||
}
|
||||
|
||||
// PodActivator abstracts operations in the scheduling queue.
|
||||
type PodActivator interface {
|
||||
// Activate moves the given pods to activeQ.
|
||||
// If a pod isn't found in unschedulablePods or backoffQ and it's in-flight,
|
||||
// the wildcard event is registered so that the pod will be requeued when it comes back.
|
||||
// But, if a pod isn't found in unschedulablePods or backoffQ and it's not in-flight (i.e., a completely unknown pod),
|
||||
// Activate would ignore the pod.
|
||||
Activate(logger klog.Logger, pods map[string]*v1.Pod)
|
||||
}
|
||||
|
||||
// PodNominator abstracts operations to maintain nominated Pods.
|
||||
type PodNominator interface {
|
||||
// AddNominatedPod adds the given pod to the nominator or
|
||||
// updates it if it already exists.
|
||||
AddNominatedPod(logger klog.Logger, pod *PodInfo, nominatingInfo *NominatingInfo)
|
||||
// DeleteNominatedPodIfExists deletes nominatedPod from internal cache. It's a no-op if it doesn't exist.
|
||||
DeleteNominatedPodIfExists(pod *v1.Pod)
|
||||
// UpdateNominatedPod updates the <oldPod> with <newPod>.
|
||||
UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *PodInfo)
|
||||
// NominatedPodsForNode returns nominatedPods on the given node.
|
||||
NominatedPodsForNode(nodeName string) []*PodInfo
|
||||
}
|
||||
|
||||
// PluginsRunner abstracts operations to run some plugins.
|
||||
// This is used by preemption PostFilter plugins when evaluating the feasibility of
|
||||
// scheduling the pod on nodes when certain running pods get evicted.
|
||||
type PluginsRunner interface {
|
||||
// RunPreScorePlugins runs the set of configured PreScore plugins. If any
|
||||
// of these plugins returns any status other than "Success", the given pod is rejected.
|
||||
RunPreScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) *Status
|
||||
// RunScorePlugins runs the set of configured scoring plugins.
|
||||
// It returns a list that stores scores from each plugin and total score for each Node.
|
||||
// It also returns *Status, which is set to non-success if any of the plugins returns
|
||||
// a non-success status.
|
||||
RunScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) ([]NodePluginScores, *Status)
|
||||
// RunFilterPlugins runs the set of configured Filter plugins for pod on
|
||||
// the given node. Note that for the node being evaluated, the passed nodeInfo
|
||||
// reference could be different from the one in NodeInfoSnapshot map (e.g., pods
|
||||
// considered to be running on the node could be different). For example, during
|
||||
// preemption, we may pass a copy of the original nodeInfo object that has some pods
|
||||
// removed from it to evaluate the possibility of preempting them to
|
||||
// schedule the target pod.
|
||||
RunFilterPlugins(context.Context, *CycleState, *v1.Pod, *NodeInfo) *Status
|
||||
// RunPreFilterExtensionAddPod calls the AddPod interface for the set of configured
|
||||
// PreFilter plugins. It returns directly if any of the plugins return any
|
||||
// status other than Success.
|
||||
RunPreFilterExtensionAddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
|
||||
// RunPreFilterExtensionRemovePod calls the RemovePod interface for the set of configured
|
||||
// PreFilter plugins. It returns directly if any of the plugins return any
|
||||
// status other than Success.
|
||||
RunPreFilterExtensionRemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
|
||||
}
|
111
vendor/k8s.io/kubernetes/pkg/scheduler/framework/listers.go
generated
vendored
Normal file
111
vendor/k8s.io/kubernetes/pkg/scheduler/framework/listers.go
generated
vendored
Normal file
@ -0,0 +1,111 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package framework
|
||||
|
||||
import (
|
||||
resourceapi "k8s.io/api/resource/v1beta1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/dynamic-resource-allocation/structured"
|
||||
)
|
||||
|
||||
// NodeInfoLister interface represents anything that can list/get NodeInfo objects from node name.
|
||||
type NodeInfoLister interface {
|
||||
// List returns the list of NodeInfos.
|
||||
List() ([]*NodeInfo, error)
|
||||
// HavePodsWithAffinityList returns the list of NodeInfos of nodes with pods with affinity terms.
|
||||
HavePodsWithAffinityList() ([]*NodeInfo, error)
|
||||
// HavePodsWithRequiredAntiAffinityList returns the list of NodeInfos of nodes with pods with required anti-affinity terms.
|
||||
HavePodsWithRequiredAntiAffinityList() ([]*NodeInfo, error)
|
||||
// Get returns the NodeInfo of the given node name.
|
||||
Get(nodeName string) (*NodeInfo, error)
|
||||
}
|
||||
|
||||
// StorageInfoLister interface represents anything that handles storage-related operations and resources.
|
||||
type StorageInfoLister interface {
|
||||
// IsPVCUsedByPods returns whether the PVC, keyed in the format "namespace/name",
|
||||
// is used by one or more scheduled pods.
|
||||
IsPVCUsedByPods(key string) bool
|
||||
}
|
||||
|
||||
// SharedLister groups scheduler-specific listers.
|
||||
type SharedLister interface {
|
||||
NodeInfos() NodeInfoLister
|
||||
StorageInfos() StorageInfoLister
|
||||
}
|
||||
|
||||
// ResourceSliceLister can be used to obtain ResourceSlices.
|
||||
type ResourceSliceLister interface {
|
||||
// List returns a list of all ResourceSlices.
|
||||
List() ([]*resourceapi.ResourceSlice, error)
|
||||
}
|
||||
|
||||
// DeviceClassLister can be used to obtain DeviceClasses.
|
||||
type DeviceClassLister interface {
|
||||
// List returns a list of all DeviceClasses.
|
||||
List() ([]*resourceapi.DeviceClass, error)
|
||||
// Get returns the DeviceClass with the given className.
|
||||
Get(className string) (*resourceapi.DeviceClass, error)
|
||||
}
|
||||
|
||||
// ResourceClaimTracker can be used to obtain ResourceClaims, and track changes to ResourceClaims in-memory.
|
||||
//
|
||||
// If the claims are meant to be allocated in the API during the binding phase (when used by scheduler), the tracker helps avoid
|
||||
// race conditions between scheduling and binding phases (as well as between the binding phase and the informer cache update).
|
||||
//
|
||||
// If the binding phase is not run (e.g. when used by Cluster Autoscaler which only runs the scheduling phase, and simulates binding in-memory),
|
||||
// the tracker allows the framework user to obtain the claim allocations produced by the DRA plugin, and persist them outside of the API (e.g. in-memory).
|
||||
type ResourceClaimTracker interface {
|
||||
// List lists ResourceClaims. The result is guaranteed to immediately include any changes made via AssumeClaimAfterAPICall(),
|
||||
// and SignalClaimPendingAllocation().
|
||||
List() ([]*resourceapi.ResourceClaim, error)
|
||||
// Get works like List(), but for a single claim.
|
||||
Get(namespace, claimName string) (*resourceapi.ResourceClaim, error)
|
||||
// ListAllAllocatedDevices lists all allocated Devices from allocated ResourceClaims. The result is guaranteed to immediately include
|
||||
// any changes made via AssumeClaimAfterAPICall(), and SignalClaimPendingAllocation().
|
||||
ListAllAllocatedDevices() (sets.Set[structured.DeviceID], error)
|
||||
|
||||
// SignalClaimPendingAllocation signals to the tracker that the given ResourceClaim will be allocated via an API call in the
|
||||
// binding phase. This change is immediately reflected in the result of List() and the other accessors.
|
||||
SignalClaimPendingAllocation(claimUID types.UID, allocatedClaim *resourceapi.ResourceClaim) error
|
||||
// ClaimHasPendingAllocation answers whether a given claim has a pending allocation during the binding phase. It can be used to avoid
|
||||
// race conditions in subsequent scheduling phases.
|
||||
ClaimHasPendingAllocation(claimUID types.UID) bool
|
||||
// RemoveClaimPendingAllocation removes the pending allocation for the given ResourceClaim from the tracker if any was signaled via
|
||||
// SignalClaimPendingAllocation(). Returns whether there was a pending allocation to remove. List() and the other accessors immediately
|
||||
// stop reflecting the pending allocation in the results.
|
||||
RemoveClaimPendingAllocation(claimUID types.UID) (deleted bool)
|
||||
|
||||
// AssumeClaimAfterAPICall signals to the tracker that an API call modifying the given ResourceClaim was made in the binding phase, and the
|
||||
// changes should be reflected in informers very soon. This change is immediately reflected in the result of List() and the other accessors.
|
||||
// This mechanism can be used to avoid race conditions between the informer update and subsequent scheduling phases.
|
||||
AssumeClaimAfterAPICall(claim *resourceapi.ResourceClaim) error
|
||||
// AssumedClaimRestore signals to the tracker that something went wrong with the API call modifying the given ResourceClaim, and
|
||||
// the changes won't be reflected in informers after all. List() and the other accessors immediately stop reflecting the assumed change,
|
||||
// and go back to the informer version.
|
||||
AssumedClaimRestore(namespace, claimName string)
|
||||
}
|
||||
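// Rough sketch (assumed flow, error handling elided) of how a binding-phase
// caller might drive the tracker around an allocation API call:
//
//	_ = tracker.SignalClaimPendingAllocation(claim.UID, allocatedClaim) // before issuing the API call
//	// ... API call that persists the allocation ...
//	_ = tracker.AssumeClaimAfterAPICall(updatedClaim) // on success, bridge the informer delay
//	tracker.RemoveClaimPendingAllocation(claim.UID)   // once the allocation is settled
//	// On failure, AssumedClaimRestore(namespace, name) reverts to the informer version.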
|
||||
// SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.
|
||||
// The plugin's default implementation obtains the objects from the API. A different implementation can be
|
||||
// plugged into the framework in order to simulate the state of DRA objects. For example, Cluster Autoscaler
|
||||
// can use this to provide the correct DRA object state to the DRA plugin when simulating scheduling changes in-memory.
|
||||
type SharedDRAManager interface {
|
||||
ResourceClaims() ResourceClaimTracker
|
||||
ResourceSlices() ResourceSliceLister
|
||||
DeviceClasses() DeviceClassLister
|
||||
}
|
59
vendor/k8s.io/kubernetes/pkg/scheduler/framework/parallelize/error_channel.go
generated
vendored
Normal file
59
vendor/k8s.io/kubernetes/pkg/scheduler/framework/parallelize/error_channel.go
generated
vendored
Normal file
@ -0,0 +1,59 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package parallelize
|
||||
|
||||
import "context"
|
||||
|
||||
// ErrorChannel supports non-blocking send and receive operation to capture error.
|
||||
// A maximum of one error is kept in the channel and the rest of the errors sent
|
||||
// are ignored, unless the existing error is received and the channel becomes empty
|
||||
// again.
|
||||
type ErrorChannel struct {
|
||||
errCh chan error
|
||||
}
|
||||
|
||||
// SendError sends an error without blocking the sender.
|
||||
func (e *ErrorChannel) SendError(err error) {
|
||||
select {
|
||||
case e.errCh <- err:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
// SendErrorWithCancel sends an error without blocking the sender and calls
|
||||
// cancel function.
|
||||
func (e *ErrorChannel) SendErrorWithCancel(err error, cancel context.CancelFunc) {
|
||||
e.SendError(err)
|
||||
cancel()
|
||||
}
|
||||
|
||||
// ReceiveError receives an error from channel without blocking on the receiver.
|
||||
func (e *ErrorChannel) ReceiveError() error {
|
||||
select {
|
||||
case err := <-e.errCh:
|
||||
return err
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// NewErrorChannel returns a new ErrorChannel.
|
||||
func NewErrorChannel() *ErrorChannel {
|
||||
return &ErrorChannel{
|
||||
errCh: make(chan error, 1),
|
||||
}
|
||||
}
|
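// Illustrative usage sketch (processPiece is hypothetical): workers send the
// first error they hit and cancel the context; the caller reads it afterwards.
//
//	errCh := NewErrorChannel()
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	doPiece := func(i int) {
//		if err := processPiece(ctx, i); err != nil {
//			errCh.SendErrorWithCancel(err, cancel)
//		}
//	}
//	// run doPiece across all pieces, e.g. via workqueue.ParallelizeUntil
//	if err := errCh.ReceiveError(); err != nil {
//		// handle the first captured error
//	}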
65
vendor/k8s.io/kubernetes/pkg/scheduler/framework/parallelize/parallelism.go
generated
vendored
Normal file
65
vendor/k8s.io/kubernetes/pkg/scheduler/framework/parallelize/parallelism.go
generated
vendored
Normal file
@ -0,0 +1,65 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package parallelize
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
|
||||
"k8s.io/client-go/util/workqueue"
|
||||
"k8s.io/kubernetes/pkg/scheduler/metrics"
|
||||
)
|
||||
|
||||
// DefaultParallelism is the default parallelism used in scheduler.
|
||||
const DefaultParallelism int = 16
|
||||
|
||||
// Parallelizer holds the parallelism for scheduler.
|
||||
type Parallelizer struct {
|
||||
parallelism int
|
||||
}
|
||||
|
||||
// NewParallelizer returns an object holding the parallelism.
|
||||
func NewParallelizer(p int) Parallelizer {
|
||||
return Parallelizer{parallelism: p}
|
||||
}
|
||||
|
||||
// chunkSizeFor returns a chunk size for the given number of items to use for
|
||||
// parallel work. The size aims to produce good CPU utilization.
|
||||
// It returns max(1, min(sqrt(n), n/parallelism+1)).
|
||||
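// For example (illustrative): with parallelism=16, chunkSizeFor(100, 16) is 7
// (n/parallelism+1 = 7 is smaller than sqrt(100) = 10), while
// chunkSizeFor(1000, 16) is 31 (sqrt(1000) ≈ 31 is smaller than 63).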
func chunkSizeFor(n, parallelism int) int {
|
||||
s := int(math.Sqrt(float64(n)))
|
||||
|
||||
if r := n/parallelism + 1; s > r {
|
||||
s = r
|
||||
} else if s < 1 {
|
||||
s = 1
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// Until is a wrapper around workqueue.ParallelizeUntil to use in scheduling algorithms.
|
||||
// A given operation will be a label that is recorded in the goroutine metric.
|
||||
func (p Parallelizer) Until(ctx context.Context, pieces int, doWorkPiece workqueue.DoWorkPieceFunc, operation string) {
|
||||
goroutinesMetric := metrics.Goroutines.WithLabelValues(operation)
|
||||
withMetrics := func(piece int) {
|
||||
goroutinesMetric.Inc()
|
||||
doWorkPiece(piece)
|
||||
goroutinesMetric.Dec()
|
||||
}
|
||||
|
||||
workqueue.ParallelizeUntil(ctx, p.parallelism, pieces, withMetrics, workqueue.WithChunkSize(chunkSizeFor(pieces, p.parallelism)))
|
||||
}
|
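// Illustrative usage sketch (piece count and work function are hypothetical):
//
//	p := NewParallelizer(DefaultParallelism)
//	nodes := make([]string, 128)
//	p.Until(ctx, len(nodes), func(i int) {
//		// evaluate nodes[i]
//	}, "exampleOperation")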
3
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/README.md
generated
vendored
Normal file
3
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/README.md
generated
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
# Scheduler Framework Plugins
|
||||
|
||||
Moved [here](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-scheduling/scheduler_framework_plugins.md).
|
63
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder/default_binder.go
generated
vendored
Normal file
63
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder/default_binder.go
generated
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package defaultbinder
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
)
|
||||
|
||||
// Name of the plugin used in the plugin registry and configurations.
|
||||
const Name = names.DefaultBinder
|
||||
|
||||
// DefaultBinder binds pods to nodes using a k8s client.
|
||||
type DefaultBinder struct {
|
||||
handle framework.Handle
|
||||
}
|
||||
|
||||
var _ framework.BindPlugin = &DefaultBinder{}
|
||||
|
||||
// New creates a DefaultBinder.
|
||||
func New(_ context.Context, _ runtime.Object, handle framework.Handle) (framework.Plugin, error) {
|
||||
return &DefaultBinder{handle: handle}, nil
|
||||
}
|
||||
|
||||
// Name returns the name of the plugin.
|
||||
func (b DefaultBinder) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// Bind binds pods to nodes using the k8s client.
|
||||
func (b DefaultBinder) Bind(ctx context.Context, state *framework.CycleState, p *v1.Pod, nodeName string) *framework.Status {
|
||||
logger := klog.FromContext(ctx)
|
||||
logger.V(3).Info("Attempting to bind pod to node", "pod", klog.KObj(p), "node", klog.KRef("", nodeName))
|
||||
binding := &v1.Binding{
|
||||
ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID},
|
||||
Target: v1.ObjectReference{Kind: "Node", Name: nodeName},
|
||||
}
|
||||
err := b.handle.ClientSet().CoreV1().Pods(binding.Namespace).Bind(ctx, binding, metav1.CreateOptions{})
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
return nil
|
||||
}
|
364
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go
generated
vendored
Normal file
364
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go
generated
vendored
Normal file
@ -0,0 +1,364 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package defaultpreemption
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"sort"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
policy "k8s.io/api/policy/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/client-go/informers"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
policylisters "k8s.io/client-go/listers/policy/v1"
|
||||
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
|
||||
"k8s.io/klog/v2"
|
||||
extenderv1 "k8s.io/kube-scheduler/extender/v1"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/preemption"
|
||||
"k8s.io/kubernetes/pkg/scheduler/metrics"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// Name of the plugin used in the plugin registry and configurations.
|
||||
const Name = names.DefaultPreemption
|
||||
|
||||
// DefaultPreemption is a PostFilter plugin that implements the preemption logic.
|
||||
type DefaultPreemption struct {
|
||||
fh framework.Handle
|
||||
fts feature.Features
|
||||
args config.DefaultPreemptionArgs
|
||||
podLister corelisters.PodLister
|
||||
pdbLister policylisters.PodDisruptionBudgetLister
|
||||
Evaluator *preemption.Evaluator
|
||||
}
|
||||
|
||||
var _ framework.PostFilterPlugin = &DefaultPreemption{}
|
||||
var _ framework.PreEnqueuePlugin = &DefaultPreemption{}
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *DefaultPreemption) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, dpArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
args, ok := dpArgs.(*config.DefaultPreemptionArgs)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("got args of type %T, want *DefaultPreemptionArgs", dpArgs)
|
||||
}
|
||||
if err := validation.ValidateDefaultPreemptionArgs(nil, args); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
|
||||
pdbLister := getPDBLister(fh.SharedInformerFactory())
|
||||
|
||||
pl := DefaultPreemption{
|
||||
fh: fh,
|
||||
fts: fts,
|
||||
args: *args,
|
||||
podLister: podLister,
|
||||
pdbLister: pdbLister,
|
||||
}
|
||||
pl.Evaluator = preemption.NewEvaluator(Name, fh, &pl, fts.EnableAsyncPreemption)
|
||||
|
||||
return &pl, nil
|
||||
}
|
||||
|
||||
// PostFilter invoked at the postFilter extension point.
|
||||
func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
|
||||
defer func() {
|
||||
metrics.PreemptionAttempts.Inc()
|
||||
}()
|
||||
|
||||
result, status := pl.Evaluator.Preempt(ctx, state, pod, m)
|
||||
msg := status.Message()
|
||||
if len(msg) > 0 {
|
||||
return result, framework.NewStatus(status.Code(), "preemption: "+msg)
|
||||
}
|
||||
return result, status
|
||||
}
|
||||
|
||||
func (pl *DefaultPreemption) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
|
||||
if !pl.fts.EnableAsyncPreemption {
|
||||
return nil
|
||||
}
|
||||
if pl.Evaluator.IsPodRunningPreemption(p.GetUID()) {
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, "waiting for the preemption for this pod to be finished")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *DefaultPreemption) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
// The plugin moves the preemptor Pod to activeQ/backoffQ once the preemption API calls are all done,
|
||||
// and we don't need to move the Pod with any events.
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// calculateNumCandidates returns the number of candidates the FindCandidates
|
||||
// method must produce from dry running based on the constraints given by
|
||||
// <minCandidateNodesPercentage> and <minCandidateNodesAbsolute>. The number of
|
||||
// candidates returned will never be greater than <numNodes>.
|
||||
func (pl *DefaultPreemption) calculateNumCandidates(numNodes int32) int32 {
|
||||
n := (numNodes * pl.args.MinCandidateNodesPercentage) / 100
|
||||
if n < pl.args.MinCandidateNodesAbsolute {
|
||||
n = pl.args.MinCandidateNodesAbsolute
|
||||
}
|
||||
if n > numNodes {
|
||||
n = numNodes
|
||||
}
|
||||
return n
|
||||
}
|
||||
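// For example (illustrative): with MinCandidateNodesPercentage=10 and
// MinCandidateNodesAbsolute=100, a 5000-node cluster yields 500 candidates,
// a 500-node cluster yields 100 (the absolute floor), and a 50-node cluster
// yields 50 (capped at numNodes).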
|
||||
// GetOffsetAndNumCandidates chooses a random offset and calculates the number
|
||||
// of candidates that should be shortlisted for dry running preemption.
|
||||
func (pl *DefaultPreemption) GetOffsetAndNumCandidates(numNodes int32) (int32, int32) {
|
||||
return rand.Int31n(numNodes), pl.calculateNumCandidates(numNodes)
|
||||
}
|
||||
|
||||
// This function is not applicable for out-of-tree preemption plugins that exercise
|
||||
// different preemption candidates on the same nominated node.
|
||||
func (pl *DefaultPreemption) CandidatesToVictimsMap(candidates []preemption.Candidate) map[string]*extenderv1.Victims {
|
||||
m := make(map[string]*extenderv1.Victims, len(candidates))
|
||||
for _, c := range candidates {
|
||||
m[c.Name()] = c.Victims()
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// SelectVictimsOnNode finds the minimum set of pods on the given node that should be preempted in order to make enough room
|
||||
// for "pod" to be scheduled.
|
||||
func (pl *DefaultPreemption) SelectVictimsOnNode(
|
||||
ctx context.Context,
|
||||
state *framework.CycleState,
|
||||
pod *v1.Pod,
|
||||
nodeInfo *framework.NodeInfo,
|
||||
pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status) {
|
||||
logger := klog.FromContext(ctx)
|
||||
var potentialVictims []*framework.PodInfo
|
||||
removePod := func(rpi *framework.PodInfo) error {
|
||||
if err := nodeInfo.RemovePod(logger, rpi.Pod); err != nil {
|
||||
return err
|
||||
}
|
||||
status := pl.fh.RunPreFilterExtensionRemovePod(ctx, state, pod, rpi, nodeInfo)
|
||||
if !status.IsSuccess() {
|
||||
return status.AsError()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
addPod := func(api *framework.PodInfo) error {
|
||||
nodeInfo.AddPodInfo(api)
|
||||
status := pl.fh.RunPreFilterExtensionAddPod(ctx, state, pod, api, nodeInfo)
|
||||
if !status.IsSuccess() {
|
||||
return status.AsError()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
// As the first step, remove all the lower priority pods from the node and
|
||||
// check if the given pod can be scheduled.
|
||||
podPriority := corev1helpers.PodPriority(pod)
|
||||
for _, pi := range nodeInfo.Pods {
|
||||
if corev1helpers.PodPriority(pi.Pod) < podPriority {
|
||||
potentialVictims = append(potentialVictims, pi)
|
||||
if err := removePod(pi); err != nil {
|
||||
return nil, 0, framework.AsStatus(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No potential victims are found, and so we don't need to evaluate the node again since its state didn't change.
|
||||
if len(potentialVictims) == 0 {
|
||||
return nil, 0, framework.NewStatus(framework.UnschedulableAndUnresolvable, "No preemption victims found for incoming pod")
|
||||
}
|
||||
|
||||
// If the new pod does not fit after removing all the lower priority pods,
|
||||
// we are almost done and this node is not suitable for preemption. The only
|
||||
// condition that we could check is if the "pod" is failing to schedule due to
|
||||
// inter-pod affinity to one or more victims, but we have decided not to
|
||||
// support this case for performance reasons. Having affinity to lower
|
||||
// priority pods is not a recommended configuration anyway.
|
||||
if status := pl.fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo); !status.IsSuccess() {
|
||||
return nil, 0, status
|
||||
}
|
||||
var victims []*v1.Pod
|
||||
numViolatingVictim := 0
|
||||
// Sort potentialVictims by pod priority from high to low, which ensures that
|
||||
// higher priority pods are reprieved first.
|
||||
sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i].Pod, potentialVictims[j].Pod) })
|
||||
// Try to reprieve as many pods as possible. We first try to reprieve the PDB
|
||||
// violating victims and then other non-violating ones. In both cases, we start
|
||||
// from the highest priority victims.
|
||||
violatingVictims, nonViolatingVictims := filterPodsWithPDBViolation(potentialVictims, pdbs)
|
||||
reprievePod := func(pi *framework.PodInfo) (bool, error) {
|
||||
if err := addPod(pi); err != nil {
|
||||
return false, err
|
||||
}
|
||||
status := pl.fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
|
||||
fits := status.IsSuccess()
|
||||
if !fits {
|
||||
if err := removePod(pi); err != nil {
|
||||
return false, err
|
||||
}
|
||||
rpi := pi.Pod
|
||||
victims = append(victims, rpi)
|
||||
logger.V(5).Info("Pod is a potential preemption victim on node", "pod", klog.KObj(rpi), "node", klog.KObj(nodeInfo.Node()))
|
||||
}
|
||||
return fits, nil
|
||||
}
|
||||
for _, p := range violatingVictims {
|
||||
if fits, err := reprievePod(p); err != nil {
|
||||
return nil, 0, framework.AsStatus(err)
|
||||
} else if !fits {
|
||||
numViolatingVictim++
|
||||
}
|
||||
}
|
||||
// Now we try to reprieve non-violating victims.
|
||||
for _, p := range nonViolatingVictims {
|
||||
if _, err := reprievePod(p); err != nil {
|
||||
return nil, 0, framework.AsStatus(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Sort victims after reprieving pods to keep the pods in the victims sorted in order of priority from high to low.
|
||||
if len(violatingVictims) != 0 && len(nonViolatingVictims) != 0 {
|
||||
sort.Slice(victims, func(i, j int) bool { return util.MoreImportantPod(victims[i], victims[j]) })
|
||||
}
|
||||
return victims, numViolatingVictim, framework.NewStatus(framework.Success)
|
||||
}
|
||||
|
||||
// PodEligibleToPreemptOthers returns one bool and one string. The bool
|
||||
// indicates whether this pod should be considered for preempting other pods or
|
||||
// not. The string includes the reason if this pod isn't eligible.
|
||||
// There are several reasons:
|
||||
// 1. The pod has a preemptionPolicy of Never.
|
||||
// 2. The pod has already preempted other pods and the victims are in their graceful termination period.
|
||||
// Currently we check the node that is nominated for this pod, and as long as there are
|
||||
// terminating pods on this node, we don't attempt to preempt more pods.
|
||||
func (pl *DefaultPreemption) PodEligibleToPreemptOthers(_ context.Context, pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string) {
|
||||
if pod.Spec.PreemptionPolicy != nil && *pod.Spec.PreemptionPolicy == v1.PreemptNever {
|
||||
return false, "not eligible due to preemptionPolicy=Never."
|
||||
}
|
||||
|
||||
nodeInfos := pl.fh.SnapshotSharedLister().NodeInfos()
|
||||
nomNodeName := pod.Status.NominatedNodeName
|
||||
if len(nomNodeName) > 0 {
|
||||
// If the pod's nominated node is considered as UnschedulableAndUnresolvable by the filters,
|
||||
// then the pod should be considered for preempting again.
|
||||
if nominatedNodeStatus.Code() == framework.UnschedulableAndUnresolvable {
|
||||
return true, ""
|
||||
}
|
||||
|
||||
if nodeInfo, _ := nodeInfos.Get(nomNodeName); nodeInfo != nil {
|
||||
podPriority := corev1helpers.PodPriority(pod)
|
||||
for _, p := range nodeInfo.Pods {
|
||||
if corev1helpers.PodPriority(p.Pod) < podPriority && podTerminatingByPreemption(p.Pod) {
|
||||
// There is a terminating pod on the nominated node.
|
||||
return false, "not eligible due to a terminating pod on the nominated node."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true, ""
|
||||
}
|
||||
|
||||
// OrderedScoreFuncs returns a list of ordered score functions to select preferable node where victims will be preempted.
|
||||
func (pl *DefaultPreemption) OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// podTerminatingByPreemption returns true if the pod is in the termination state caused by scheduler preemption.
|
||||
func podTerminatingByPreemption(p *v1.Pod) bool {
|
||||
if p.DeletionTimestamp == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, condition := range p.Status.Conditions {
|
||||
if condition.Type == v1.DisruptionTarget {
|
||||
return condition.Status == v1.ConditionTrue && condition.Reason == v1.PodReasonPreemptionByScheduler
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods"
|
||||
// and "nonViolatingPods" based on whether their PDBs will be violated if they are
|
||||
// preempted.
|
||||
// This function is stable and does not change the order of received pods. So, if it
|
||||
// receives a sorted list, grouping will preserve the order of the input list.
|
||||
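// For example (illustrative): if a matching PDB currently allows one disruption
// and two candidate pods match it, the first matching pod ends up in the
// non-violating group and the second in the violating group, because the
// allowed budget is decremented once per matching pod.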
func filterPodsWithPDBViolation(podInfos []*framework.PodInfo, pdbs []*policy.PodDisruptionBudget) (violatingPodInfos, nonViolatingPodInfos []*framework.PodInfo) {
|
||||
pdbsAllowed := make([]int32, len(pdbs))
|
||||
for i, pdb := range pdbs {
|
||||
pdbsAllowed[i] = pdb.Status.DisruptionsAllowed
|
||||
}
|
||||
|
||||
for _, podInfo := range podInfos {
|
||||
pod := podInfo.Pod
|
||||
pdbForPodIsViolated := false
|
||||
// A pod with no labels will not match any PDB. So, no need to check.
|
||||
if len(pod.Labels) != 0 {
|
||||
for i, pdb := range pdbs {
|
||||
if pdb.Namespace != pod.Namespace {
|
||||
continue
|
||||
}
|
||||
selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector)
|
||||
if err != nil {
|
||||
// This object has an invalid selector; it does not match the pod
|
||||
continue
|
||||
}
|
||||
// A PDB with a nil or empty selector matches nothing.
|
||||
if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Existing in DisruptedPods means it has been processed by the API server,
|
||||
// so we don't treat it as a violating case.
|
||||
if _, exist := pdb.Status.DisruptedPods[pod.Name]; exist {
|
||||
continue
|
||||
}
|
||||
// Only decrement the matched pdb when it's not in its <DisruptedPods>;
|
||||
// otherwise we may over-decrement the budget number.
|
||||
pdbsAllowed[i]--
|
||||
// We have found a matching PDB.
|
||||
if pdbsAllowed[i] < 0 {
|
||||
pdbForPodIsViolated = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if pdbForPodIsViolated {
|
||||
violatingPodInfos = append(violatingPodInfos, podInfo)
|
||||
} else {
|
||||
nonViolatingPodInfos = append(nonViolatingPodInfos, podInfo)
|
||||
}
|
||||
}
|
||||
return violatingPodInfos, nonViolatingPodInfos
|
||||
}
|
||||
|
||||
func getPDBLister(informerFactory informers.SharedInformerFactory) policylisters.PodDisruptionBudgetLister {
|
||||
return informerFactory.Policy().V1().PodDisruptionBudgets().Lister()
|
||||
}
|
9
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources/OWNERS
generated
vendored
Normal file
9
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources/OWNERS
generated
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
# See the OWNERS docs at https://go.k8s.io/owners
|
||||
|
||||
reviewers:
|
||||
- klueska
|
||||
- pohly
|
||||
- bart0sh
|
||||
labels:
|
||||
- sig/node
|
||||
- wg/device-management
|
175
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources/allocateddevices.go
generated
vendored
Normal file
175
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources/allocateddevices.go
generated
vendored
Normal file
@ -0,0 +1,175 @@
|
||||
/*
|
||||
Copyright 2024 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package dynamicresources
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
resourceapi "k8s.io/api/resource/v1beta1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/client-go/tools/cache"
|
||||
"k8s.io/dynamic-resource-allocation/structured"
|
||||
"k8s.io/klog/v2"
|
||||
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
|
||||
"k8s.io/utils/ptr"
|
||||
)
|
||||
|
||||
// foreachAllocatedDevice invokes the provided callback for each
|
||||
// device in the claim's allocation result which was allocated
|
||||
// exclusively for the claim.
|
||||
//
|
||||
// Devices allocated with admin access can be shared with other
|
||||
// claims and are skipped without invoking the callback.
|
||||
//
|
||||
// foreachAllocatedDevice does nothing if the claim is not allocated.
|
||||
func foreachAllocatedDevice(claim *resourceapi.ResourceClaim, cb func(deviceID structured.DeviceID)) {
|
||||
if claim.Status.Allocation == nil {
|
||||
return
|
||||
}
|
||||
for _, result := range claim.Status.Allocation.Devices.Results {
|
||||
// Kubernetes 1.31 did not set this; 1.32 always does.
|
||||
// Supporting 1.31 is not worth the additional code that
|
||||
// would have to be written (= looking up in request) because
|
||||
// it is extremely unlikely that there really is a result
|
||||
// that still exists in a cluster from 1.31 where this matters.
|
||||
if ptr.Deref(result.AdminAccess, false) {
|
||||
// Is not considered as allocated.
|
||||
continue
|
||||
}
|
||||
deviceID := structured.MakeDeviceID(result.Driver, result.Pool, result.Device)
|
||||
|
||||
// None of the users of this helper need to abort iterating,
|
||||
// therefore it's not supported, as it would only add overhead.
|
||||
cb(deviceID)
|
||||
}
|
||||
}
|
||||
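// Illustrative usage sketch, collecting the exclusively allocated devices of a
// claim into a set:
//
//	ids := sets.New[structured.DeviceID]()
//	foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
//		ids.Insert(deviceID)
//	})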
|
||||
// allocatedDevices reacts to events in a cache and maintains a set of all allocated devices.
|
||||
// This is cheaper than repeatedly calling List, making strings unique, and building the set
|
||||
// each time PreFilter is called.
|
||||
//
|
||||
// All methods are thread-safe. Get returns a cloned set.
|
||||
type allocatedDevices struct {
|
||||
logger klog.Logger
|
||||
|
||||
mutex sync.RWMutex
|
||||
ids sets.Set[structured.DeviceID]
|
||||
}
|
||||
|
||||
func newAllocatedDevices(logger klog.Logger) *allocatedDevices {
|
||||
return &allocatedDevices{
|
||||
logger: logger,
|
||||
ids: sets.New[structured.DeviceID](),
|
||||
}
|
||||
}
|
||||
|
||||
func (a *allocatedDevices) Get() sets.Set[structured.DeviceID] {
|
||||
a.mutex.RLock()
|
||||
defer a.mutex.RUnlock()
|
||||
|
||||
return a.ids.Clone()
|
||||
}
|
||||
|
||||
func (a *allocatedDevices) handlers() cache.ResourceEventHandler {
|
||||
return cache.ResourceEventHandlerFuncs{
|
||||
AddFunc: a.onAdd,
|
||||
UpdateFunc: a.onUpdate,
|
||||
DeleteFunc: a.onDelete,
|
||||
}
|
||||
}
|
||||
|
||||
func (a *allocatedDevices) onAdd(obj any) {
|
||||
claim, _, err := schedutil.As[*resourceapi.ResourceClaim](obj, nil)
|
||||
if err != nil {
|
||||
// Shouldn't happen.
|
||||
a.logger.Error(err, "unexpected object in allocatedDevices.onAdd")
|
||||
return
|
||||
}
|
||||
|
||||
if claim.Status.Allocation != nil {
|
||||
a.addDevices(claim)
|
||||
}
|
||||
}
|
||||
|
||||
func (a *allocatedDevices) onUpdate(oldObj, newObj any) {
|
||||
originalClaim, modifiedClaim, err := schedutil.As[*resourceapi.ResourceClaim](oldObj, newObj)
|
||||
if err != nil {
|
||||
// Shouldn't happen.
|
||||
a.logger.Error(err, "unexpected object in allocatedDevices.onUpdate")
|
||||
return
|
||||
}
|
||||
|
||||
switch {
|
||||
case originalClaim.Status.Allocation == nil && modifiedClaim.Status.Allocation != nil:
|
||||
a.addDevices(modifiedClaim)
|
||||
case originalClaim.Status.Allocation != nil && modifiedClaim.Status.Allocation == nil:
|
||||
a.removeDevices(originalClaim)
|
||||
default:
|
||||
// Nothing to do. Either both nil or both non-nil, in which case the content
|
||||
// also must be the same (immutable!).
|
||||
}
|
||||
}
|
||||
|
||||
func (a *allocatedDevices) onDelete(obj any) {
|
||||
claim, _, err := schedutil.As[*resourceapi.ResourceClaim](obj, nil)
|
||||
if err != nil {
|
||||
// Shouldn't happen.
|
||||
a.logger.Error(err, "unexpected object in allocatedDevices.onDelete")
|
||||
return
|
||||
}
|
||||
|
||||
a.removeDevices(claim)
|
||||
}
|
||||
|
||||
func (a *allocatedDevices) addDevices(claim *resourceapi.ResourceClaim) {
|
||||
if claim.Status.Allocation == nil {
|
||||
return
|
||||
}
|
||||
// Locking of the mutex gets minimized by pre-computing what needs to be done
|
||||
// without holding the lock.
|
||||
deviceIDs := make([]structured.DeviceID, 0, 20)
|
||||
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
|
||||
a.logger.V(6).Info("Observed device allocation", "device", deviceID, "claim", klog.KObj(claim))
|
||||
deviceIDs = append(deviceIDs, deviceID)
|
||||
})
|
||||
|
||||
a.mutex.Lock()
|
||||
defer a.mutex.Unlock()
|
||||
for _, deviceID := range deviceIDs {
|
||||
a.ids.Insert(deviceID)
|
||||
}
|
||||
}
|
||||
|
||||
func (a *allocatedDevices) removeDevices(claim *resourceapi.ResourceClaim) {
|
||||
if claim.Status.Allocation == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Locking of the mutex gets minimized by pre-computing what needs to be done
|
||||
// without holding the lock.
|
||||
deviceIDs := make([]structured.DeviceID, 0, 20)
|
||||
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
|
||||
a.logger.V(6).Info("Observed device deallocation", "device", deviceID, "claim", klog.KObj(claim))
|
||||
deviceIDs = append(deviceIDs, deviceID)
|
||||
})
|
||||
|
||||
a.mutex.Lock()
|
||||
defer a.mutex.Unlock()
|
||||
for _, deviceID := range deviceIDs {
|
||||
a.ids.Delete(deviceID)
|
||||
}
|
||||
}
|
226
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources/dra_manager.go
generated
vendored
Normal file
226
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources/dra_manager.go
generated
vendored
Normal file
@ -0,0 +1,226 @@
|
||||
/*
|
||||
Copyright 2024 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package dynamicresources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
resourceapi "k8s.io/api/resource/v1beta1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/client-go/informers"
|
||||
resourcelisters "k8s.io/client-go/listers/resource/v1beta1"
|
||||
"k8s.io/dynamic-resource-allocation/structured"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
|
||||
)
|
||||
|
||||
var _ framework.SharedDRAManager = &DefaultDRAManager{}
|
||||
|
||||
// DefaultDRAManager is the default implementation of SharedDRAManager. It obtains the DRA objects
|
||||
// from API informers, and uses an AssumeCache and a map of in-flight allocations in order
|
||||
// to avoid race conditions when modifying ResourceClaims.
|
||||
type DefaultDRAManager struct {
|
||||
resourceClaimTracker *claimTracker
|
||||
resourceSliceLister *resourceSliceLister
|
||||
deviceClassLister *deviceClassLister
|
||||
}
|
||||
|
||||
func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, informerFactory informers.SharedInformerFactory) *DefaultDRAManager {
|
||||
logger := klog.FromContext(ctx)
|
||||
manager := &DefaultDRAManager{
|
||||
resourceClaimTracker: &claimTracker{
|
||||
cache: claimsCache,
|
||||
inFlightAllocations: &sync.Map{},
|
||||
allocatedDevices: newAllocatedDevices(logger),
|
||||
logger: logger,
|
||||
},
|
||||
resourceSliceLister: &resourceSliceLister{sliceLister: informerFactory.Resource().V1beta1().ResourceSlices().Lister()},
|
||||
deviceClassLister: &deviceClassLister{classLister: informerFactory.Resource().V1beta1().DeviceClasses().Lister()},
|
||||
}
|
||||
|
||||
// Reacting to events is more efficient than iterating over the list
|
||||
// repeatedly in PreFilter.
|
||||
manager.resourceClaimTracker.cache.AddEventHandler(manager.resourceClaimTracker.allocatedDevices.handlers())
|
||||
|
||||
return manager
|
||||
}
|
||||
|
||||
func (s *DefaultDRAManager) ResourceClaims() framework.ResourceClaimTracker {
|
||||
return s.resourceClaimTracker
|
||||
}
|
||||
|
||||
func (s *DefaultDRAManager) ResourceSlices() framework.ResourceSliceLister {
|
||||
return s.resourceSliceLister
|
||||
}
|
||||
|
||||
func (s *DefaultDRAManager) DeviceClasses() framework.DeviceClassLister {
|
||||
return s.deviceClassLister
|
||||
}
|
||||
|
||||
var _ framework.ResourceSliceLister = &resourceSliceLister{}
|
||||
|
||||
type resourceSliceLister struct {
|
||||
sliceLister resourcelisters.ResourceSliceLister
|
||||
}
|
||||
|
||||
func (l *resourceSliceLister) List() ([]*resourceapi.ResourceSlice, error) {
|
||||
return l.sliceLister.List(labels.Everything())
|
||||
}
|
||||
|
||||
var _ framework.DeviceClassLister = &deviceClassLister{}
|
||||
|
||||
type deviceClassLister struct {
|
||||
classLister resourcelisters.DeviceClassLister
|
||||
}
|
||||
|
||||
func (l *deviceClassLister) Get(className string) (*resourceapi.DeviceClass, error) {
|
||||
return l.classLister.Get(className)
|
||||
}
|
||||
|
||||
func (l *deviceClassLister) List() ([]*resourceapi.DeviceClass, error) {
|
||||
return l.classLister.List(labels.Everything())
|
||||
}
|
||||
|
||||
var _ framework.ResourceClaimTracker = &claimTracker{}
|
||||
|
||||
type claimTracker struct {
|
||||
// cache enables temporarily storing a newer claim object
|
||||
// while the scheduler has allocated it and the corresponding object
|
||||
// update from the apiserver has not been processed by the claim
|
||||
// informer callbacks. Claim objects get added here in PreBind and removed by
|
||||
// the informer callback (based on the "newer than" comparison in the
|
||||
// assume cache).
|
||||
//
|
||||
// It uses cache.MetaNamespaceKeyFunc to generate object names, which
|
||||
// therefore are "<namespace>/<name>".
|
||||
//
|
||||
// This is necessary to ensure that reconstructing the resource usage
|
||||
// at the start of a pod scheduling cycle doesn't reuse the resources
|
||||
// assigned to such a claim. Alternatively, claim allocation state
|
||||
// could also get tracked across pod scheduling cycles, but that
|
||||
// - adds complexity (need to carefully sync state with informer events
|
||||
// for claims and ResourceSlices)
|
||||
// - would make integration with cluster autoscaler harder because it would need
|
||||
// to trigger informer callbacks.
|
||||
cache *assumecache.AssumeCache
|
||||
// inFlightAllocations is a map from claim UUIDs to claim objects for those claims
|
||||
// for which allocation was triggered during a scheduling cycle and the
|
||||
// corresponding claim status update call in PreBind has not been done
|
||||
// yet. If another pod needs the claim, the pod is treated as "not
|
||||
// schedulable yet". The cluster event for the claim status update will
|
||||
// make it schedulable.
|
||||
//
|
||||
// This mechanism avoids the following problem:
|
||||
// - Pod A triggers allocation for claim X.
|
||||
// - Pod B shares access to that claim and gets scheduled because
|
||||
// the claim is assumed to be allocated.
|
||||
// - PreBind for pod B is called first, tries to update reservedFor and
|
||||
// fails because the claim is not really allocated yet.
|
||||
//
|
||||
// We could avoid the ordering problem by allowing either pod A or pod B
|
||||
// to set the allocation. But that is more complicated and leads to another
|
||||
// problem:
|
||||
// - Pod A and B get scheduled as above.
|
||||
// - PreBind for pod A gets called first, then fails with a temporary API error.
|
||||
// It removes the updated claim from the assume cache because of that.
|
||||
// - PreBind for pod B gets called next and succeeds with adding the
|
||||
// allocation and its own reservedFor entry.
|
||||
// - The assume cache is now not reflecting that the claim is allocated,
|
||||
// which could lead to reusing the same resource for some other claim.
|
||||
//
|
||||
// A sync.Map is used because in practice sharing of a claim between
|
||||
// pods is expected to be rare compared to per-pod claims, so we end up
|
||||
// hitting the "multiple goroutines read, write, and overwrite entries
|
||||
// for disjoint sets of keys" case that sync.Map is optimized for.
|
||||
inFlightAllocations *sync.Map
|
||||
allocatedDevices *allocatedDevices
|
||||
logger klog.Logger
|
||||
}
|
||||
|
||||
func (c *claimTracker) ClaimHasPendingAllocation(claimUID types.UID) bool {
|
||||
_, found := c.inFlightAllocations.Load(claimUID)
|
||||
return found
|
||||
}
|
||||
|
||||
func (c *claimTracker) SignalClaimPendingAllocation(claimUID types.UID, allocatedClaim *resourceapi.ResourceClaim) error {
|
||||
c.inFlightAllocations.Store(claimUID, allocatedClaim)
|
||||
// There's no reason to return an error in this implementation, but the error is helpful for other implementations.
|
||||
// For example, implementations that have to deal with fake claims might want to return an error if the allocation
|
||||
// is for an invalid claim.
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *claimTracker) RemoveClaimPendingAllocation(claimUID types.UID) (deleted bool) {
|
||||
_, found := c.inFlightAllocations.LoadAndDelete(claimUID)
|
||||
return found
|
||||
}
|
||||
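A minimal usage sketch, not part of the vendored file, of how a scheduler plugin is expected to drive the in-flight map described above: the allocation is signalled in Reserve and removed again once PreBind has published the claim status or Unreserve has rolled the attempt back. publishStatus is a hypothetical stand-in for that PreBind API call.

// Illustrative sketch only; publishStatus is a hypothetical stand-in for the
// PreBind status update.
func inFlightLifecycleSketch(tracker *claimTracker, claim *resourceapi.ResourceClaim, publishStatus func() error) error {
	// Reserve: record the pending allocation so other pods sharing the claim
	// are treated as "not schedulable yet".
	if err := tracker.SignalClaimPendingAllocation(claim.UID, claim); err != nil {
		return err
	}
	if err := publishStatus(); err != nil {
		// Unreserve path: the allocation never reached the apiserver.
		tracker.RemoveClaimPendingAllocation(claim.UID)
		return err
	}
	// PreBind path: the status update is done; the in-flight entry can go away.
	tracker.RemoveClaimPendingAllocation(claim.UID)
	return nil
}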
|
||||
func (c *claimTracker) Get(namespace, claimName string) (*resourceapi.ResourceClaim, error) {
|
||||
obj, err := c.cache.Get(namespace + "/" + claimName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
claim, ok := obj.(*resourceapi.ResourceClaim)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("unexpected object type %T for assumed object %s/%s", obj, namespace, claimName)
|
||||
}
|
||||
return claim, nil
|
||||
}
|
||||
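Because the assume cache keys objects with cache.MetaNamespaceKeyFunc, Get above can simply join namespace and name. A small sketch, not part of the vendored file, assuming the usual k8s.io/client-go/tools/cache and metav1 imports:

// Illustrative sketch only: the cache key for a claim is "<namespace>/<name>".
func claimKeySketch() (string, error) {
	claim := &resourceapi.ResourceClaim{
		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "my-claim"},
	}
	// Same key function the assume cache uses; returns "default/my-claim".
	return cache.MetaNamespaceKeyFunc(claim)
}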
|
||||
func (c *claimTracker) List() ([]*resourceapi.ResourceClaim, error) {
|
||||
var result []*resourceapi.ResourceClaim
|
||||
// Probably not worth adding an index for?
|
||||
objs := c.cache.List(nil)
|
||||
for _, obj := range objs {
|
||||
claim, ok := obj.(*resourceapi.ResourceClaim)
|
||||
if ok {
|
||||
result = append(result, claim)
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (c *claimTracker) ListAllAllocatedDevices() (sets.Set[structured.DeviceID], error) {
|
||||
// Start with a fresh set that matches the current known state of the
|
||||
// world according to the informers.
|
||||
allocated := c.allocatedDevices.Get()
|
||||
|
||||
// Whatever is in flight also has to be checked.
|
||||
c.inFlightAllocations.Range(func(key, value any) bool {
|
||||
claim := value.(*resourceapi.ResourceClaim)
|
||||
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
|
||||
c.logger.V(6).Info("Device is in flight for allocation", "device", deviceID, "claim", klog.KObj(claim))
|
||||
allocated.Insert(deviceID)
|
||||
})
|
||||
return true
|
||||
})
|
||||
// There's no reason to return an error in this implementation, but the error might be helpful for other implementations.
|
||||
return allocated, nil
|
||||
}
|
||||
|
||||
func (c *claimTracker) AssumeClaimAfterAPICall(claim *resourceapi.ResourceClaim) error {
|
||||
return c.cache.Assume(claim)
|
||||
}
|
||||
|
||||
func (c *claimTracker) AssumedClaimRestore(namespace, claimName string) {
|
||||
c.cache.Restore(namespace + "/" + claimName)
|
||||
}
|
905
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
generated
vendored
Normal file
@ -0,0 +1,905 @@
|
||||
/*
|
||||
Copyright 2022 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package dynamicresources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"sync"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
resourceapi "k8s.io/api/resource/v1beta1"
|
||||
apiequality "k8s.io/apimachinery/pkg/api/equality"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/util/retry"
|
||||
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
|
||||
"k8s.io/dynamic-resource-allocation/cel"
|
||||
"k8s.io/dynamic-resource-allocation/resourceclaim"
|
||||
"k8s.io/dynamic-resource-allocation/structured"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
const (
|
||||
// Name is the name of the plugin used in Registry and configurations.
|
||||
Name = names.DynamicResources
|
||||
|
||||
stateKey framework.StateKey = Name
|
||||
)
|
||||
|
||||
// The state is initialized in PreFilter phase. Because we save the pointer in
|
||||
// framework.CycleState, in the later phases we don't need to call Write method
|
||||
// to update the value.
|
||||
type stateData struct {
|
||||
// A copy of all claims for the Pod (i.e. 1:1 match with
|
||||
// pod.Spec.ResourceClaims), initially with the status from the start
|
||||
// of the scheduling cycle. Each claim instance is read-only because it
|
||||
// might come from the informer cache. The instances get replaced when
|
||||
// the plugin itself successfully does an Update.
|
||||
//
|
||||
// Empty if the Pod has no claims.
|
||||
claims []*resourceapi.ResourceClaim
|
||||
|
||||
// Allocator handles claims with structured parameters.
|
||||
allocator *structured.Allocator
|
||||
|
||||
// mutex must be locked while accessing any of the fields below.
|
||||
mutex sync.Mutex
|
||||
|
||||
// The indices of all claims that:
|
||||
// - are allocated
|
||||
// - use delayed allocation or the builtin controller
|
||||
// - were not available on at least one node
|
||||
//
|
||||
// Set in parallel during Filter, so write access there must be
|
||||
// protected by the mutex. Used by PostFilter.
|
||||
unavailableClaims sets.Set[int]
|
||||
|
||||
informationsForClaim []informationForClaim
|
||||
|
||||
// nodeAllocations caches the result of Filter for the nodes.
|
||||
nodeAllocations map[string][]resourceapi.AllocationResult
|
||||
}
|
||||
|
||||
func (d *stateData) Clone() framework.StateData {
|
||||
return d
|
||||
}
|
||||
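A short sketch, not part of the vendored file, of the write-once/read-many pattern described above: the pointer written in PreFilter is shared, so later phases mutate the same stateData under its mutex instead of calling Write again.

// Illustrative sketch only.
func stateSharingSketch(cs *framework.CycleState) error {
	s := &stateData{}
	cs.Write(stateKey, s) // PreFilter

	obj, err := cs.Read(stateKey) // e.g. in Filter
	if err != nil {
		return err
	}
	shared := obj.(*stateData) // same pointer as s
	shared.mutex.Lock()
	shared.unavailableClaims = sets.New[int]() // visible to every reader of s
	shared.mutex.Unlock()
	return nil
}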
|
||||
type informationForClaim struct {
|
||||
// Node selector based on the claim status if allocated.
|
||||
availableOnNodes *nodeaffinity.NodeSelector
|
||||
|
||||
// Set by Reserved, published by PreBind.
|
||||
allocation *resourceapi.AllocationResult
|
||||
}
|
||||
|
||||
// DynamicResources is a plugin that ensures that ResourceClaims are allocated.
|
||||
type DynamicResources struct {
|
||||
enabled bool
|
||||
enableAdminAccess bool
|
||||
enableSchedulingQueueHint bool
|
||||
|
||||
fh framework.Handle
|
||||
clientset kubernetes.Interface
|
||||
celCache *cel.Cache
|
||||
draManager framework.SharedDRAManager
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
if !fts.EnableDynamicResourceAllocation {
|
||||
// Disabled, won't do anything.
|
||||
return &DynamicResources{}, nil
|
||||
}
|
||||
|
||||
pl := &DynamicResources{
|
||||
enabled: true,
|
||||
enableAdminAccess: fts.EnableDRAAdminAccess,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
|
||||
fh: fh,
|
||||
clientset: fh.ClientSet(),
|
||||
// This is an LRU cache for compiled CEL expressions. The most
|
||||
// recent 10 of them get reused across different scheduling
|
||||
// cycles.
|
||||
celCache: cel.NewCache(10),
|
||||
draManager: fh.SharedDRAManager(),
|
||||
}
|
||||
|
||||
return pl, nil
|
||||
}
|
||||
|
||||
var _ framework.PreEnqueuePlugin = &DynamicResources{}
|
||||
var _ framework.PreFilterPlugin = &DynamicResources{}
|
||||
var _ framework.FilterPlugin = &DynamicResources{}
|
||||
var _ framework.PostFilterPlugin = &DynamicResources{}
|
||||
var _ framework.ReservePlugin = &DynamicResources{}
|
||||
var _ framework.EnqueueExtensions = &DynamicResources{}
|
||||
var _ framework.PreBindPlugin = &DynamicResources{}
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *DynamicResources) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *DynamicResources) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
if !pl.enabled {
|
||||
return nil, nil
|
||||
}
|
||||
// A resource might depend on node labels for topology filtering.
|
||||
// A new or updated node may make pods schedulable.
|
||||
//
|
||||
// A note about UpdateNodeTaint event:
|
||||
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
|
||||
if pl.enableSchedulingQueueHint {
|
||||
// When QHint is enabled, the problematic preCheck is already removed, and we can remove UpdateNodeTaint.
|
||||
nodeActionType = framework.Add | framework.UpdateNodeLabel
|
||||
}
|
||||
|
||||
events := []framework.ClusterEventWithHint{
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
|
||||
// Allocation is tracked in ResourceClaims, so any changes may make the pods schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.ResourceClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClaimChange},
|
||||
// Adding the ResourceClaim name to the pod status makes pods waiting for their ResourceClaim schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodGeneratedResourceClaim}, QueueingHintFn: pl.isSchedulableAfterPodChange},
|
||||
// A pod might be waiting for a class to get created or modified.
|
||||
{Event: framework.ClusterEvent{Resource: framework.DeviceClass, ActionType: framework.Add | framework.Update}},
|
||||
// Adding or updating a ResourceSlice might make a pod schedulable because new resources became available.
|
||||
{Event: framework.ClusterEvent{Resource: framework.ResourceSlice, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterResourceSliceChange},
|
||||
}
|
||||
|
||||
return events, nil
|
||||
}
|
||||
|
||||
// PreEnqueue checks if there are known reasons why a pod currently cannot be
|
||||
// scheduled. When this fails, one of the registered events can trigger another
|
||||
// attempt.
|
||||
func (pl *DynamicResources) PreEnqueue(ctx context.Context, pod *v1.Pod) (status *framework.Status) {
|
||||
if !pl.enabled {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
|
||||
return statusUnschedulable(klog.FromContext(ctx), err.Error())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterClaimChange is invoked for add and update claim events reported by
|
||||
// an informer. It checks whether that change made a previously unschedulable
|
||||
// pod schedulable. It errs on the side of letting a pod scheduling attempt
|
||||
// happen. The delete claim event will not invoke it, so newObj will never be nil.
|
||||
func (pl *DynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalClaim, modifiedClaim, err := schedutil.As[*resourceapi.ResourceClaim](oldObj, newObj)
|
||||
if err != nil {
|
||||
// Shouldn't happen.
|
||||
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err)
|
||||
}
|
||||
|
||||
usesClaim := false
|
||||
if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourceapi.ResourceClaim) {
|
||||
if claim.UID == modifiedClaim.UID {
|
||||
usesClaim = true
|
||||
}
|
||||
}); err != nil {
|
||||
// This is not an unexpected error: we know that
|
||||
// foreachPodResourceClaim only returns errors for "not
|
||||
// schedulable".
|
||||
if loggerV := logger.V(6); loggerV.Enabled() {
|
||||
owner := metav1.GetControllerOf(modifiedClaim)
|
||||
loggerV.Info("pod is not schedulable after resource claim change", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "claimOwner", owner, "reason", err.Error())
|
||||
}
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
if originalClaim != nil &&
|
||||
originalClaim.Status.Allocation != nil &&
|
||||
modifiedClaim.Status.Allocation == nil {
|
||||
// A claim with structured parameters was deallocated. This might have made
|
||||
// resources available for other pods.
|
||||
logger.V(6).Info("claim with structured parameters got deallocated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
if !usesClaim {
|
||||
// This was not the claim the pod was waiting for.
|
||||
logger.V(6).Info("unrelated claim got modified", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
if originalClaim == nil {
|
||||
logger.V(5).Info("claim for pod got created", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// Modifications may or may not be relevant. If the entire
|
||||
// status is as before, then something else must have changed
|
||||
// and we don't care. What happens in practice is that the
|
||||
// resource driver adds the finalizer.
|
||||
if apiequality.Semantic.DeepEqual(&originalClaim.Status, &modifiedClaim.Status) {
|
||||
if loggerV := logger.V(7); loggerV.Enabled() {
|
||||
// Log more information.
|
||||
loggerV.Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "diff", cmp.Diff(originalClaim, modifiedClaim))
|
||||
} else {
|
||||
logger.V(6).Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
|
||||
}
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("status of claim for pod got updated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPodChange is invoked for update pod events reported by
|
||||
// an informer. It checks whether that change adds the ResourceClaim(s) that the
|
||||
// pod has been waiting for.
|
||||
func (pl *DynamicResources) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, modifiedPod, err := schedutil.As[*v1.Pod](nil, newObj)
|
||||
if err != nil {
|
||||
// Shouldn't happen.
|
||||
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterPodChange: %w", err)
|
||||
}
|
||||
|
||||
if pod.UID != modifiedPod.UID {
|
||||
logger.V(7).Info("pod is not schedulable after change in other pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
if err := pl.foreachPodResourceClaim(modifiedPod, nil); err != nil {
|
||||
// This is not an unexpected error: we know that
|
||||
// foreachPodResourceClaim only returns errors for "not
|
||||
// schedulable".
|
||||
logger.V(6).Info("pod is not schedulable after being updated", "pod", klog.KObj(pod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("pod got updated and is schedulable", "pod", klog.KObj(pod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterResourceSliceChange is invoked for add and update slice events reported by
|
||||
// an informer. Such changes can make an unschedulable pod schedulable when the pod requests a device
|
||||
// and the change adds a suitable device.
|
||||
//
|
||||
// For the sake of faster execution and avoiding code duplication, isSchedulableAfterResourceSliceChange
|
||||
// only checks whether the pod uses claims. All of the more detailed checks are done in the scheduling
|
||||
// attempt.
|
||||
//
|
||||
// The delete event will not invoke it, so newObj will never be nil.
|
||||
func (pl *DynamicResources) isSchedulableAfterResourceSliceChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, modifiedSlice, err := schedutil.As[*resourceapi.ResourceSlice](oldObj, newObj)
|
||||
if err != nil {
|
||||
// Shouldn't happen.
|
||||
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterResourceSliceChange: %w", err)
|
||||
}
|
||||
|
||||
if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
|
||||
// This is not an unexpected error: we know that
|
||||
// foreachPodResourceClaim only returns errors for "not
|
||||
// schedulable".
|
||||
logger.V(6).Info("pod is not schedulable after resource slice change", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(modifiedSlice), "reason", err.Error())
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// We could check what got changed in the slice, but right now that's likely to be
|
||||
// about the spec (there's no status yet...).
|
||||
// We could check whether all claims use classic DRA, but that doesn't seem worth it.
|
||||
// Let's assume that changing the slice may make the pod schedulable.
|
||||
logger.V(5).Info("ResourceSlice change might make pod schedulable", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(modifiedSlice))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// podResourceClaims returns the ResourceClaims for all pod.Spec.ResourceClaims.
|
||||
func (pl *DynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourceapi.ResourceClaim, error) {
|
||||
claims := make([]*resourceapi.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
|
||||
if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourceapi.ResourceClaim) {
|
||||
// We store the pointer as returned by the lister. The
|
||||
// assumption is that if a claim gets modified while our code
|
||||
// runs, the cache will store a new pointer, not mutate the
|
||||
// existing object that we point to here.
|
||||
claims = append(claims, claim)
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return claims, nil
|
||||
}
|
||||
|
||||
// foreachPodResourceClaim checks that each ResourceClaim for the pod exists.
|
||||
// It calls an optional handler for those claims that it finds.
|
||||
func (pl *DynamicResources) foreachPodResourceClaim(pod *v1.Pod, cb func(podResourceName string, claim *resourceapi.ResourceClaim)) error {
|
||||
for _, resource := range pod.Spec.ResourceClaims {
|
||||
claimName, mustCheckOwner, err := resourceclaim.Name(pod, &resource)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// The claim name might be nil if no underlying resource claim
|
||||
// was generated for the referenced claim. There are valid use
|
||||
// cases when this might happen, so we simply skip it.
|
||||
if claimName == nil {
|
||||
continue
|
||||
}
|
||||
claim, err := pl.draManager.ResourceClaims().Get(pod.Namespace, *claimName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if claim.DeletionTimestamp != nil {
|
||||
return fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
|
||||
}
|
||||
|
||||
if mustCheckOwner {
|
||||
if err := resourceclaim.IsForPod(pod, claim); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if cb != nil {
|
||||
cb(resource.Name, claim)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// PreFilter invoked at the prefilter extension point to check if pod has all
|
||||
// immediate claims bound. UnschedulableAndUnresolvable is returned if
|
||||
// the pod cannot be scheduled at the moment on any node.
|
||||
func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
if !pl.enabled {
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
logger := klog.FromContext(ctx)
|
||||
|
||||
// If the pod does not reference any claim, we don't need to do
|
||||
// anything for it. We just initialize an empty state to record that
|
||||
// observation for the other functions. This gets updated below
|
||||
// if we get that far.
|
||||
s := &stateData{}
|
||||
state.Write(stateKey, s)
|
||||
|
||||
claims, err := pl.podResourceClaims(pod)
|
||||
if err != nil {
|
||||
return nil, statusUnschedulable(logger, err.Error())
|
||||
}
|
||||
logger.V(5).Info("pod resource claims", "pod", klog.KObj(pod), "resourceclaims", klog.KObjSlice(claims))
|
||||
|
||||
// If the pod does not reference any claim,
|
||||
// DynamicResources Filter has nothing to do with the Pod.
|
||||
if len(claims) == 0 {
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
// All claims which the scheduler needs to allocate itself.
|
||||
allocateClaims := make([]*resourceapi.ResourceClaim, 0, len(claims))
|
||||
|
||||
s.informationsForClaim = make([]informationForClaim, len(claims))
|
||||
for index, claim := range claims {
|
||||
if claim.Status.Allocation != nil &&
|
||||
!resourceclaim.CanBeReserved(claim) &&
|
||||
!resourceclaim.IsReservedForPod(pod, claim) {
|
||||
// Resource is in use. The pod has to wait.
|
||||
return nil, statusUnschedulable(logger, "resourceclaim in use", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
|
||||
}
|
||||
|
||||
if claim.Status.Allocation != nil {
|
||||
if claim.Status.Allocation.NodeSelector != nil {
|
||||
nodeSelector, err := nodeaffinity.NewNodeSelector(claim.Status.Allocation.NodeSelector)
|
||||
if err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
s.informationsForClaim[index].availableOnNodes = nodeSelector
|
||||
}
|
||||
} else {
|
||||
allocateClaims = append(allocateClaims, claim)
|
||||
|
||||
// Allocation in flight? Better wait for that
|
||||
// to finish, see inFlightAllocations
|
||||
// documentation for details.
|
||||
if pl.draManager.ResourceClaims().ClaimHasPendingAllocation(claim.UID) {
|
||||
return nil, statusUnschedulable(logger, fmt.Sprintf("resource claim %s is in the process of being allocated", klog.KObj(claim)))
|
||||
}
|
||||
|
||||
// Check all requests and device classes. If a class
|
||||
// does not exist, scheduling cannot proceed, no matter
|
||||
// how the claim is being allocated.
|
||||
//
|
||||
// When using a control plane controller, a class might
|
||||
// have a node filter. This is useful for trimming the
|
||||
// initial set of potential nodes before we ask the
|
||||
// driver(s) for information about the specific pod.
|
||||
for _, request := range claim.Spec.Devices.Requests {
|
||||
if request.DeviceClassName == "" {
|
||||
return nil, statusError(logger, fmt.Errorf("request %s: unsupported request type", request.Name))
|
||||
}
|
||||
|
||||
_, err := pl.draManager.DeviceClasses().Get(request.DeviceClassName)
|
||||
if err != nil {
|
||||
// If the class cannot be retrieved, allocation cannot proceed.
|
||||
if apierrors.IsNotFound(err) {
|
||||
// Here we mark the pod as "unschedulable", so it'll sleep in
|
||||
// the unschedulable queue until a DeviceClass event occurs.
|
||||
return nil, statusUnschedulable(logger, fmt.Sprintf("request %s: device class %s does not exist", request.Name, request.DeviceClassName))
|
||||
}
|
||||
// Other error, retry with backoff.
|
||||
return nil, statusError(logger, fmt.Errorf("request %s: look up device class: %w", request.Name, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(allocateClaims) > 0 {
|
||||
logger.V(5).Info("Preparing allocation with structured parameters", "pod", klog.KObj(pod), "resourceclaims", klog.KObjSlice(allocateClaims))
|
||||
|
||||
// Doing this over and over again for each pod could be avoided
|
||||
// by setting the allocator up once and then keeping it up-to-date
|
||||
// as changes are observed.
|
||||
//
|
||||
// But that would cause problems for using the plugin in the
|
||||
// Cluster Autoscaler. If this step here turns out to be
|
||||
// expensive, we may have to maintain and update state more
|
||||
// persistently.
|
||||
//
|
||||
// Claims (and thus their devices) are treated as "allocated" if they are in the assume cache
|
||||
// or currently their allocation is in-flight. This does not change
|
||||
// during filtering, so we can determine that once.
|
||||
allAllocatedDevices, err := pl.draManager.ResourceClaims().ListAllAllocatedDevices()
|
||||
if err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
slices, err := pl.draManager.ResourceSlices().List()
|
||||
if err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
allocator, err := structured.NewAllocator(ctx, pl.enableAdminAccess, allocateClaims, allAllocatedDevices, pl.draManager.DeviceClasses(), slices, pl.celCache)
|
||||
if err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
s.allocator = allocator
|
||||
s.nodeAllocations = make(map[string][]resourceapi.AllocationResult)
|
||||
}
|
||||
|
||||
s.claims = claims
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions returns prefilter extensions, pod add and remove; this plugin does not implement them, so it returns nil.
|
||||
func (pl *DynamicResources) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
func getStateData(cs *framework.CycleState) (*stateData, error) {
|
||||
state, err := cs.Read(stateKey)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s, ok := state.(*stateData)
|
||||
if !ok {
|
||||
return nil, errors.New("unable to convert state into stateData")
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
// It evaluates if a pod can fit due to the resources it requests,
|
||||
// for both allocated and unallocated claims.
|
||||
//
|
||||
// For claims that are bound, then it checks that the node affinity is
|
||||
// satisfied by the given node.
|
||||
//
|
||||
// For claims that are unbound, it checks whether the claim might get allocated
|
||||
// for the node.
|
||||
func (pl *DynamicResources) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
if !pl.enabled {
|
||||
return nil
|
||||
}
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return statusError(klog.FromContext(ctx), err)
|
||||
}
|
||||
if len(state.claims) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
logger := klog.FromContext(ctx)
|
||||
node := nodeInfo.Node()
|
||||
|
||||
var unavailableClaims []int
|
||||
for index, claim := range state.claims {
|
||||
logger.V(10).Info("filtering based on resource claims of the pod", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
|
||||
|
||||
// This node selector only gets set if the claim is allocated.
|
||||
if nodeSelector := state.informationsForClaim[index].availableOnNodes; nodeSelector != nil && !nodeSelector.Match(node) {
|
||||
logger.V(5).Info("allocation's node selector does not match", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
|
||||
unavailableClaims = append(unavailableClaims, index)
|
||||
}
|
||||
}
|
||||
|
||||
// Use allocator to check the node and cache the result in case that the node is picked.
|
||||
var allocations []resourceapi.AllocationResult
|
||||
if state.allocator != nil {
|
||||
allocCtx := ctx
|
||||
if loggerV := logger.V(5); loggerV.Enabled() {
|
||||
allocCtx = klog.NewContext(allocCtx, klog.LoggerWithValues(logger, "node", klog.KObj(node)))
|
||||
}
|
||||
|
||||
a, err := state.allocator.Allocate(allocCtx, node)
|
||||
if err != nil {
|
||||
// This should only fail if there is something wrong with the claim or class.
|
||||
// Return an error to abort scheduling of it.
|
||||
//
|
||||
// This will cause retries. It would be slightly nicer to mark it as unschedulable
|
||||
// *and* abort scheduling. Then only cluster event for updating the claim or class
|
||||
// with the broken CEL expression would trigger rescheduling.
|
||||
//
|
||||
// But we cannot do both. As this shouldn't occur often, aborting like this is
|
||||
// better than the more complicated alternative (return Unschedulable here, remember
|
||||
// the error, then raise it again later if needed).
|
||||
return statusError(logger, err, "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaims", klog.KObjSlice(state.allocator.ClaimsToAllocate()))
|
||||
}
|
||||
// Check for exact length just to be sure. In practice this is all-or-nothing.
|
||||
if len(a) != len(state.allocator.ClaimsToAllocate()) {
|
||||
return statusUnschedulable(logger, "cannot allocate all claims", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaims", klog.KObjSlice(state.allocator.ClaimsToAllocate()))
|
||||
}
|
||||
// Reserve uses this information.
|
||||
allocations = a
|
||||
}
|
||||
|
||||
// Store information in state while holding the mutex.
|
||||
if state.allocator != nil || len(unavailableClaims) > 0 {
|
||||
state.mutex.Lock()
|
||||
defer state.mutex.Unlock()
|
||||
}
|
||||
|
||||
if len(unavailableClaims) > 0 {
|
||||
// Remember all unavailable claims. This might be observed
|
||||
// concurrently, so we have to lock the state before writing.
|
||||
|
||||
if state.unavailableClaims == nil {
|
||||
state.unavailableClaims = sets.New[int]()
|
||||
}
|
||||
|
||||
for _, index := range unavailableClaims {
|
||||
state.unavailableClaims.Insert(index)
|
||||
}
|
||||
return statusUnschedulable(logger, "resourceclaim not available on the node", "pod", klog.KObj(pod))
|
||||
}
|
||||
|
||||
if state.allocator != nil {
|
||||
state.nodeAllocations[node.Name] = allocations
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PostFilter checks whether there are allocated claims that could get
|
||||
// deallocated to help get the Pod schedulable. If yes, it picks one and
|
||||
// requests its deallocation. This only gets called when filtering found no
|
||||
// suitable node.
|
||||
func (pl *DynamicResources) PostFilter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, filteredNodeStatusMap framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
|
||||
if !pl.enabled {
|
||||
return nil, framework.NewStatus(framework.Unschedulable, "plugin disabled")
|
||||
}
|
||||
logger := klog.FromContext(ctx)
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
if len(state.claims) == 0 {
|
||||
return nil, framework.NewStatus(framework.Unschedulable, "no new claims to deallocate")
|
||||
}
|
||||
|
||||
// Iterating over a map is random. This is intentional here, we want to
|
||||
// pick one claim randomly because there is no better heuristic.
|
||||
for index := range state.unavailableClaims {
|
||||
claim := state.claims[index]
|
||||
if len(claim.Status.ReservedFor) == 0 ||
|
||||
len(claim.Status.ReservedFor) == 1 && claim.Status.ReservedFor[0].UID == pod.UID {
|
||||
claim := claim.DeepCopy()
|
||||
claim.Status.ReservedFor = nil
|
||||
claim.Status.Allocation = nil
|
||||
logger.V(5).Info("Deallocation of ResourceClaim", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
|
||||
if _, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
return nil, framework.NewStatus(framework.Unschedulable, "deallocation of ResourceClaim completed")
|
||||
}
|
||||
}
|
||||
return nil, framework.NewStatus(framework.Unschedulable, "still not schedulable")
|
||||
}
|
||||
|
||||
// Reserve reserves claims for the pod.
|
||||
func (pl *DynamicResources) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (status *framework.Status) {
|
||||
if !pl.enabled {
|
||||
return nil
|
||||
}
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return statusError(klog.FromContext(ctx), err)
|
||||
}
|
||||
if len(state.claims) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
logger := klog.FromContext(ctx)
|
||||
|
||||
numClaimsWithAllocator := 0
|
||||
for _, claim := range state.claims {
|
||||
if claim.Status.Allocation != nil {
|
||||
// Allocated, but perhaps not reserved yet. We checked in PreFilter that
|
||||
// the pod could reserve the claim. Instead of reserving here by
|
||||
// updating the ResourceClaim status, we assume that reserving
|
||||
// will work and only do it for real during binding. If it fails at
|
||||
// that time, some other pod was faster and we have to try again.
|
||||
continue
|
||||
}
|
||||
|
||||
numClaimsWithAllocator++
|
||||
}
|
||||
|
||||
if numClaimsWithAllocator == 0 {
|
||||
// Nothing left to do.
|
||||
return nil
|
||||
}
|
||||
|
||||
// Prepare allocation of claims handled by the scheduler.
|
||||
if state.allocator != nil {
|
||||
// Entries in these two slices match each other.
|
||||
claimsToAllocate := state.allocator.ClaimsToAllocate()
|
||||
allocations, ok := state.nodeAllocations[nodeName]
|
||||
if !ok {
|
||||
// We checked before that the node is suitable. This shouldn't have failed,
|
||||
// so treat this as an error.
|
||||
return statusError(logger, errors.New("claim allocation not found for node"))
|
||||
}
|
||||
|
||||
// Sanity check: do we have results for all pending claims?
|
||||
if len(allocations) != len(claimsToAllocate) ||
|
||||
len(allocations) != numClaimsWithAllocator {
|
||||
return statusError(logger, fmt.Errorf("internal error, have %d allocations, %d claims to allocate, want %d claims", len(allocations), len(claimsToAllocate), numClaimsWithAllocator))
|
||||
}
|
||||
|
||||
for i, claim := range claimsToAllocate {
|
||||
index := slices.Index(state.claims, claim)
|
||||
if index < 0 {
|
||||
return statusError(logger, fmt.Errorf("internal error, claim %s with allocation not found", claim.Name))
|
||||
}
|
||||
allocation := &allocations[i]
|
||||
state.informationsForClaim[index].allocation = allocation
|
||||
|
||||
// Strictly speaking, we don't need to store the full modified object.
|
||||
// The allocation would be enough. The full object is useful for
|
||||
// debugging, testing and the allocator, so let's make it realistic.
|
||||
claim = claim.DeepCopy()
|
||||
if !slices.Contains(claim.Finalizers, resourceapi.Finalizer) {
|
||||
claim.Finalizers = append(claim.Finalizers, resourceapi.Finalizer)
|
||||
}
|
||||
claim.Status.Allocation = allocation
|
||||
err := pl.draManager.ResourceClaims().SignalClaimPendingAllocation(claim.UID, claim)
|
||||
if err != nil {
|
||||
return statusError(logger, fmt.Errorf("internal error, couldn't signal allocation for claim %s", claim.Name))
|
||||
}
|
||||
logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "allocation", klog.Format(allocation))
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Unreserve clears the ReservedFor field for all claims.
|
||||
// It's idempotent, and does nothing if no state is found for the given pod.
|
||||
func (pl *DynamicResources) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
|
||||
if !pl.enabled {
|
||||
return
|
||||
}
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if len(state.claims) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
logger := klog.FromContext(ctx)
|
||||
|
||||
for index, claim := range state.claims {
|
||||
// If allocation was in-flight, then it's not anymore and we need to revert the
|
||||
// claim object in the assume cache to what it was before.
|
||||
if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(state.claims[index].UID); deleted {
|
||||
pl.draManager.ResourceClaims().AssumedClaimRestore(claim.Namespace, claim.Name)
|
||||
}
|
||||
|
||||
if claim.Status.Allocation != nil &&
|
||||
resourceclaim.IsReservedForPod(pod, claim) {
|
||||
// Remove pod from ReservedFor. A strategic-merge-patch is used
|
||||
// because that allows removing an individual entry without having
|
||||
// the latest slice.
|
||||
patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": { "reservedFor": [ {"$patch": "delete", "uid": %q} ] }}`,
|
||||
claim.UID,
|
||||
pod.UID,
|
||||
)
|
||||
logger.V(5).Info("unreserve", "resourceclaim", klog.KObj(claim), "pod", klog.KObj(pod))
|
||||
claim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
|
||||
if err != nil {
|
||||
// We will get here again when pod scheduling is retried.
|
||||
logger.Error(err, "unreserve", "resourceclaim", klog.KObj(claim))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
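The patch built above relies on strategic merge semantics: the "$patch": "delete" directive removes the single reservedFor entry whose uid matches, without resending the whole slice. A sketch with placeholder UIDs, not part of the vendored file:

// Illustrative sketch only; "claim-uid" and "pod-uid" are placeholder values.
func reservedForPatchSketch() []byte {
	patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": { "reservedFor": [ {"$patch": "delete", "uid": %q} ] }}`,
		"claim-uid", "pod-uid")
	// Sent with types.StrategicMergePatchType against the "status" subresource,
	// this removes only the matching reservedFor entry.
	return []byte(patch)
}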
|
||||
// PreBind gets called in a separate goroutine after it has been determined
|
||||
// that the pod should get bound to this node. Because Reserve did not actually
|
||||
// reserve claims, we need to do it now. For claims with the builtin controller,
|
||||
// we also handle the allocation.
|
||||
//
|
||||
// If anything fails, we return an error and
|
||||
// the pod will have to go into the backoff queue. The scheduler will call
|
||||
// Unreserve as part of the error handling.
|
||||
func (pl *DynamicResources) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
|
||||
if !pl.enabled {
|
||||
return nil
|
||||
}
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return statusError(klog.FromContext(ctx), err)
|
||||
}
|
||||
if len(state.claims) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
logger := klog.FromContext(ctx)
|
||||
|
||||
for index, claim := range state.claims {
|
||||
if !resourceclaim.IsReservedForPod(pod, claim) {
|
||||
claim, err := pl.bindClaim(ctx, state, index, pod, nodeName)
|
||||
if err != nil {
|
||||
return statusError(logger, err)
|
||||
}
|
||||
state.claims[index] = claim
|
||||
}
|
||||
}
|
||||
// If we get here, we know that reserving the claim for
|
||||
// the pod worked and we can proceed with binding it.
|
||||
return nil
|
||||
}
|
||||
|
||||
// bindClaim gets called by PreBind for claim which is not reserved for the pod yet.
|
||||
// It might not even be allocated. bindClaim then ensures that the allocation
|
||||
// and reservation are recorded. This finishes the work started in Reserve.
|
||||
func (pl *DynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourceapi.ResourceClaim, finalErr error) {
|
||||
logger := klog.FromContext(ctx)
|
||||
claim := state.claims[index].DeepCopy()
|
||||
allocation := state.informationsForClaim[index].allocation
|
||||
defer func() {
|
||||
if allocation != nil {
|
||||
// The scheduler was handling allocation. Now that has
|
||||
// completed, either successfully or with a failure.
|
||||
if finalErr == nil {
|
||||
// This can fail, but only for reasons that are okay (concurrent delete or update).
|
||||
// Shouldn't happen in this case.
|
||||
if err := pl.draManager.ResourceClaims().AssumeClaimAfterAPICall(claim); err != nil {
|
||||
logger.V(5).Info("Claim not stored in assume cache", "err", finalErr)
|
||||
}
|
||||
}
|
||||
pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claim.UID)
|
||||
}
|
||||
}()
|
||||
|
||||
logger.V(5).Info("preparing claim status update", "claim", klog.KObj(state.claims[index]), "allocation", klog.Format(allocation))
|
||||
|
||||
// We may run into a ResourceVersion conflict because there may be some
|
||||
// benign concurrent changes. In that case we get the latest claim and
|
||||
// try again.
|
||||
refreshClaim := false
|
||||
retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
|
||||
if refreshClaim {
|
||||
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Get(ctx, claim.Name, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("get updated claim %s after conflict: %w", klog.KObj(claim), err)
|
||||
}
|
||||
logger.V(5).Info("retrying update after conflict", "claim", klog.KObj(claim))
|
||||
claim = updatedClaim
|
||||
} else {
|
||||
// All future retries must get a new claim first.
|
||||
refreshClaim = true
|
||||
}
|
||||
|
||||
if claim.DeletionTimestamp != nil {
|
||||
return fmt.Errorf("claim %s got deleted in the meantime", klog.KObj(claim))
|
||||
}
|
||||
|
||||
// Do we need to store an allocation result from Reserve?
|
||||
if allocation != nil {
|
||||
if claim.Status.Allocation != nil {
|
||||
return fmt.Errorf("claim %s got allocated elsewhere in the meantime", klog.KObj(claim))
|
||||
}
|
||||
|
||||
// The finalizer needs to be added in a normal update.
|
||||
// If we were interrupted in the past, it might already be set and we simply continue.
|
||||
if !slices.Contains(claim.Finalizers, resourceapi.Finalizer) {
|
||||
claim.Finalizers = append(claim.Finalizers, resourceapi.Finalizer)
|
||||
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("add finalizer to claim %s: %w", klog.KObj(claim), err)
|
||||
}
|
||||
claim = updatedClaim
|
||||
}
|
||||
claim.Status.Allocation = allocation
|
||||
}
|
||||
|
||||
// We can simply try to add the pod here without checking
|
||||
// preconditions. The apiserver will tell us with a
|
||||
// non-conflict error if this isn't possible.
|
||||
claim.Status.ReservedFor = append(claim.Status.ReservedFor, resourceapi.ResourceClaimConsumerReference{Resource: "pods", Name: pod.Name, UID: pod.UID})
|
||||
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
if allocation != nil {
|
||||
return fmt.Errorf("add allocation and reservation to claim %s: %w", klog.KObj(claim), err)
|
||||
}
|
||||
return fmt.Errorf("add reservation to claim %s: %w", klog.KObj(claim), err)
|
||||
}
|
||||
claim = updatedClaim
|
||||
return nil
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
return nil, retryErr
|
||||
}
|
||||
|
||||
logger.V(5).Info("reserved", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.Format(claim))
|
||||
return claim, nil
|
||||
}
|
||||
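The conflict handling above is the standard client-go pattern: retry.RetryOnConflict re-runs the closure on ResourceVersion conflicts, and the closure refreshes the object before every attempt after the first. A condensed sketch of the same pattern, not part of the vendored file; mutate is a hypothetical callback supplied by the caller.

// Illustrative sketch only; mutate is a hypothetical callback.
func updateClaimStatusWithRetry(ctx context.Context, clientset kubernetes.Interface, claim *resourceapi.ResourceClaim, mutate func(*resourceapi.ResourceClaim)) error {
	refresh := false
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		if refresh {
			// A previous attempt hit a conflict, so work on the latest object.
			latest, err := clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Get(ctx, claim.Name, metav1.GetOptions{})
			if err != nil {
				return err
			}
			claim = latest
		}
		refresh = true
		mutate(claim)
		updated, err := clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
		if err != nil {
			return err
		}
		claim = updated
		return nil
	})
}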
|
||||
// statusUnschedulable ensures that there is a log message associated with the
|
||||
// line where the status originated.
|
||||
func statusUnschedulable(logger klog.Logger, reason string, kv ...interface{}) *framework.Status {
|
||||
if loggerV := logger.V(5); loggerV.Enabled() {
|
||||
helper, loggerV := loggerV.WithCallStackHelper()
|
||||
helper()
|
||||
kv = append(kv, "reason", reason)
|
||||
// nolint: logcheck // warns because it cannot check key/values
|
||||
loggerV.Info("pod unschedulable", kv...)
|
||||
}
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, reason)
|
||||
}
|
||||
|
||||
// statusError ensures that there is a log message associated with the
|
||||
// line where the error originated.
|
||||
func statusError(logger klog.Logger, err error, kv ...interface{}) *framework.Status {
|
||||
if loggerV := logger.V(5); loggerV.Enabled() {
|
||||
helper, loggerV := loggerV.WithCallStackHelper()
|
||||
helper()
|
||||
// nolint: logcheck // warns because it cannot check key/values
|
||||
loggerV.Error(err, "dynamic resource plugin failed", kv...)
|
||||
}
|
||||
return framework.AsStatus(err)
|
||||
}
|
33
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature/feature.go
generated
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package feature
|
||||
|
||||
// Features carries feature gate values used by various plugins.
|
||||
// This struct allows us to break the dependency of the plugins on
|
||||
// the internal k8s features pkg.
|
||||
type Features struct {
|
||||
EnableDRAAdminAccess bool
|
||||
EnableDynamicResourceAllocation bool
|
||||
EnableVolumeCapacityPriority bool
|
||||
EnableNodeInclusionPolicyInPodTopologySpread bool
|
||||
EnableMatchLabelKeysInPodTopologySpread bool
|
||||
EnableInPlacePodVerticalScaling bool
|
||||
EnableSidecarContainers bool
|
||||
EnableSchedulingQueueHint bool
|
||||
EnableAsyncPreemption bool
|
||||
EnablePodLevelResources bool
|
||||
}
|
55
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper/normalize_score.go
generated
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package helper
|
||||
|
||||
import (
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
)
|
||||
|
||||
// DefaultNormalizeScore generates a Normalize Score function that can normalize the
|
||||
// scores from [0, max(scores)] to [0, maxPriority]. If reverse is set to true, it
|
||||
// reverses the scores by subtracting it from maxPriority.
|
||||
// Note: The input scores are always assumed to be non-negative integers.
|
||||
func DefaultNormalizeScore(maxPriority int64, reverse bool, scores framework.NodeScoreList) *framework.Status {
|
||||
var maxCount int64
|
||||
for i := range scores {
|
||||
if scores[i].Score > maxCount {
|
||||
maxCount = scores[i].Score
|
||||
}
|
||||
}
|
||||
|
||||
if maxCount == 0 {
|
||||
if reverse {
|
||||
for i := range scores {
|
||||
scores[i].Score = maxPriority
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
for i := range scores {
|
||||
score := scores[i].Score
|
||||
|
||||
score = maxPriority * score / maxCount
|
||||
if reverse {
|
||||
score = maxPriority - score
|
||||
}
|
||||
|
||||
scores[i].Score = score
|
||||
}
|
||||
return nil
|
||||
}
|
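A worked example, not part of the vendored file: with maxPriority 100, the highest raw score maps to 100 and the rest scale linearly; reverse flips the result around maxPriority.

// Illustrative sketch only.
func normalizeScoreExample() framework.NodeScoreList {
	scores := framework.NodeScoreList{
		{Name: "node-a", Score: 20},
		{Name: "node-b", Score: 40},
		{Name: "node-c", Score: 80},
	}
	_ = DefaultNormalizeScore(100, false, scores)
	// scores are now 25, 50, 100; with reverse=true they would be 75, 50, 0.
	return scores
}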
52
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper/shape_score.go
generated
vendored
Normal file
@ -0,0 +1,52 @@
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package helper
|
||||
|
||||
// FunctionShape represents a collection of FunctionShapePoint.
|
||||
type FunctionShape []FunctionShapePoint
|
||||
|
||||
// FunctionShapePoint represents a shape point.
|
||||
type FunctionShapePoint struct {
|
||||
// Utilization is the function argument.
|
||||
Utilization int64
|
||||
// Score is the function value.
|
||||
Score int64
|
||||
}
|
||||
|
||||
// BuildBrokenLinearFunction creates a function which is built using linear segments. Segments are defined via shape array.
|
||||
// Shape[i].Utilization slice represents points on "Utilization" axis where different segments meet.
|
||||
// Shape[i].Score represents function values at meeting points.
|
||||
//
|
||||
// function f(p) is defined as:
|
||||
//
|
||||
// shape[0].Score for p < shape[0].Utilization
|
||||
// shape[n-1].Score for p > shape[n-1].Utilization
|
||||
//
|
||||
// and linear between points (p < shape[i].Utilization)
|
||||
func BuildBrokenLinearFunction(shape FunctionShape) func(int64) int64 {
|
||||
return func(p int64) int64 {
|
||||
for i := 0; i < len(shape); i++ {
|
||||
if p <= int64(shape[i].Utilization) {
|
||||
if i == 0 {
|
||||
return shape[0].Score
|
||||
}
|
||||
return shape[i-1].Score + (shape[i].Score-shape[i-1].Score)*(p-shape[i-1].Utilization)/(shape[i].Utilization-shape[i-1].Utilization)
|
||||
}
|
||||
}
|
||||
return shape[len(shape)-1].Score
|
||||
}
|
||||
}
|
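A worked example, not part of the vendored file: a two-point shape gives a linear ramp that is clamped to the first and last scores outside the defined utilization range.

// Illustrative sketch only.
func brokenLinearExample() {
	f := BuildBrokenLinearFunction(FunctionShape{
		{Utilization: 0, Score: 0},
		{Utilization: 100, Score: 10},
	})
	_ = f(50)  // 5: halfway along the segment
	_ = f(150) // 10: clamped to the last point's score
	_ = f(-10) // 0: clamped to the first point's score
}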
116
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper/spread.go
generated
vendored
Normal file
@ -0,0 +1,116 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package helper
|
||||
|
||||
import (
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/runtime/schema"
|
||||
appslisters "k8s.io/client-go/listers/apps/v1"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
)
|
||||
|
||||
var (
|
||||
rcKind = v1.SchemeGroupVersion.WithKind("ReplicationController")
|
||||
rsKind = appsv1.SchemeGroupVersion.WithKind("ReplicaSet")
|
||||
ssKind = appsv1.SchemeGroupVersion.WithKind("StatefulSet")
|
||||
)
|
||||
|
||||
// DefaultSelector returns a selector deduced from the Services, Replication
|
||||
// Controllers, Replica Sets, and Stateful Sets matching the given pod.
|
||||
func DefaultSelector(
|
||||
pod *v1.Pod,
|
||||
sl corelisters.ServiceLister,
|
||||
cl corelisters.ReplicationControllerLister,
|
||||
rsl appslisters.ReplicaSetLister,
|
||||
ssl appslisters.StatefulSetLister,
|
||||
) labels.Selector {
|
||||
labelSet := make(labels.Set)
|
||||
// Since services, RCs, RSs and SSs match the pod, they won't have conflicting
|
||||
// labels. Merging is safe.
|
||||
|
||||
if services, err := GetPodServices(sl, pod); err == nil {
|
||||
for _, service := range services {
|
||||
labelSet = labels.Merge(labelSet, service.Spec.Selector)
|
||||
}
|
||||
}
|
||||
selector := labelSet.AsSelector()
|
||||
|
||||
owner := metav1.GetControllerOfNoCopy(pod)
|
||||
if owner == nil {
|
||||
return selector
|
||||
}
|
||||
|
||||
gv, err := schema.ParseGroupVersion(owner.APIVersion)
|
||||
if err != nil {
|
||||
return selector
|
||||
}
|
||||
|
||||
gvk := gv.WithKind(owner.Kind)
|
||||
switch gvk {
|
||||
case rcKind:
|
||||
if rc, err := cl.ReplicationControllers(pod.Namespace).Get(owner.Name); err == nil {
|
||||
labelSet = labels.Merge(labelSet, rc.Spec.Selector)
|
||||
selector = labelSet.AsSelector()
|
||||
}
|
||||
case rsKind:
|
||||
if rs, err := rsl.ReplicaSets(pod.Namespace).Get(owner.Name); err == nil {
|
||||
if other, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector); err == nil {
|
||||
if r, ok := other.Requirements(); ok {
|
||||
selector = selector.Add(r...)
|
||||
}
|
||||
}
|
||||
}
|
||||
case ssKind:
|
||||
if ss, err := ssl.StatefulSets(pod.Namespace).Get(owner.Name); err == nil {
|
||||
if other, err := metav1.LabelSelectorAsSelector(ss.Spec.Selector); err == nil {
|
||||
if r, ok := other.Requirements(); ok {
|
||||
selector = selector.Add(r...)
|
||||
}
|
||||
}
|
||||
}
|
||||
default:
|
||||
// Not owned by a supported controller.
|
||||
}
|
||||
|
||||
return selector
|
||||
}
|
||||
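A small sketch, not part of the vendored file, of the labels.Merge + AsSelector combination used above to fold the matching service selectors into one selector.

// Illustrative sketch only.
func mergeSelectorsSketch() labels.Selector {
	a := labels.Set{"app": "web"}
	b := labels.Set{"tier": "frontend"}
	merged := labels.Merge(a, b) // {"app":"web", "tier":"frontend"}
	return merged.AsSelector()   // matches pods carrying both labels
}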
|
||||
// GetPodServices gets the services that have the selector that match the labels on the given pod.
|
||||
func GetPodServices(sl corelisters.ServiceLister, pod *v1.Pod) ([]*v1.Service, error) {
|
||||
allServices, err := sl.Services(pod.Namespace).List(labels.Everything())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var services []*v1.Service
|
||||
for i := range allServices {
|
||||
service := allServices[i]
|
||||
if service.Spec.Selector == nil {
|
||||
// services with nil selectors match nothing, not everything.
|
||||
continue
|
||||
}
|
||||
selector := labels.Set(service.Spec.Selector).AsSelectorPreValidated()
|
||||
if selector.Matches(labels.Set(pod.Labels)) {
|
||||
services = append(services, service)
|
||||
}
|
||||
}
|
||||
|
||||
return services, nil
|
||||
}
|
28
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper/taint.go
generated
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
/*
|
||||
Copyright 2022 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package helper
|
||||
|
||||
import v1 "k8s.io/api/core/v1"
|
||||
|
||||
// DoNotScheduleTaintsFilterFunc returns the filter function that can
|
||||
// filter out the node taints that reject scheduling Pod on a Node.
|
||||
func DoNotScheduleTaintsFilterFunc() func(t *v1.Taint) bool {
|
||||
return func(t *v1.Taint) bool {
|
||||
// PodToleratesNodeTaints is only interested in NoSchedule and NoExecute taints.
|
||||
return t.Effect == v1.TaintEffectNoSchedule || t.Effect == v1.TaintEffectNoExecute
|
||||
}
|
||||
}
|
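A usage sketch, not part of the vendored file: only NoSchedule and NoExecute taints survive the filter; PreferNoSchedule is ignored.

// Illustrative sketch only.
func taintFilterExample() []v1.Taint {
	taints := []v1.Taint{
		{Key: "a", Effect: v1.TaintEffectNoSchedule},
		{Key: "b", Effect: v1.TaintEffectPreferNoSchedule},
	}
	keep := DoNotScheduleTaintsFilterFunc()
	var filtered []v1.Taint
	for _, t := range taints {
		if keep(&t) {
			filtered = append(filtered, t)
		}
	}
	return filtered // contains only the NoSchedule taint
}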
132
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/imagelocality/image_locality.go
generated
vendored
Normal file
@ -0,0 +1,132 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package imagelocality

import (
    "context"
    "fmt"
    "strings"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/kubernetes/pkg/scheduler/framework"
    "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)

// The two thresholds are used as bounds for the image score range. They correspond to a reasonable size range for
// container images compressed and stored in registries; 90%ile of images on dockerhub drops into this range.
const (
    mb int64 = 1024 * 1024
    minThreshold int64 = 23 * mb
    maxContainerThreshold int64 = 1000 * mb
)

// ImageLocality is a score plugin that favors nodes that already have requested pod container's images.
type ImageLocality struct {
    handle framework.Handle
}

var _ framework.ScorePlugin = &ImageLocality{}

// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.ImageLocality

// Name returns name of the plugin. It is used in logs, etc.
func (pl *ImageLocality) Name() string {
    return Name
}

// Score invoked at the score extension point.
func (pl *ImageLocality) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
    nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
    if err != nil {
        return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
    }

    nodeInfos, err := pl.handle.SnapshotSharedLister().NodeInfos().List()
    if err != nil {
        return 0, framework.AsStatus(err)
    }
    totalNumNodes := len(nodeInfos)

    imageScores := sumImageScores(nodeInfo, pod, totalNumNodes)
    score := calculatePriority(imageScores, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))

    return score, nil
}

// ScoreExtensions of the Score plugin.
func (pl *ImageLocality) ScoreExtensions() framework.ScoreExtensions {
    return nil
}

// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, h framework.Handle) (framework.Plugin, error) {
    return &ImageLocality{handle: h}, nil
}

// calculatePriority returns the priority of a node. Given the sumScores of requested images on the node, the node's
// priority is obtained by scaling the maximum priority value with a ratio proportional to the sumScores.
func calculatePriority(sumScores int64, numContainers int) int64 {
    maxThreshold := maxContainerThreshold * int64(numContainers)
    if sumScores < minThreshold {
        sumScores = minThreshold
    } else if sumScores > maxThreshold {
        sumScores = maxThreshold
    }

    return framework.MaxNodeScore * (sumScores - minThreshold) / (maxThreshold - minThreshold)
}
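
// Worked example (assuming framework.MaxNodeScore is 100): for a pod with two
// containers, maxThreshold is 2000*mb. If the requested images already present
// on the node sum to about 500*mb, the node scores roughly
//
//     100 * (500*mb - 23*mb) / (2000*mb - 23*mb) ≈ 24
//
// A node with none of the images (sumScores clamped up to minThreshold) scores 0,
// and a node holding 2000*mb or more of the requested images scores 100.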

// sumImageScores returns the sum of image scores of all the containers that are already on the node.
// Each image receives a raw score of its size, scaled by scaledImageScore. The raw scores are later used to calculate
// the final score.
func sumImageScores(nodeInfo *framework.NodeInfo, pod *v1.Pod, totalNumNodes int) int64 {
    var sum int64
    for _, container := range pod.Spec.InitContainers {
        if state, ok := nodeInfo.ImageStates[normalizedImageName(container.Image)]; ok {
            sum += scaledImageScore(state, totalNumNodes)
        }
    }
    for _, container := range pod.Spec.Containers {
        if state, ok := nodeInfo.ImageStates[normalizedImageName(container.Image)]; ok {
            sum += scaledImageScore(state, totalNumNodes)
        }
    }
    return sum
}

// scaledImageScore returns an adaptively scaled score for the given state of an image.
// The size of the image is used as the base score, scaled by a factor which considers how many nodes the image has "spread" to.
// This heuristic aims to mitigate the undesirable "node heating problem", i.e., pods get assigned to the same or
// a few nodes due to image locality.
func scaledImageScore(imageState *framework.ImageStateSummary, totalNumNodes int) int64 {
    spread := float64(imageState.NumNodes) / float64(totalNumNodes)
    return int64(float64(imageState.Size) * spread)
}
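
// For instance (sizes and counts are illustrative): a 300*mb image present on
// 3 of 10 nodes contributes 300*mb * 0.3 = 90*mb to sumImageScores, while the
// same image present on every node contributes its full 300*mb, so images that
// are only cached on a few nodes pull those nodes' scores up less sharply.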

// normalizedImageName returns the CRI compliant name for a given image.
// TODO: cover the corner cases of missed matches, e.g.,
// 1. Using Docker as runtime and docker.io/library/test:tag in pod spec, but only test:tag will be present in node status
// 2. Using the implicit registry, i.e., test:tag or library/test:tag in pod spec but only docker.io/library/test:tag
// in node status; note that if users consistently use one registry format, this should not happen.
func normalizedImageName(name string) string {
    if strings.LastIndex(name, ":") <= strings.LastIndex(name, "/") {
        name = name + ":latest"
    }
    return name
}
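
// Illustrative inputs and outputs (image names are made up):
//
//     normalizedImageName("registry.example.com/app")      // "registry.example.com/app:latest"
//     normalizedImageName("registry.example.com/app:v1.2") // unchanged, a tag is already present
//     normalizedImageName("app")                            // "app:latest"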
386
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/filtering.go
generated
vendored
Normal file
@ -0,0 +1,386 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package interpodaffinity

import (
    "context"
    "fmt"
    "sync/atomic"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/klog/v2"
    "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
    // preFilterStateKey is the key in CycleState to InterPodAffinity pre-computed data for Filtering.
    // Using the name of the plugin will likely help us avoid collisions with other plugins.
    preFilterStateKey = "PreFilter" + Name

    // ErrReasonExistingAntiAffinityRulesNotMatch is used for ExistingPodsAntiAffinityRulesNotMatch predicate error.
    ErrReasonExistingAntiAffinityRulesNotMatch = "node(s) didn't satisfy existing pods anti-affinity rules"
    // ErrReasonAffinityRulesNotMatch is used for PodAffinityRulesNotMatch predicate error.
    ErrReasonAffinityRulesNotMatch = "node(s) didn't match pod affinity rules"
    // ErrReasonAntiAffinityRulesNotMatch is used for PodAntiAffinityRulesNotMatch predicate error.
    ErrReasonAntiAffinityRulesNotMatch = "node(s) didn't match pod anti-affinity rules"
)

// preFilterState computed at PreFilter and used at Filter.
type preFilterState struct {
    // A map of topology pairs to the number of existing pods that have anti-affinity terms that match the "pod".
    existingAntiAffinityCounts topologyToMatchedTermCount
    // A map of topology pairs to the number of existing pods that match the affinity terms of the "pod".
    affinityCounts topologyToMatchedTermCount
    // A map of topology pairs to the number of existing pods that match the anti-affinity terms of the "pod".
    antiAffinityCounts topologyToMatchedTermCount
    // podInfo of the incoming pod.
    podInfo *framework.PodInfo
    // A copy of the incoming pod's namespace labels.
    namespaceLabels labels.Set
}

// Clone the prefilter state.
func (s *preFilterState) Clone() framework.StateData {
    if s == nil {
        return nil
    }

    copy := preFilterState{}
    copy.affinityCounts = s.affinityCounts.clone()
    copy.antiAffinityCounts = s.antiAffinityCounts.clone()
    copy.existingAntiAffinityCounts = s.existingAntiAffinityCounts.clone()
    // No need to deep copy the podInfo because it shouldn't change.
    copy.podInfo = s.podInfo
    copy.namespaceLabels = s.namespaceLabels
    return &copy
}

// updateWithPod updates the preFilterState counters with the (anti)affinity matches for the given podInfo.
func (s *preFilterState) updateWithPod(pInfo *framework.PodInfo, node *v1.Node, multiplier int64) {
    if s == nil {
        return
    }

    s.existingAntiAffinityCounts.updateWithAntiAffinityTerms(pInfo.RequiredAntiAffinityTerms, s.podInfo.Pod, s.namespaceLabels, node, multiplier)
    s.affinityCounts.updateWithAffinityTerms(s.podInfo.RequiredAffinityTerms, pInfo.Pod, node, multiplier)
    // The incoming pod's terms have the namespaceSelector merged into the namespaces, and so
    // here we don't lookup the updated pod's namespace labels, hence passing nil for nsLabels.
    s.antiAffinityCounts.updateWithAntiAffinityTerms(s.podInfo.RequiredAntiAffinityTerms, pInfo.Pod, nil, node, multiplier)
}

type topologyPair struct {
    key   string
    value string
}
type topologyToMatchedTermCount map[topologyPair]int64

func (m topologyToMatchedTermCount) append(toAppend topologyToMatchedTermCount) {
    for pair := range toAppend {
        m[pair] += toAppend[pair]
    }
}

func (m topologyToMatchedTermCount) clone() topologyToMatchedTermCount {
    copy := make(topologyToMatchedTermCount, len(m))
    copy.append(m)
    return copy
}

func (m topologyToMatchedTermCount) update(node *v1.Node, tk string, value int64) {
    if tv, ok := node.Labels[tk]; ok {
        pair := topologyPair{key: tk, value: tv}
        m[pair] += value
        // value could be negative, hence we delete the entry if it is down to zero.
        if m[pair] == 0 {
            delete(m, pair)
        }
    }
}
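
// For example, given a node labeled "topology.kubernetes.io/zone": "zone-a"
// (the value is illustrative), update(node, "topology.kubernetes.io/zone", 1)
// increments the count for the pair {zone, zone-a}; a later call with -1 brings
// the count back to zero and deletes the entry, so the map only ever holds
// non-zero counts.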

// updates the topologyToMatchedTermCount map with the specified value
// for each affinity term if "targetPod" matches ALL terms.
func (m topologyToMatchedTermCount) updateWithAffinityTerms(
    terms []framework.AffinityTerm, pod *v1.Pod, node *v1.Node, value int64) {
    if podMatchesAllAffinityTerms(terms, pod) {
        for _, t := range terms {
            m.update(node, t.TopologyKey, value)
        }
    }
}

// updates the topologyToMatchedTermCount map with the specified value
// for each anti-affinity term that matches the target pod.
func (m topologyToMatchedTermCount) updateWithAntiAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, value int64) {
    // Check anti-affinity terms.
    for _, t := range terms {
        if t.Matches(pod, nsLabels) {
            m.update(node, t.TopologyKey, value)
        }
    }
}

// returns true IFF the given pod matches all the given terms.
func podMatchesAllAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod) bool {
    if len(terms) == 0 {
        return false
    }
    for _, t := range terms {
        // The incoming pod NamespaceSelector was merged into the Namespaces set, and so
        // we are not explicitly passing in namespace labels.
        if !t.Matches(pod, nil) {
            return false
        }
    }
    return true
}

// calculates the following for each existing pod on each node:
// 1. Whether it has PodAntiAffinity
// 2. Whether any AntiAffinityTerm matches the incoming pod
func (pl *InterPodAffinity) getExistingAntiAffinityCounts(ctx context.Context, pod *v1.Pod, nsLabels labels.Set, nodes []*framework.NodeInfo) topologyToMatchedTermCount {
    topoMaps := make([]topologyToMatchedTermCount, len(nodes))
    index := int32(-1)
    processNode := func(i int) {
        nodeInfo := nodes[i]
        node := nodeInfo.Node()

        topoMap := make(topologyToMatchedTermCount)
        for _, existingPod := range nodeInfo.PodsWithRequiredAntiAffinity {
            topoMap.updateWithAntiAffinityTerms(existingPod.RequiredAntiAffinityTerms, pod, nsLabels, node, 1)
        }
        if len(topoMap) != 0 {
            topoMaps[atomic.AddInt32(&index, 1)] = topoMap
        }
    }
    pl.parallelizer.Until(ctx, len(nodes), processNode, pl.Name())

    result := make(topologyToMatchedTermCount)
    for i := 0; i <= int(index); i++ {
        result.append(topoMaps[i])
    }

    return result
}

// finds existing Pods that match affinity terms of the incoming pod's (anti)affinity terms.
// It returns a topologyToMatchedTermCount that is checked later by the affinity
// predicate. With this topologyToMatchedTermCount available, the affinity predicate does not
// need to check all the pods in the cluster.
func (pl *InterPodAffinity) getIncomingAffinityAntiAffinityCounts(ctx context.Context, podInfo *framework.PodInfo, allNodes []*framework.NodeInfo) (topologyToMatchedTermCount, topologyToMatchedTermCount) {
    affinityCounts := make(topologyToMatchedTermCount)
    antiAffinityCounts := make(topologyToMatchedTermCount)
    if len(podInfo.RequiredAffinityTerms) == 0 && len(podInfo.RequiredAntiAffinityTerms) == 0 {
        return affinityCounts, antiAffinityCounts
    }

    affinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
    antiAffinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
    index := int32(-1)
    processNode := func(i int) {
        nodeInfo := allNodes[i]
        node := nodeInfo.Node()

        affinity := make(topologyToMatchedTermCount)
        antiAffinity := make(topologyToMatchedTermCount)
        for _, existingPod := range nodeInfo.Pods {
            affinity.updateWithAffinityTerms(podInfo.RequiredAffinityTerms, existingPod.Pod, node, 1)
            // The incoming pod's terms have the namespaceSelector merged into the namespaces, and so
            // here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
            antiAffinity.updateWithAntiAffinityTerms(podInfo.RequiredAntiAffinityTerms, existingPod.Pod, nil, node, 1)
        }

        if len(affinity) > 0 || len(antiAffinity) > 0 {
            k := atomic.AddInt32(&index, 1)
            affinityCountsList[k] = affinity
            antiAffinityCountsList[k] = antiAffinity
        }
    }
    pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())

    for i := 0; i <= int(index); i++ {
        affinityCounts.append(affinityCountsList[i])
        antiAffinityCounts.append(antiAffinityCountsList[i])
    }

    return affinityCounts, antiAffinityCounts
}

// PreFilter invoked at the prefilter extension point.
func (pl *InterPodAffinity) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
    var allNodes []*framework.NodeInfo
    var nodesWithRequiredAntiAffinityPods []*framework.NodeInfo
    var err error
    if allNodes, err = pl.sharedLister.NodeInfos().List(); err != nil {
        return nil, framework.AsStatus(fmt.Errorf("failed to list NodeInfos: %w", err))
    }
    if nodesWithRequiredAntiAffinityPods, err = pl.sharedLister.NodeInfos().HavePodsWithRequiredAntiAffinityList(); err != nil {
        return nil, framework.AsStatus(fmt.Errorf("failed to list NodeInfos with pods with affinity: %w", err))
    }

    s := &preFilterState{}

    if s.podInfo, err = framework.NewPodInfo(pod); err != nil {
        return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("parsing pod: %+v", err))
    }

    for i := range s.podInfo.RequiredAffinityTerms {
        if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&s.podInfo.RequiredAffinityTerms[i]); err != nil {
            return nil, framework.AsStatus(err)
        }
    }
    for i := range s.podInfo.RequiredAntiAffinityTerms {
        if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&s.podInfo.RequiredAntiAffinityTerms[i]); err != nil {
            return nil, framework.AsStatus(err)
        }
    }
    logger := klog.FromContext(ctx)
    s.namespaceLabels = GetNamespaceLabelsSnapshot(logger, pod.Namespace, pl.nsLister)

    s.existingAntiAffinityCounts = pl.getExistingAntiAffinityCounts(ctx, pod, s.namespaceLabels, nodesWithRequiredAntiAffinityPods)
    s.affinityCounts, s.antiAffinityCounts = pl.getIncomingAffinityAntiAffinityCounts(ctx, s.podInfo, allNodes)

    if len(s.existingAntiAffinityCounts) == 0 && len(s.podInfo.RequiredAffinityTerms) == 0 && len(s.podInfo.RequiredAntiAffinityTerms) == 0 {
        return nil, framework.NewStatus(framework.Skip)
    }

    cycleState.Write(preFilterStateKey, s)
    return nil, nil
}

// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *InterPodAffinity) PreFilterExtensions() framework.PreFilterExtensions {
    return pl
}

// AddPod from pre-computed data in cycleState.
func (pl *InterPodAffinity) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
    state, err := getPreFilterState(cycleState)
    if err != nil {
        return framework.AsStatus(err)
    }
    state.updateWithPod(podInfoToAdd, nodeInfo.Node(), 1)
    return nil
}

// RemovePod from pre-computed data in cycleState.
func (pl *InterPodAffinity) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
    state, err := getPreFilterState(cycleState)
    if err != nil {
        return framework.AsStatus(err)
    }
    state.updateWithPod(podInfoToRemove, nodeInfo.Node(), -1)
    return nil
}

func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
    c, err := cycleState.Read(preFilterStateKey)
    if err != nil {
        // preFilterState doesn't exist, likely PreFilter wasn't invoked.
        return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
    }

    s, ok := c.(*preFilterState)
    if !ok {
        return nil, fmt.Errorf("%+v convert to interpodaffinity.state error", c)
    }
    return s, nil
}

// Checks if scheduling the pod onto this node would break any anti-affinity
// terms indicated by the existing pods.
func satisfyExistingPodsAntiAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
    if len(state.existingAntiAffinityCounts) > 0 {
        // Iterate over topology pairs to get any of the pods being affected by
        // the scheduled pod anti-affinity terms
        for topologyKey, topologyValue := range nodeInfo.Node().Labels {
            tp := topologyPair{key: topologyKey, value: topologyValue}
            if state.existingAntiAffinityCounts[tp] > 0 {
                return false
            }
        }
    }
    return true
}

// Checks if the node satisfies the incoming pod's anti-affinity rules.
func satisfyPodAntiAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
    if len(state.antiAffinityCounts) > 0 {
        for _, term := range state.podInfo.RequiredAntiAffinityTerms {
            if topologyValue, ok := nodeInfo.Node().Labels[term.TopologyKey]; ok {
                tp := topologyPair{key: term.TopologyKey, value: topologyValue}
                if state.antiAffinityCounts[tp] > 0 {
                    return false
                }
            }
        }
    }
    return true
}

// Checks if the node satisfies the incoming pod's affinity rules.
func satisfyPodAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
    podsExist := true
    for _, term := range state.podInfo.RequiredAffinityTerms {
        if topologyValue, ok := nodeInfo.Node().Labels[term.TopologyKey]; ok {
            tp := topologyPair{key: term.TopologyKey, value: topologyValue}
            if state.affinityCounts[tp] <= 0 {
                podsExist = false
            }
        } else {
            // All topology labels must exist on the node.
            return false
        }
    }

    if !podsExist {
        // This pod may be the first pod in a series of pods that have affinity to themselves. In order
        // to not leave such pods in pending state forever, we check that if no other pod
        // in the cluster matches the namespace and selector of this pod, the pod matches
        // its own terms, and the node has all the requested topologies, then we allow the pod
        // to pass the affinity check.
        if len(state.affinityCounts) == 0 && podMatchesAllAffinityTerms(state.podInfo.RequiredAffinityTerms, state.podInfo.Pod) {
            return true
        }
        return false
    }
    return true
}

// Filter invoked at the filter extension point.
// It checks if a pod can be scheduled on the specified node with pod affinity/anti-affinity configuration.
func (pl *InterPodAffinity) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {

    state, err := getPreFilterState(cycleState)
    if err != nil {
        return framework.AsStatus(err)
    }

    if !satisfyPodAffinity(state, nodeInfo) {
        return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonAffinityRulesNotMatch)
    }

    if !satisfyPodAntiAffinity(state, nodeInfo) {
        return framework.NewStatus(framework.Unschedulable, ErrReasonAntiAffinityRulesNotMatch)
    }

    if !satisfyExistingPodsAntiAffinity(state, nodeInfo) {
        return framework.NewStatus(framework.Unschedulable, ErrReasonExistingAntiAffinityRulesNotMatch)
    }

    return nil
}
247
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go
generated
vendored
Normal file
@ -0,0 +1,247 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package interpodaffinity

import (
    "context"
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/apimachinery/pkg/runtime"
    listersv1 "k8s.io/client-go/listers/core/v1"
    "k8s.io/klog/v2"
    "k8s.io/kubernetes/pkg/scheduler/apis/config"
    "k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
    "k8s.io/kubernetes/pkg/scheduler/framework"
    "k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
    "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
    "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
    "k8s.io/kubernetes/pkg/scheduler/util"
)

// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.InterPodAffinity

var _ framework.PreFilterPlugin = &InterPodAffinity{}
var _ framework.FilterPlugin = &InterPodAffinity{}
var _ framework.PreScorePlugin = &InterPodAffinity{}
var _ framework.ScorePlugin = &InterPodAffinity{}
var _ framework.EnqueueExtensions = &InterPodAffinity{}

// InterPodAffinity is a plugin that checks inter pod affinity
type InterPodAffinity struct {
    parallelizer parallelize.Parallelizer
    args config.InterPodAffinityArgs
    sharedLister framework.SharedLister
    nsLister listersv1.NamespaceLister
    enableSchedulingQueueHint bool
}

// Name returns name of the plugin. It is used in logs, etc.
func (pl *InterPodAffinity) Name() string {
    return Name
}

// EventsToRegister returns the possible events that may make a failed Pod
// schedulable
func (pl *InterPodAffinity) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
    // A note about UpdateNodeTaint event:
    // Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
    // But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
    // See: https://github.com/kubernetes/kubernetes/issues/109437
    nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
    if pl.enableSchedulingQueueHint {
        // When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeTaint event.
        nodeActionType = framework.Add | framework.UpdateNodeLabel
    }
    return []framework.ClusterEventWithHint{
        // All ActionType includes the following events:
        // - Delete. An unschedulable Pod may fail due to violating an existing Pod's anti-affinity constraints,
        // deleting an existing Pod may make it schedulable.
        // - UpdatePodLabel. Updating on an existing Pod's labels (e.g., removal) may make
        // an unschedulable Pod schedulable.
        // - Add. An unschedulable Pod may fail due to violating pod-affinity constraints,
        // adding an assigned Pod may make it schedulable.
        {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Add | framework.UpdatePodLabel | framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodChange},
        {Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
    }, nil
}

// New initializes a new plugin and returns it.
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
    if h.SnapshotSharedLister() == nil {
        return nil, fmt.Errorf("SnapshotSharedlister is nil")
    }
    args, err := getArgs(plArgs)
    if err != nil {
        return nil, err
    }
    if err := validation.ValidateInterPodAffinityArgs(nil, &args); err != nil {
        return nil, err
    }
    pl := &InterPodAffinity{
        parallelizer: h.Parallelizer(),
        args: args,
        sharedLister: h.SnapshotSharedLister(),
        nsLister: h.SharedInformerFactory().Core().V1().Namespaces().Lister(),
        enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
    }

    return pl, nil
}

func getArgs(obj runtime.Object) (config.InterPodAffinityArgs, error) {
    ptr, ok := obj.(*config.InterPodAffinityArgs)
    if !ok {
        return config.InterPodAffinityArgs{}, fmt.Errorf("want args to be of type InterPodAffinityArgs, got %T", obj)
    }
    return *ptr, nil
}

// Updates Namespaces with the set of namespaces identified by NamespaceSelector.
// If successful, NamespaceSelector is set to nil.
// The assumption is that the term is for an incoming pod, in which case
// namespaceSelector is either unrolled into Namespaces (and so the selector
// is set to Nothing()) or is Empty(), which means match everything. Therefore,
// when matching against this term, there is no need to look up the existing
// pod's namespace labels to match them against the term's namespaceSelector explicitly.
func (pl *InterPodAffinity) mergeAffinityTermNamespacesIfNotEmpty(at *framework.AffinityTerm) error {
    if at.NamespaceSelector.Empty() {
        return nil
    }
    ns, err := pl.nsLister.List(at.NamespaceSelector)
    if err != nil {
        return err
    }
    for _, n := range ns {
        at.Namespaces.Insert(n.Name)
    }
    at.NamespaceSelector = labels.Nothing()
    return nil
}
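
// As a hypothetical example: for a term with Namespaces {"team-a"} and a
// NamespaceSelector matching namespaces labeled env=prod, where namespaces
// "prod-1" and "prod-2" carry that label, the term ends up with Namespaces
// {"team-a", "prod-1", "prod-2"} and its NamespaceSelector set to
// labels.Nothing(), so later matching only needs the expanded set of names.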

// GetNamespaceLabelsSnapshot returns a snapshot of the labels associated with
// the namespace.
func GetNamespaceLabelsSnapshot(logger klog.Logger, ns string, nsLister listersv1.NamespaceLister) (nsLabels labels.Set) {
    podNS, err := nsLister.Get(ns)
    if err == nil {
        // Create and return snapshot of the labels.
        return labels.Merge(podNS.Labels, nil)
    }
    logger.V(3).Info("getting namespace, assuming empty set of namespace labels", "namespace", ns, "err", err)
    return
}

func (pl *InterPodAffinity) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
    originalPod, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
    if err != nil {
        return framework.Queue, err
    }
    if (modifiedPod != nil && modifiedPod.Spec.NodeName == "") || (originalPod != nil && originalPod.Spec.NodeName == "") {
        logger.V(5).Info("the added/updated/deleted pod is unscheduled, so it doesn't make the target pod schedulable",
            "pod", klog.KObj(pod), "originalPod", klog.KObj(originalPod), "modifiedPod", klog.KObj(modifiedPod))
        return framework.QueueSkip, nil
    }

    terms, err := framework.GetAffinityTerms(pod, framework.GetPodAffinityTerms(pod.Spec.Affinity))
    if err != nil {
        return framework.Queue, err
    }

    antiTerms, err := framework.GetAffinityTerms(pod, framework.GetPodAntiAffinityTerms(pod.Spec.Affinity))
    if err != nil {
        return framework.Queue, err
    }

    // Pod is updated. Return Queue when the updated pod matches the target pod's affinity or no longer matches its anti-affinity.
    // Note that, we don't need to check each affinity individually when the Pod has more than one affinity
    // because the current PodAffinity looks for a **single** existing pod that can satisfy **all** the terms of inter-pod affinity of an incoming pod.
    if modifiedPod != nil && originalPod != nil {
        if !podMatchesAllAffinityTerms(terms, originalPod) && podMatchesAllAffinityTerms(terms, modifiedPod) {
            logger.V(5).Info("a scheduled pod was updated to match the target pod's affinity, and the pod may be schedulable now",
                "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
            return framework.Queue, nil
        }
        if podMatchesAllAffinityTerms(antiTerms, originalPod) && !podMatchesAllAffinityTerms(antiTerms, modifiedPod) {
            logger.V(5).Info("a scheduled pod was updated not to match the target pod's anti affinity, and the pod may be schedulable now",
                "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
            return framework.Queue, nil
        }
        logger.V(5).Info("a scheduled pod was updated but it doesn't match the target pod's affinity or does match the target pod's anti-affinity",
            "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
        return framework.QueueSkip, nil
    }

    // Pod is added. Return Queue when the added pod matches the target pod's affinity.
    if modifiedPod != nil {
        if podMatchesAllAffinityTerms(terms, modifiedPod) {
            logger.V(5).Info("a scheduled pod was added and it matches the target pod's affinity",
                "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
            return framework.Queue, nil
        }
        logger.V(5).Info("a scheduled pod was added and it doesn't match the target pod's affinity",
            "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
        return framework.QueueSkip, nil
    }

    // Pod is deleted. Return Queue when the deleted pod matches the target pod's anti-affinity.
    if !podMatchesAllAffinityTerms(antiTerms, originalPod) {
        logger.V(5).Info("a scheduled pod was deleted but it doesn't match the target pod's anti-affinity",
            "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
        return framework.QueueSkip, nil
    }
    logger.V(5).Info("a scheduled pod was deleted and it matches the target pod's anti-affinity. The pod may be schedulable now",
        "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
    return framework.Queue, nil
}

func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
    _, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
    if err != nil {
        return framework.Queue, err
    }

    terms, err := framework.GetAffinityTerms(pod, framework.GetPodAffinityTerms(pod.Spec.Affinity))
    if err != nil {
        return framework.Queue, err
    }

    for _, term := range terms {
        if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
            logger.V(5).Info("a node with matched pod affinity topologyKey was added/updated and it may make pod schedulable",
                "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
            return framework.Queue, err
        }
    }

    antiTerms, err := framework.GetAffinityTerms(pod, framework.GetPodAntiAffinityTerms(pod.Spec.Affinity))
    if err != nil {
        return framework.Queue, err
    }

    for _, term := range antiTerms {
        if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
            logger.V(5).Info("a node with matched pod anti-affinity topologyKey was added/updated and it may make pod schedulable",
                "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
            return framework.Queue, err
        }
    }
    logger.V(5).Info("a node is added/updated but doesn't have any topologyKey which matches pod affinity/anti-affinity",
        "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
    return framework.QueueSkip, nil
}
302
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/scoring.go
generated
vendored
Normal file
@ -0,0 +1,302 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package interpodaffinity

import (
    "context"
    "fmt"
    "math"
    "sync/atomic"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/klog/v2"
    "k8s.io/kubernetes/pkg/scheduler/framework"
)

// preScoreStateKey is the key in CycleState to InterPodAffinity pre-computed data for Scoring.
const preScoreStateKey = "PreScore" + Name

type scoreMap map[string]map[string]int64

// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
    topologyScore scoreMap
    podInfo *framework.PodInfo
    // A copy of the incoming pod's namespace labels.
    namespaceLabels labels.Set
}

// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
    return s
}

func (m scoreMap) processTerm(term *framework.AffinityTerm, weight int32, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, multiplier int32) {
    if term.Matches(pod, nsLabels) {
        if tpValue, tpValueExist := node.Labels[term.TopologyKey]; tpValueExist {
            if m[term.TopologyKey] == nil {
                m[term.TopologyKey] = make(map[string]int64)
            }
            m[term.TopologyKey][tpValue] += int64(weight * multiplier)
        }
    }
}

func (m scoreMap) processTerms(terms []framework.WeightedAffinityTerm, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, multiplier int32) {
    for _, term := range terms {
        m.processTerm(&term.AffinityTerm, term.Weight, pod, nsLabels, node, multiplier)
    }
}

func (m scoreMap) append(other scoreMap) {
    for topology, oScores := range other {
        scores := m[topology]
        if scores == nil {
            m[topology] = oScores
            continue
        }
        for k, v := range oScores {
            scores[k] += v
        }
    }
}

func (pl *InterPodAffinity) processExistingPod(
    state *preScoreState,
    existingPod *framework.PodInfo,
    existingPodNodeInfo *framework.NodeInfo,
    incomingPod *v1.Pod,
    topoScore scoreMap,
) {
    existingPodNode := existingPodNodeInfo.Node()
    if len(existingPodNode.Labels) == 0 {
        return
    }

    // For every soft pod affinity term of <pod>, if <existingPod> matches the term,
    // increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
    // value as that of <existingPod>'s node by the term's weight.
    // Note that the incoming pod's terms have the namespaceSelector merged into the namespaces, and so
    // here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
    topoScore.processTerms(state.podInfo.PreferredAffinityTerms, existingPod.Pod, nil, existingPodNode, 1)

    // For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term,
    // decrement <p.counts> for every node in the cluster with the same <term.TopologyKey>
    // value as that of <existingPod>'s node by the term's weight.
    // Note that the incoming pod's terms have the namespaceSelector merged into the namespaces, and so
    // here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
    topoScore.processTerms(state.podInfo.PreferredAntiAffinityTerms, existingPod.Pod, nil, existingPodNode, -1)

    // For every hard pod affinity term of <existingPod>, if <pod> matches the term,
    // increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
    // value as that of <existingPod>'s node by the constant <args.hardPodAffinityWeight>
    if pl.args.HardPodAffinityWeight > 0 && len(existingPodNode.Labels) != 0 {
        for _, t := range existingPod.RequiredAffinityTerms {
            topoScore.processTerm(&t, pl.args.HardPodAffinityWeight, incomingPod, state.namespaceLabels, existingPodNode, 1)
        }
    }

    // For every soft pod affinity term of <existingPod>, if <pod> matches the term,
    // increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
    // value as that of <existingPod>'s node by the term's weight.
    topoScore.processTerms(existingPod.PreferredAffinityTerms, incomingPod, state.namespaceLabels, existingPodNode, 1)

    // For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term,
    // decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
    // value as that of <existingPod>'s node by the term's weight.
    topoScore.processTerms(existingPod.PreferredAntiAffinityTerms, incomingPod, state.namespaceLabels, existingPodNode, -1)
}

// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *InterPodAffinity) PreScore(
    pCtx context.Context,
    cycleState *framework.CycleState,
    pod *v1.Pod,
    nodes []*framework.NodeInfo,
) *framework.Status {
    if len(nodes) == 0 {
        // No nodes to score.
        return framework.NewStatus(framework.Skip)
    }

    if pl.sharedLister == nil {
        return framework.NewStatus(framework.Error, "empty shared lister in InterPodAffinity PreScore")
    }

    affinity := pod.Spec.Affinity
    hasPreferredAffinityConstraints := affinity != nil && affinity.PodAffinity != nil && len(affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution) > 0
    hasPreferredAntiAffinityConstraints := affinity != nil && affinity.PodAntiAffinity != nil && len(affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution) > 0
    hasConstraints := hasPreferredAffinityConstraints || hasPreferredAntiAffinityConstraints

    // Optionally ignore calculating preferences of existing pods' affinity rules
    // if the incoming pod has no inter-pod affinities.
    if pl.args.IgnorePreferredTermsOfExistingPods && !hasConstraints {
        return framework.NewStatus(framework.Skip)
    }

    // Unless the pod being scheduled has preferred affinity terms, we only
    // need to process nodes hosting pods with affinity.
    var allNodes []*framework.NodeInfo
    var err error
    if hasConstraints {
        allNodes, err = pl.sharedLister.NodeInfos().List()
        if err != nil {
            return framework.AsStatus(fmt.Errorf("failed to get all nodes from shared lister: %w", err))
        }
    } else {
        allNodes, err = pl.sharedLister.NodeInfos().HavePodsWithAffinityList()
        if err != nil {
            return framework.AsStatus(fmt.Errorf("failed to get pods with affinity list: %w", err))
        }
    }

    state := &preScoreState{
        topologyScore: make(map[string]map[string]int64),
    }

    if state.podInfo, err = framework.NewPodInfo(pod); err != nil {
        // Ideally we never reach here, because errors will be caught by PreFilter
        return framework.AsStatus(fmt.Errorf("failed to parse pod: %w", err))
    }

    for i := range state.podInfo.PreferredAffinityTerms {
        if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&state.podInfo.PreferredAffinityTerms[i].AffinityTerm); err != nil {
            return framework.AsStatus(fmt.Errorf("updating PreferredAffinityTerms: %w", err))
        }
    }
    for i := range state.podInfo.PreferredAntiAffinityTerms {
        if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&state.podInfo.PreferredAntiAffinityTerms[i].AffinityTerm); err != nil {
            return framework.AsStatus(fmt.Errorf("updating PreferredAntiAffinityTerms: %w", err))
        }
    }
    logger := klog.FromContext(pCtx)
    state.namespaceLabels = GetNamespaceLabelsSnapshot(logger, pod.Namespace, pl.nsLister)

    topoScores := make([]scoreMap, len(allNodes))
    index := int32(-1)
    processNode := func(i int) {
        nodeInfo := allNodes[i]

        // Unless the pod being scheduled has preferred affinity terms, we only
        // need to process pods with affinity in the node.
        podsToProcess := nodeInfo.PodsWithAffinity
        if hasConstraints {
            // We need to process all the pods.
            podsToProcess = nodeInfo.Pods
        }

        topoScore := make(scoreMap)
        for _, existingPod := range podsToProcess {
            pl.processExistingPod(state, existingPod, nodeInfo, pod, topoScore)
        }
        if len(topoScore) > 0 {
            topoScores[atomic.AddInt32(&index, 1)] = topoScore
        }
    }
    pl.parallelizer.Until(pCtx, len(allNodes), processNode, pl.Name())

    if index == -1 {
        return framework.NewStatus(framework.Skip)
    }

    for i := 0; i <= int(index); i++ {
        state.topologyScore.append(topoScores[i])
    }

    cycleState.Write(preScoreStateKey, state)
    return nil
}

func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
    c, err := cycleState.Read(preScoreStateKey)
    if err != nil {
        return nil, fmt.Errorf("failed to read %q from cycleState: %w", preScoreStateKey, err)
    }

    s, ok := c.(*preScoreState)
    if !ok {
        return nil, fmt.Errorf("%+v convert to interpodaffinity.preScoreState error", c)
    }
    return s, nil
}

// Score invoked at the Score extension point.
// The "score" returned by this function is the sum of the weights stored in cycleState whose topologyKey matches one of the node's labels.
// It is normalized later.
// Note: the returned "score" is positive for pod-affinity, and negative for pod-antiaffinity.
func (pl *InterPodAffinity) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
    nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
    if err != nil {
        return 0, framework.AsStatus(fmt.Errorf("failed to get node %q from Snapshot: %w", nodeName, err))
    }
    node := nodeInfo.Node()

    s, err := getPreScoreState(cycleState)
    if err != nil {
        return 0, framework.AsStatus(err)
    }
    var score int64
    for tpKey, tpValues := range s.topologyScore {
        if v, exist := node.Labels[tpKey]; exist {
            score += tpValues[v]
        }
    }

    return score, nil
}

// NormalizeScore normalizes the score for each filteredNode.
func (pl *InterPodAffinity) NormalizeScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
    s, err := getPreScoreState(cycleState)
    if err != nil {
        return framework.AsStatus(err)
    }
    if len(s.topologyScore) == 0 {
        return nil
    }

    var minCount int64 = math.MaxInt64
    var maxCount int64 = math.MinInt64
    for i := range scores {
        score := scores[i].Score
        if score > maxCount {
            maxCount = score
        }
        if score < minCount {
            minCount = score
        }
    }

    maxMinDiff := maxCount - minCount
    for i := range scores {
        fScore := float64(0)
        if maxMinDiff > 0 {
            fScore = float64(framework.MaxNodeScore) * (float64(scores[i].Score-minCount) / float64(maxMinDiff))
        }

        scores[i].Score = int64(fScore)
    }

    return nil
}
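
// Worked example (assuming framework.MaxNodeScore is 100): raw scores of -20, 0
// and 30 across three nodes give minCount = -20 and maxCount = 30, so they
// normalize to 100*(0/50) = 0, 100*(20/50) = 40 and 100*(50/50) = 100. If all
// raw scores are equal, maxMinDiff is 0 and every node ends up with 0.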

// ScoreExtensions of the Score plugin.
func (pl *InterPodAffinity) ScoreExtensions() framework.ScoreExtensions {
    return pl
}
39
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/names/names.go
generated
vendored
Normal file
@ -0,0 +1,39 @@
/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package names

const (
    PrioritySort = "PrioritySort"
    DefaultBinder = "DefaultBinder"
    DefaultPreemption = "DefaultPreemption"
    DynamicResources = "DynamicResources"
    ImageLocality = "ImageLocality"
    InterPodAffinity = "InterPodAffinity"
    NodeAffinity = "NodeAffinity"
    NodeName = "NodeName"
    NodePorts = "NodePorts"
    NodeResourcesBalancedAllocation = "NodeResourcesBalancedAllocation"
    NodeResourcesFit = "NodeResourcesFit"
    NodeUnschedulable = "NodeUnschedulable"
    NodeVolumeLimits = "NodeVolumeLimits"
    PodTopologySpread = "PodTopologySpread"
    SchedulingGates = "SchedulingGates"
    TaintToleration = "TaintToleration"
    VolumeBinding = "VolumeBinding"
    VolumeRestrictions = "VolumeRestrictions"
    VolumeZone = "VolumeZone"
)
372
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity/node_affinity.go
generated
vendored
Normal file
@ -0,0 +1,372 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nodeaffinity
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// NodeAffinity is a plugin that checks if a pod node selector matches the node label.
|
||||
type NodeAffinity struct {
|
||||
handle framework.Handle
|
||||
addedNodeSelector *nodeaffinity.NodeSelector
|
||||
addedPrefSchedTerms *nodeaffinity.PreferredSchedulingTerms
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.PreFilterPlugin = &NodeAffinity{}
|
||||
var _ framework.FilterPlugin = &NodeAffinity{}
|
||||
var _ framework.PreScorePlugin = &NodeAffinity{}
|
||||
var _ framework.ScorePlugin = &NodeAffinity{}
|
||||
var _ framework.EnqueueExtensions = &NodeAffinity{}
|
||||
|
||||
const (
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
Name = names.NodeAffinity
|
||||
|
||||
// preScoreStateKey is the key in CycleState to NodeAffinity pre-computed data for Scoring.
|
||||
preScoreStateKey = "PreScore" + Name
|
||||
|
||||
// preFilterStateKey is the key in CycleState to NodeAffinity pre-compute data for Filtering.
|
||||
preFilterStateKey = "PreFilter" + Name
|
||||
|
||||
// ErrReasonPod is the reason for Pod's node affinity/selector not matching.
|
||||
ErrReasonPod = "node(s) didn't match Pod's node affinity/selector"
|
||||
|
||||
// errReasonEnforced is the reason for added node affinity not matching.
|
||||
errReasonEnforced = "node(s) didn't match scheduler-enforced node affinity"
|
||||
|
||||
// errReasonConflict is the reason for pod's conflicting affinity rules.
|
||||
errReasonConflict = "pod affinity terms conflict"
|
||||
)
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *NodeAffinity) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
type preFilterState struct {
|
||||
requiredNodeSelectorAndAffinity nodeaffinity.RequiredNodeAffinity
|
||||
}
|
||||
|
||||
// Clone just returns the same state because it is not affected by pod additions or deletions.
|
||||
func (s *preFilterState) Clone() framework.StateData {
|
||||
return s
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *NodeAffinity) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
// A note about UpdateNodeTaint event:
|
||||
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
|
||||
if pl.enableSchedulingQueueHint {
|
||||
// preCheck is not used when QHint is enabled, and hence we can use UpdateNodeLabel instead of Update.
|
||||
nodeActionType = framework.Add | framework.UpdateNodeLabel
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterNodeChange is invoked whenever a node changed. It checks whether
|
||||
// that change made a previously unschedulable pod schedulable.
|
||||
func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
if pl.addedNodeSelector != nil && !pl.addedNodeSelector.Match(modifiedNode) {
|
||||
logger.V(4).Info("added or modified node didn't match scheduler-enforced node affinity and this event won't make the Pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
|
||||
isMatched, err := requiredNodeAffinity.Match(modifiedNode)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
if !isMatched {
|
||||
logger.V(5).Info("node was created or updated, but the pod's NodeAffinity doesn't match", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
// Since the node was added and it matches the pod's affinity criteria, we can unblock it.
|
||||
if originalNode == nil {
|
||||
logger.V(5).Info("node was created, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
// At this point we know the operation is update so we can narrow down the criteria to unmatch -> match changes only
|
||||
// (necessary affinity label was added to the node in this case).
|
||||
wasMatched, err := requiredNodeAffinity.Match(originalNode)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
if wasMatched {
|
||||
logger.V(5).Info("node updated, but the pod's NodeAffinity hasn't changed", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
logger.V(5).Info("node was updated and the pod's NodeAffinity changed to matched", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// PreFilter builds and writes cycle state used by Filter.
|
||||
func (pl *NodeAffinity) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
affinity := pod.Spec.Affinity
|
||||
noNodeAffinity := (affinity == nil ||
|
||||
affinity.NodeAffinity == nil ||
|
||||
affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil)
|
||||
if noNodeAffinity && pl.addedNodeSelector == nil && pod.Spec.NodeSelector == nil {
|
||||
// NodeAffinity Filter has nothing to do with the Pod.
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
state := &preFilterState{requiredNodeSelectorAndAffinity: nodeaffinity.GetRequiredNodeAffinity(pod)}
|
||||
cycleState.Write(preFilterStateKey, state)
|
||||
|
||||
if noNodeAffinity || len(affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Check if there is affinity to a specific node and return it.
|
||||
terms := affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
|
||||
var nodeNames sets.Set[string]
|
||||
for _, t := range terms {
|
||||
var termNodeNames sets.Set[string]
|
||||
for _, r := range t.MatchFields {
|
||||
if r.Key == metav1.ObjectNameField && r.Operator == v1.NodeSelectorOpIn {
|
||||
// The requirements represent ANDed constraints, and so we need to
|
||||
// find the intersection of nodes.
|
||||
s := sets.New(r.Values...)
|
||||
if termNodeNames == nil {
|
||||
termNodeNames = s
|
||||
} else {
|
||||
termNodeNames = termNodeNames.Intersection(s)
|
||||
}
|
||||
}
|
||||
}
|
||||
if termNodeNames == nil {
|
||||
// If this term has no node.Name field affinity,
|
||||
// then all nodes are eligible because the terms are ORed.
|
||||
return nil, nil
|
||||
}
|
||||
nodeNames = nodeNames.Union(termNodeNames)
|
||||
}
|
||||
// If nodeNames is not nil but its length is 0, the terms have conflicting node.Name affinities;
|
||||
// therefore, the pod will not match any node.
|
||||
if nodeNames != nil && len(nodeNames) == 0 {
|
||||
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, errReasonConflict)
|
||||
} else if len(nodeNames) > 0 {
|
||||
return &framework.PreFilterResult{NodeNames: nodeNames}, nil
|
||||
}
|
||||
return nil, nil
|
||||
|
||||
}
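// Editorial sketch (not part of the upstream source): a minimal illustration of
// the node-name narrowing above, assuming a pod whose required node affinity has
// two terms that both use the metadata.name MatchFields operator In. Within one
// term, MatchFields requirements are ANDed (intersected); across terms they are
// ORed (unioned). The node names are illustrative only.
//
//	term1 := sets.New("node-a", "node-b") // term 1: metadata.name In [node-a, node-b]
//	term2 := sets.New("node-b", "node-c") // term 2: metadata.name In [node-b, node-c]
//	nodeNames := term1.Union(term2)       // {node-a, node-b, node-c}
//	// PreFilter would return a PreFilterResult limited to these nodes; if every
//	// term's intersection were empty, it would instead return
//	// UnschedulableAndUnresolvable with errReasonConflict.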
|
||||
|
||||
// PreFilterExtensions is not necessary for this plugin as the state doesn't depend on pod additions or deletions.
|
||||
func (pl *NodeAffinity) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Filter checks if the Node matches the Pod .spec.affinity.nodeAffinity and
|
||||
// the plugin's added affinity.
|
||||
func (pl *NodeAffinity) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
node := nodeInfo.Node()
|
||||
|
||||
if pl.addedNodeSelector != nil && !pl.addedNodeSelector.Match(node) {
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, errReasonEnforced)
|
||||
}
|
||||
|
||||
s, err := getPreFilterState(state)
|
||||
if err != nil {
|
||||
// Fall back to calculating requiredNodeSelector and requiredNodeAffinity
|
||||
// here when PreFilter is disabled.
|
||||
s = &preFilterState{requiredNodeSelectorAndAffinity: nodeaffinity.GetRequiredNodeAffinity(pod)}
|
||||
}
|
||||
|
||||
// Ignore parsing errors for backwards compatibility.
|
||||
match, _ := s.requiredNodeSelectorAndAffinity.Match(node)
|
||||
if !match {
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonPod)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// preScoreState computed at PreScore and used at Score.
|
||||
type preScoreState struct {
|
||||
preferredNodeAffinity *nodeaffinity.PreferredSchedulingTerms
|
||||
}
|
||||
|
||||
// Clone implements the mandatory Clone interface. We don't really copy the data since
|
||||
// there is no need for that.
|
||||
func (s *preScoreState) Clone() framework.StateData {
|
||||
return s
|
||||
}
|
||||
|
||||
// PreScore builds and writes cycle state used by Score and NormalizeScore.
|
||||
func (pl *NodeAffinity) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
|
||||
if len(nodes) == 0 {
|
||||
return nil
|
||||
}
|
||||
preferredNodeAffinity, err := getPodPreferredNodeAffinity(pod)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
if preferredNodeAffinity == nil && pl.addedPrefSchedTerms == nil {
|
||||
// NodeAffinity Score has nothing to do with the Pod.
|
||||
return framework.NewStatus(framework.Skip)
|
||||
}
|
||||
state := &preScoreState{
|
||||
preferredNodeAffinity: preferredNodeAffinity,
|
||||
}
|
||||
cycleState.Write(preScoreStateKey, state)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Score returns the sum of the weights of the terms that match the Node.
|
||||
// Terms came from the Pod .spec.affinity.nodeAffinity and from the plugin's
|
||||
// default affinity.
|
||||
func (pl *NodeAffinity) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
node := nodeInfo.Node()
|
||||
|
||||
var count int64
|
||||
if pl.addedPrefSchedTerms != nil {
|
||||
count += pl.addedPrefSchedTerms.Score(node)
|
||||
}
|
||||
|
||||
s, err := getPreScoreState(state)
|
||||
if err != nil {
|
||||
// Fallback to calculate preferredNodeAffinity here when PreScore is disabled.
|
||||
preferredNodeAffinity, err := getPodPreferredNodeAffinity(pod)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(err)
|
||||
}
|
||||
s = &preScoreState{
|
||||
preferredNodeAffinity: preferredNodeAffinity,
|
||||
}
|
||||
}
|
||||
|
||||
if s.preferredNodeAffinity != nil {
|
||||
count += s.preferredNodeAffinity.Score(node)
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
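// Editorial sketch (not part of the upstream source): with illustrative weights,
// a pod whose preferred terms have weights 80 (matching the node) and 20 (not
// matching), plus a matching addedAffinity preferred term of weight 10, scores
// as follows. podPrefTerms and addedPrefTerms stand in for the
// PreferredSchedulingTerms built above.
//
//	var count int64
//	count += addedPrefTerms.Score(node) // 10, from the plugin's default affinity
//	count += podPrefTerms.Score(node)   // 80, from the pod's preferred terms
//	// count == 90; NormalizeScore below rescales raw scores across all nodes
//	// into [0, MaxNodeScore].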
|
||||
|
||||
// NormalizeScore invoked after scoring all nodes.
|
||||
func (pl *NodeAffinity) NormalizeScore(ctx context.Context, state *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
|
||||
return helper.DefaultNormalizeScore(framework.MaxNodeScore, false, scores)
|
||||
}
|
||||
|
||||
// ScoreExtensions of the Score plugin.
|
||||
func (pl *NodeAffinity) ScoreExtensions() framework.ScoreExtensions {
|
||||
return pl
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
args, err := getArgs(plArgs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pl := &NodeAffinity{
|
||||
handle: h,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
}
|
||||
if args.AddedAffinity != nil {
|
||||
if ns := args.AddedAffinity.RequiredDuringSchedulingIgnoredDuringExecution; ns != nil {
|
||||
pl.addedNodeSelector, err = nodeaffinity.NewNodeSelector(ns)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parsing addedAffinity.requiredDuringSchedulingIgnoredDuringExecution: %w", err)
|
||||
}
|
||||
}
|
||||
// TODO: parse requiredDuringSchedulingRequiredDuringExecution when it gets added to the API.
|
||||
if terms := args.AddedAffinity.PreferredDuringSchedulingIgnoredDuringExecution; len(terms) != 0 {
|
||||
pl.addedPrefSchedTerms, err = nodeaffinity.NewPreferredSchedulingTerms(terms)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parsing addedAffinity.preferredDuringSchedulingIgnoredDuringExecution: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return pl, nil
|
||||
}
|
||||
|
||||
func getArgs(obj runtime.Object) (config.NodeAffinityArgs, error) {
|
||||
ptr, ok := obj.(*config.NodeAffinityArgs)
|
||||
if !ok {
|
||||
return config.NodeAffinityArgs{}, fmt.Errorf("args are not of type NodeAffinityArgs, got %T", obj)
|
||||
}
|
||||
return *ptr, validation.ValidateNodeAffinityArgs(nil, ptr)
|
||||
}
|
||||
|
||||
func getPodPreferredNodeAffinity(pod *v1.Pod) (*nodeaffinity.PreferredSchedulingTerms, error) {
|
||||
affinity := pod.Spec.Affinity
|
||||
if affinity != nil && affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil {
|
||||
return nodeaffinity.NewPreferredSchedulingTerms(affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution)
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
|
||||
c, err := cycleState.Read(preScoreStateKey)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading %q from cycleState: %w", preScoreStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(*preScoreState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
|
||||
c, err := cycleState.Read(preFilterStateKey)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading %q from cycleState: %v", preFilterStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(*preFilterState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("invalid PreFilter state, got type %T", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
89
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename/node_name.go
generated
vendored
Normal file
@ -0,0 +1,89 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nodename
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
)
|
||||
|
||||
// NodeName is a plugin that checks if a pod spec node name matches the current node.
|
||||
type NodeName struct {
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.FilterPlugin = &NodeName{}
|
||||
var _ framework.EnqueueExtensions = &NodeName{}
|
||||
|
||||
const (
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
Name = names.NodeName
|
||||
|
||||
// ErrReason returned when node name doesn't match.
|
||||
ErrReason = "node(s) didn't match the requested node name"
|
||||
)
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// that previously failed this plugin schedulable.
|
||||
func (pl *NodeName) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
// A note about UpdateNodeTaint/UpdateNodeLabel event:
|
||||
// Ideally, it's supposed to register only Add, because no Node update event can change the result of this plugin.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
|
||||
if pl.enableSchedulingQueueHint {
|
||||
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
|
||||
nodeActionType = framework.Add
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
// We don't need the QueueingHintFn here because the scheduling of Pods will always be retried with backoff when this Event happens.
|
||||
// (the same as Queue)
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *NodeName) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
func (pl *NodeName) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
|
||||
if !Fits(pod, nodeInfo) {
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReason)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fits actually checks if the pod fits the node.
|
||||
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
|
||||
return len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == nodeInfo.Node().Name
|
||||
}
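// Editorial sketch (not part of the upstream source): Fits is a plain string
// comparison, so an empty .spec.nodeName fits every node. With an illustrative
// node named "worker-1" (infoForNode is a hypothetical helper returning a
// *framework.NodeInfo for a node with that name):
//
//	pod := &v1.Pod{Spec: v1.PodSpec{NodeName: "worker-1"}}
//	// Fits(pod, infoForNode("worker-1")) == true
//	// Fits(pod, infoForNode("worker-2")) == false
//	// A pod with NodeName == "" would fit both.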
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
return &NodeName{
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
}, nil
|
||||
}
|
215
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports/node_ports.go
generated
vendored
Normal file
@ -0,0 +1,215 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nodeports
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// NodePorts is a plugin that checks if a node has free ports for the requested pod ports.
|
||||
type NodePorts struct {
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.PreFilterPlugin = &NodePorts{}
|
||||
var _ framework.FilterPlugin = &NodePorts{}
|
||||
var _ framework.EnqueueExtensions = &NodePorts{}
|
||||
|
||||
const (
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
Name = names.NodePorts
|
||||
|
||||
// preFilterStateKey is the key in CycleState to NodePorts pre-computed data.
|
||||
// Using the name of the plugin will likely help us avoid collisions with other plugins.
|
||||
preFilterStateKey = "PreFilter" + Name
|
||||
|
||||
// ErrReason when node ports aren't available.
|
||||
ErrReason = "node(s) didn't have free ports for the requested pod ports"
|
||||
)
|
||||
|
||||
type preFilterState []*v1.ContainerPort
|
||||
|
||||
// Clone the prefilter state.
|
||||
func (s preFilterState) Clone() framework.StateData {
|
||||
// The state is not impacted by adding/removing existing pods, hence we don't need to make a deep copy.
|
||||
return s
|
||||
}
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *NodePorts) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// getContainerPorts returns the host ports used by the given Pods: only container ports that
|
||||
// specify a HostPort are included; it does not resolve port conflicts.
|
||||
func getContainerPorts(pods ...*v1.Pod) []*v1.ContainerPort {
|
||||
ports := []*v1.ContainerPort{}
|
||||
for _, pod := range pods {
|
||||
for j := range pod.Spec.Containers {
|
||||
container := &pod.Spec.Containers[j]
|
||||
for k := range container.Ports {
|
||||
// Only return ports with a host port specified.
|
||||
if container.Ports[k].HostPort <= 0 {
|
||||
continue
|
||||
}
|
||||
ports = append(ports, &container.Ports[k])
|
||||
}
|
||||
}
|
||||
}
|
||||
return ports
|
||||
}
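// Editorial sketch (not part of the upstream source): only container ports that
// declare a HostPort are collected. For an illustrative pod:
//
//	pod := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
//		Ports: []v1.ContainerPort{
//			{ContainerPort: 8080},                 // no HostPort -> skipped
//			{ContainerPort: 9090, HostPort: 9090}, // collected
//		},
//	}}}}
//	// getContainerPorts(pod) returns a single entry, the 9090 HostPort.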
|
||||
|
||||
// PreFilter invoked at the prefilter extension point.
|
||||
func (pl *NodePorts) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
s := getContainerPorts(pod)
|
||||
// Skip if a pod has no ports.
|
||||
if len(s) == 0 {
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
cycleState.Write(preFilterStateKey, preFilterState(s))
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions do not exist for this plugin.
|
||||
func (pl *NodePorts) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreFilterState(cycleState *framework.CycleState) (preFilterState, error) {
|
||||
c, err := cycleState.Read(preFilterStateKey)
|
||||
if err != nil {
|
||||
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
|
||||
return nil, fmt.Errorf("reading %q from cycleState: %w", preFilterStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(preFilterState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%+v convert to nodeports.preFilterState error", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// that previously failed this plugin schedulable.
|
||||
func (pl *NodePorts) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
// A note about UpdateNodeTaint/UpdateNodeLabel event:
|
||||
// Ideally, it's supposed to register only Add, because a Node update never frees up ports for the Pod.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
|
||||
if pl.enableSchedulingQueueHint {
|
||||
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
|
||||
nodeActionType = framework.Add
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
// Because the `spec.containers[*].ports` field is immutable, Pod update events are ignored.
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
|
||||
// We don't need the QueueingHintFn here because the scheduling of Pods will always be retried with backoff when this Event happens.
|
||||
// (the same as Queue)
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPodDeleted is invoked whenever a pod is deleted. It checks whether
|
||||
// that change made a previously unschedulable pod schedulable.
|
||||
func (pl *NodePorts) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
deletedPod, _, err := util.As[*v1.Pod](oldObj, nil)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
// If the deleted pod is unscheduled, it doesn't make the target pod schedulable.
|
||||
if deletedPod.Spec.NodeName == "" {
|
||||
logger.V(4).Info("the deleted pod is unscheduled and it doesn't make the target pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// Get the used host ports of the deleted pod.
|
||||
usedPorts := make(framework.HostPortInfo)
|
||||
for _, container := range deletedPod.Spec.Containers {
|
||||
for _, podPort := range container.Ports {
|
||||
if podPort.HostPort > 0 {
|
||||
usedPorts.Add(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the deleted pod doesn't use any host ports, it doesn't make the target pod schedulable.
|
||||
if len(usedPorts) == 0 {
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// Construct a fake NodeInfo that only has the deleted Pod.
|
||||
// If we can schedule `pod` to this fake node, it means that `pod` and the deleted pod don't have any common port(s).
|
||||
// So, deleting that pod couldn't make `pod` schedulable.
|
||||
nodeInfo := framework.NodeInfo{UsedPorts: usedPorts}
|
||||
if Fits(pod, &nodeInfo) {
|
||||
logger.V(4).Info("the deleted pod and the target pod don't have any common port(s), returning QueueSkip as deleting this Pod won't make the Pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
logger.V(4).Info("the deleted pod and the target pod have any common port(s), returning Queue as deleting this Pod may make the Pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
func (pl *NodePorts) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
wantPorts, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
fits := fitsPorts(wantPorts, nodeInfo)
|
||||
if !fits {
|
||||
return framework.NewStatus(framework.Unschedulable, ErrReason)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fits checks if the pod fits the node.
|
||||
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
|
||||
return fitsPorts(getContainerPorts(pod), nodeInfo)
|
||||
}
|
||||
|
||||
func fitsPorts(wantPorts []*v1.ContainerPort, nodeInfo *framework.NodeInfo) bool {
|
||||
// try to see whether existingPorts and wantPorts will conflict or not
|
||||
existingPorts := nodeInfo.UsedPorts
|
||||
for _, cp := range wantPorts {
|
||||
if existingPorts.CheckConflict(cp.HostIP, string(cp.Protocol), cp.HostPort) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
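// Editorial sketch (not part of the upstream source): conflicts are keyed by
// (hostIP, protocol, hostPort), so the same host port can be reused on a node
// as long as the IPs or protocols differ; the wildcard IP overlaps with all of
// them. Values are illustrative.
//
//	used := make(framework.HostPortInfo)
//	used.Add("127.0.0.1", "TCP", 9090)
//	_ = used.CheckConflict("127.0.0.2", "TCP", 9090) // false: different host IP
//	_ = used.CheckConflict("0.0.0.0", "TCP", 9090)   // true: wildcard IP overlaps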
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
return &NodePorts{
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
}, nil
|
||||
}
|
173
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/balanced_allocation.go
generated
vendored
Normal file
@ -0,0 +1,173 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package noderesources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
)
|
||||
|
||||
// BalancedAllocation is a score plugin that calculates the difference between the cpu and memory fraction
|
||||
// of capacity, and prioritizes the host based on how close the two metrics are to each other.
|
||||
type BalancedAllocation struct {
|
||||
handle framework.Handle
|
||||
resourceAllocationScorer
|
||||
}
|
||||
|
||||
var _ framework.PreScorePlugin = &BalancedAllocation{}
|
||||
var _ framework.ScorePlugin = &BalancedAllocation{}
|
||||
|
||||
// BalancedAllocationName is the name of the plugin used in the plugin registry and configurations.
|
||||
const (
|
||||
BalancedAllocationName = names.NodeResourcesBalancedAllocation
|
||||
|
||||
// balancedAllocationPreScoreStateKey is the key in CycleState to NodeResourcesBalancedAllocation pre-computed data for Scoring.
|
||||
balancedAllocationPreScoreStateKey = "PreScore" + BalancedAllocationName
|
||||
)
|
||||
|
||||
// balancedAllocationPreScoreState computed at PreScore and used at Score.
|
||||
type balancedAllocationPreScoreState struct {
|
||||
// podRequests have the same order as the resources defined in NodeResourcesBalancedAllocationArgs.Resources;
|
||||
// the same applies to other places where such a list is stored.
|
||||
podRequests []int64
|
||||
}
|
||||
|
||||
// Clone implements the mandatory Clone interface. We don't really copy the data since
|
||||
// there is no need for that.
|
||||
func (s *balancedAllocationPreScoreState) Clone() framework.StateData {
|
||||
return s
|
||||
}
|
||||
|
||||
// PreScore calculates the incoming pod's resource requests and writes them to the cycle state for later use at Score.
|
||||
func (ba *BalancedAllocation) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
|
||||
state := &balancedAllocationPreScoreState{
|
||||
podRequests: ba.calculatePodResourceRequestList(pod, ba.resources),
|
||||
}
|
||||
cycleState.Write(balancedAllocationPreScoreStateKey, state)
|
||||
return nil
|
||||
}
|
||||
|
||||
func getBalancedAllocationPreScoreState(cycleState *framework.CycleState) (*balancedAllocationPreScoreState, error) {
|
||||
c, err := cycleState.Read(balancedAllocationPreScoreStateKey)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading %q from cycleState: %w", balancedAllocationPreScoreStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(*balancedAllocationPreScoreState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (ba *BalancedAllocation) Name() string {
|
||||
return BalancedAllocationName
|
||||
}
|
||||
|
||||
// Score invoked at the score extension point.
|
||||
func (ba *BalancedAllocation) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := ba.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
s, err := getBalancedAllocationPreScoreState(state)
|
||||
if err != nil {
|
||||
s = &balancedAllocationPreScoreState{podRequests: ba.calculatePodResourceRequestList(pod, ba.resources)}
|
||||
}
|
||||
|
||||
// ba.score favors nodes with balanced resource usage rate.
|
||||
// It calculates the standard deviation for those resources and prioritizes the node based on how close the usage of those resources is to each other.
|
||||
// Detail: score = (1 - std) * MaxNodeScore, where std is calculated as the square root of Σ((fraction(i)-mean)^2)/len(resources)
|
||||
// The algorithm is partly inspired by:
|
||||
// "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization"
|
||||
return ba.score(ctx, pod, nodeInfo, s.podRequests)
|
||||
}
|
||||
|
||||
// ScoreExtensions of the Score plugin.
|
||||
func (ba *BalancedAllocation) ScoreExtensions() framework.ScoreExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
// NewBalancedAllocation initializes a new plugin and returns it.
|
||||
func NewBalancedAllocation(_ context.Context, baArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
args, ok := baArgs.(*config.NodeResourcesBalancedAllocationArgs)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("want args to be of type NodeResourcesBalancedAllocationArgs, got %T", baArgs)
|
||||
}
|
||||
|
||||
if err := validation.ValidateNodeResourcesBalancedAllocationArgs(nil, args); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &BalancedAllocation{
|
||||
handle: h,
|
||||
resourceAllocationScorer: resourceAllocationScorer{
|
||||
Name: BalancedAllocationName,
|
||||
scorer: balancedResourceScorer,
|
||||
useRequested: true,
|
||||
resources: args.Resources,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func balancedResourceScorer(requested, allocable []int64) int64 {
|
||||
var resourceToFractions []float64
|
||||
var totalFraction float64
|
||||
for i := range requested {
|
||||
if allocable[i] == 0 {
|
||||
continue
|
||||
}
|
||||
fraction := float64(requested[i]) / float64(allocable[i])
|
||||
if fraction > 1 {
|
||||
fraction = 1
|
||||
}
|
||||
totalFraction += fraction
|
||||
resourceToFractions = append(resourceToFractions, fraction)
|
||||
}
|
||||
|
||||
std := 0.0
|
||||
|
||||
// In most cases, resources are limited to cpu and memory, so the std can be simplified to std := |fraction1-fraction2|/2
|
||||
// For len(fractions) > 2: calculate std with the well-known formula - the square root of Σ((fraction(i)-mean)^2)/len(fractions)
|
||||
// Otherwise, leaving std at zero is enough.
|
||||
if len(resourceToFractions) == 2 {
|
||||
std = math.Abs((resourceToFractions[0] - resourceToFractions[1]) / 2)
|
||||
|
||||
} else if len(resourceToFractions) > 2 {
|
||||
mean := totalFraction / float64(len(resourceToFractions))
|
||||
var sum float64
|
||||
for _, fraction := range resourceToFractions {
|
||||
sum = sum + (fraction-mean)*(fraction-mean)
|
||||
}
|
||||
std = math.Sqrt(sum / float64(len(resourceToFractions)))
|
||||
}
|
||||
|
||||
// STD (standard deviation) is always non-negative. (1 - std) gives a higher score to the node with the least deviation, and
|
||||
// multiplying it with `MaxNodeScore` provides the scaling factor needed.
|
||||
return int64((1 - std) * float64(framework.MaxNodeScore))
|
||||
}
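// Editorial sketch (not part of the upstream source): a worked two-resource
// example with illustrative numbers. A pod requesting 500m CPU of 1000m
// allocatable and 1Gi memory of 4Gi allocatable gives fractions 0.5 and 0.25:
//
//	fractions := []float64{0.5, 0.25}
//	std := math.Abs((fractions[0] - fractions[1]) / 2)          // 0.125
//	score := int64((1 - std) * float64(framework.MaxNodeScore)) // 87
//
// Perfectly balanced usage (equal fractions) would score the full MaxNodeScore.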
|
596
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/fit.go
generated
vendored
Normal file
@ -0,0 +1,596 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package noderesources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/component-helpers/resource"
|
||||
"k8s.io/klog/v2"
|
||||
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
var _ framework.PreFilterPlugin = &Fit{}
|
||||
var _ framework.FilterPlugin = &Fit{}
|
||||
var _ framework.EnqueueExtensions = &Fit{}
|
||||
var _ framework.PreScorePlugin = &Fit{}
|
||||
var _ framework.ScorePlugin = &Fit{}
|
||||
|
||||
const (
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
Name = names.NodeResourcesFit
|
||||
|
||||
// preFilterStateKey is the key in CycleState to NodeResourcesFit pre-computed data.
|
||||
// Using the name of the plugin will likely help us avoid collisions with other plugins.
|
||||
preFilterStateKey = "PreFilter" + Name
|
||||
|
||||
// preScoreStateKey is the key in CycleState to NodeResourcesFit pre-computed data for Scoring.
|
||||
preScoreStateKey = "PreScore" + Name
|
||||
)
|
||||
|
||||
// nodeResourceStrategyTypeMap maps strategy to scorer implementation
|
||||
var nodeResourceStrategyTypeMap = map[config.ScoringStrategyType]scorer{
|
||||
config.LeastAllocated: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
|
||||
resources := args.ScoringStrategy.Resources
|
||||
return &resourceAllocationScorer{
|
||||
Name: string(config.LeastAllocated),
|
||||
scorer: leastResourceScorer(resources),
|
||||
resources: resources,
|
||||
}
|
||||
},
|
||||
config.MostAllocated: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
|
||||
resources := args.ScoringStrategy.Resources
|
||||
return &resourceAllocationScorer{
|
||||
Name: string(config.MostAllocated),
|
||||
scorer: mostResourceScorer(resources),
|
||||
resources: resources,
|
||||
}
|
||||
},
|
||||
config.RequestedToCapacityRatio: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
|
||||
resources := args.ScoringStrategy.Resources
|
||||
return &resourceAllocationScorer{
|
||||
Name: string(config.RequestedToCapacityRatio),
|
||||
scorer: requestedToCapacityRatioScorer(resources, args.ScoringStrategy.RequestedToCapacityRatio.Shape),
|
||||
resources: resources,
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
// Fit is a plugin that checks if a node has sufficient resources.
|
||||
type Fit struct {
|
||||
ignoredResources sets.Set[string]
|
||||
ignoredResourceGroups sets.Set[string]
|
||||
enableInPlacePodVerticalScaling bool
|
||||
enableSidecarContainers bool
|
||||
enableSchedulingQueueHint bool
|
||||
enablePodLevelResources bool
|
||||
handle framework.Handle
|
||||
resourceAllocationScorer
|
||||
}
|
||||
|
||||
// ScoreExtensions of the Score plugin.
|
||||
func (f *Fit) ScoreExtensions() framework.ScoreExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
// preFilterState computed at PreFilter and used at Filter.
|
||||
type preFilterState struct {
|
||||
framework.Resource
|
||||
}
|
||||
|
||||
// Clone the prefilter state.
|
||||
func (s *preFilterState) Clone() framework.StateData {
|
||||
return s
|
||||
}
|
||||
|
||||
// preScoreState computed at PreScore and used at Score.
|
||||
type preScoreState struct {
|
||||
// podRequests have the same order as the resources defined in NodeResourcesFitArgs.ScoringStrategy.Resources;
|
||||
// the same applies to other places where such a list is stored.
|
||||
podRequests []int64
|
||||
}
|
||||
|
||||
// Clone implements the mandatory Clone interface. We don't really copy the data since
|
||||
// there is no need for that.
|
||||
func (s *preScoreState) Clone() framework.StateData {
|
||||
return s
|
||||
}
|
||||
|
||||
// PreScore calculates the incoming pod's resource requests and writes them to the cycle state for later use at Score.
|
||||
func (f *Fit) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
|
||||
state := &preScoreState{
|
||||
podRequests: f.calculatePodResourceRequestList(pod, f.resources),
|
||||
}
|
||||
cycleState.Write(preScoreStateKey, state)
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
|
||||
c, err := cycleState.Read(preScoreStateKey)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading %q from cycleState: %w", preScoreStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(*preScoreState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (f *Fit) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// NewFit initializes a new plugin and returns it.
|
||||
func NewFit(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
args, ok := plArgs.(*config.NodeResourcesFitArgs)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("want args to be of type NodeResourcesFitArgs, got %T", plArgs)
|
||||
}
|
||||
if err := validation.ValidateNodeResourcesFitArgs(nil, args); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if args.ScoringStrategy == nil {
|
||||
return nil, fmt.Errorf("scoring strategy not specified")
|
||||
}
|
||||
|
||||
strategy := args.ScoringStrategy.Type
|
||||
scorePlugin, exists := nodeResourceStrategyTypeMap[strategy]
|
||||
if !exists {
|
||||
return nil, fmt.Errorf("scoring strategy %s is not supported", strategy)
|
||||
}
|
||||
|
||||
return &Fit{
|
||||
ignoredResources: sets.New(args.IgnoredResources...),
|
||||
ignoredResourceGroups: sets.New(args.IgnoredResourceGroups...),
|
||||
enableInPlacePodVerticalScaling: fts.EnableInPlacePodVerticalScaling,
|
||||
enableSidecarContainers: fts.EnableSidecarContainers,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
handle: h,
|
||||
enablePodLevelResources: fts.EnablePodLevelResources,
|
||||
resourceAllocationScorer: *scorePlugin(args),
|
||||
}, nil
|
||||
}
|
||||
|
||||
type ResourceRequestsOptions struct {
|
||||
EnablePodLevelResources bool
|
||||
}
|
||||
|
||||
// computePodResourceRequest returns a framework.Resource that covers the largest
|
||||
// width in each resource dimension. Because init-containers run sequentially, we collect
|
||||
// the max in each dimension iteratively. In contrast, we sum the resource vectors for
|
||||
// regular containers since they run simultaneously.
|
||||
//
|
||||
// # The resources defined for Overhead should be added to the calculated Resource request sum
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// Pod:
|
||||
//
|
||||
// InitContainers
|
||||
// IC1:
|
||||
// CPU: 2
|
||||
// Memory: 1G
|
||||
// IC2:
|
||||
// CPU: 2
|
||||
// Memory: 3G
|
||||
// Containers
|
||||
// C1:
|
||||
// CPU: 2
|
||||
// Memory: 1G
|
||||
// C2:
|
||||
// CPU: 1
|
||||
// Memory: 1G
|
||||
//
|
||||
// Result: CPU: 3, Memory: 3G
|
||||
// TODO(ndixita): modify computePodResourceRequest to accept opts of type
|
||||
// ResourceRequestOptions as the second parameter.
|
||||
func computePodResourceRequest(pod *v1.Pod, opts ResourceRequestsOptions) *preFilterState {
|
||||
// The pod hasn't been scheduled yet, so we don't need to worry about InPlacePodVerticalScalingEnabled.
|
||||
reqs := resource.PodRequests(pod, resource.PodResourcesOptions{
|
||||
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
|
||||
SkipPodLevelResources: !opts.EnablePodLevelResources,
|
||||
})
|
||||
result := &preFilterState{}
|
||||
result.SetMaxResource(reqs)
|
||||
return result
|
||||
}
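// Editorial sketch (not part of the upstream source): the max/sum rule from the
// example in the comment above, expressed with illustrative milli-CPU values.
//
//	initMax := max(2000, 2000)  // init containers run sequentially -> take the max
//	regSum := 2000 + 1000       // regular containers run together -> sum them
//	cpu := max(initMax, regSum) // 3000m, matching "Result: CPU: 3"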
|
||||
|
||||
// PreFilter invoked at the prefilter extension point.
|
||||
func (f *Fit) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
if !f.enableSidecarContainers && hasRestartableInitContainer(pod) {
|
||||
// The scheduler will calculate the resource usage of a Pod containing
|
||||
// restartable init containers to be equal to or greater than what the kubelet will
|
||||
// require to run the Pod, so there will be no overbooking. However, to
|
||||
// avoid the inconsistency in resource calculation between the scheduler
|
||||
// and the older (before v1.28) kubelet, make the Pod unschedulable.
|
||||
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "Pod has a restartable init container and the SidecarContainers feature is disabled")
|
||||
}
|
||||
cycleState.Write(preFilterStateKey, computePodResourceRequest(pod, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}))
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions is not necessary for this plugin; the computed state doesn't depend on pod additions or deletions.
|
||||
func (f *Fit) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
|
||||
c, err := cycleState.Read(preFilterStateKey)
|
||||
if err != nil {
|
||||
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
|
||||
return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(*preFilterState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%+v convert to NodeResourcesFit.preFilterState error", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// that previously failed this plugin schedulable.
|
||||
func (f *Fit) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
podActionType := framework.Delete
|
||||
if f.enableInPlacePodVerticalScaling {
|
||||
// If InPlacePodVerticalScaling (KEP 1287) is enabled, then UpdatePodScaleDown event should be registered
|
||||
// for this plugin since a Pod update may free up resources that make other Pods schedulable.
|
||||
podActionType |= framework.UpdatePodScaleDown
|
||||
}
|
||||
|
||||
// A note about UpdateNodeTaint/UpdateNodeLabel event:
|
||||
// Ideally, it's supposed to register only Add | UpdateNodeAllocatable, because only a resource update could change this plugin's result.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
nodeActionType := framework.Add | framework.UpdateNodeAllocatable | framework.UpdateNodeTaint | framework.UpdateNodeLabel
|
||||
if f.enableSchedulingQueueHint {
|
||||
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
|
||||
nodeActionType = framework.Add | framework.UpdateNodeAllocatable
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: podActionType}, QueueingHintFn: f.isSchedulableAfterPodEvent},
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: f.isSchedulableAfterNodeChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPodEvent is invoked whenever a pod is deleted or scaled down. It checks whether
|
||||
// that change made a previously unschedulable pod schedulable.
|
||||
func (f *Fit) isSchedulableAfterPodEvent(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalPod, modifiedPod, err := schedutil.As[*v1.Pod](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
if modifiedPod == nil {
|
||||
if originalPod.Spec.NodeName == "" {
|
||||
logger.V(5).Info("the deleted pod was unscheduled and it wouldn't make the unscheduled pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// any deletion of a scheduled pod could make the unscheduled pod schedulable.
|
||||
logger.V(5).Info("another scheduled pod was deleted, and it may make the unscheduled pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
if !f.enableInPlacePodVerticalScaling {
|
||||
// If InPlacePodVerticalScaling (KEP 1287) is disabled, the pod scale down event cannot free up any resources.
|
||||
logger.V(5).Info("another pod was modified, but InPlacePodVerticalScaling is disabled, so it doesn't make the unscheduled pod schedulable", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
if !f.isSchedulableAfterPodScaleDown(pod, originalPod, modifiedPod) {
|
||||
if loggerV := logger.V(10); loggerV.Enabled() {
|
||||
// Log more information.
|
||||
loggerV.Info("pod got scaled down, but the modification isn't related to the resource requests of the target pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod), "diff", cmp.Diff(originalPod, modifiedPod))
|
||||
} else {
|
||||
logger.V(5).Info("pod got scaled down, but the modification isn't related to the resource requests of the target pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
|
||||
}
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("another scheduled pod or the target pod itself got scaled down, and it may make the unscheduled pod schedulable", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPodScaleDown checks whether the scale down event may make the target pod schedulable. Specifically:
|
||||
// - Returns true when the update event is for the target pod itself.
|
||||
// - Returns true when the update event shows that a resource request of a scheduled pod, which the target pod also requests, was reduced.
|
||||
func (f *Fit) isSchedulableAfterPodScaleDown(targetPod, originalPod, modifiedPod *v1.Pod) bool {
|
||||
if modifiedPod.UID == targetPod.UID {
|
||||
// If the scaling down event is for targetPod, it would make targetPod schedulable.
|
||||
return true
|
||||
}
|
||||
|
||||
if modifiedPod.Spec.NodeName == "" {
|
||||
// If the update event is for an unscheduled Pod,
|
||||
// it wouldn't make targetPod schedulable.
|
||||
return false
|
||||
}
|
||||
|
||||
// the other pod was scheduled, so modification or deletion may free up some resources.
|
||||
originalMaxResourceReq, modifiedMaxResourceReq := &framework.Resource{}, &framework.Resource{}
|
||||
originalMaxResourceReq.SetMaxResource(resource.PodRequests(originalPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling}))
|
||||
modifiedMaxResourceReq.SetMaxResource(resource.PodRequests(modifiedPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling}))
|
||||
|
||||
// Check whether the resource requests of the modified pod are less than those of the original pod.
|
||||
podRequests := resource.PodRequests(targetPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling})
|
||||
for rName, rValue := range podRequests {
|
||||
if rValue.IsZero() {
|
||||
// We only care about the resources requested by the pod we are trying to schedule.
|
||||
continue
|
||||
}
|
||||
switch rName {
|
||||
case v1.ResourceCPU:
|
||||
if originalMaxResourceReq.MilliCPU > modifiedMaxResourceReq.MilliCPU {
|
||||
return true
|
||||
}
|
||||
case v1.ResourceMemory:
|
||||
if originalMaxResourceReq.Memory > modifiedMaxResourceReq.Memory {
|
||||
return true
|
||||
}
|
||||
case v1.ResourceEphemeralStorage:
|
||||
if originalMaxResourceReq.EphemeralStorage > modifiedMaxResourceReq.EphemeralStorage {
|
||||
return true
|
||||
}
|
||||
default:
|
||||
if schedutil.IsScalarResourceName(rName) && originalMaxResourceReq.ScalarResources[rName] > modifiedMaxResourceReq.ScalarResources[rName] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
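// Editorial sketch (not part of the upstream source): only a reduction in a
// resource the target pod actually requests counts. If the target pod requests
// CPU only and another scheduled pod is scaled down from 2000m to 1000m CPU,
// this returns true; a memory-only scale-down of that same pod returns false,
// because the target pod's memory request is zero and is skipped above.
//
//	// illustrative values: originalMaxResourceReq.MilliCPU == 2000,
//	// modifiedMaxResourceReq.MilliCPU == 1000, target pod requests CPU -> true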
|
||||
|
||||
// isSchedulableAfterNodeChange is invoked whenever a node is added or changed. It checks whether
|
||||
// that change could make a previously unschedulable pod schedulable.
|
||||
func (f *Fit) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalNode, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
// Leave the pod in the queue, since it won't fit into the modified node anyway.
|
||||
if !isFit(pod, modifiedNode, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}) {
|
||||
logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
// The pod would fit; if the node was newly added, unblock scheduling.
|
||||
if originalNode == nil {
|
||||
logger.V(5).Info("node was added and it might fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
// The pod will fit, but since there was no increase in available resources, the change won't make the pod schedulable.
|
||||
if !haveAnyRequestedResourcesIncreased(pod, originalNode, modifiedNode, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}) {
|
||||
logger.V(5).Info("node was updated, but haven't changed the pod's resource requestments fit assessment", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("node was updated, and may now fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// haveAnyRequestedResourcesIncreased returns true if the node's allocatable amount of any resource requested by the pod has increased, or if the allowed pod number has increased.
|
||||
func haveAnyRequestedResourcesIncreased(pod *v1.Pod, originalNode, modifiedNode *v1.Node, opts ResourceRequestsOptions) bool {
|
||||
podRequest := computePodResourceRequest(pod, opts)
|
||||
originalNodeInfo := framework.NewNodeInfo()
|
||||
originalNodeInfo.SetNode(originalNode)
|
||||
modifiedNodeInfo := framework.NewNodeInfo()
|
||||
modifiedNodeInfo.SetNode(modifiedNode)
|
||||
|
||||
if modifiedNodeInfo.Allocatable.AllowedPodNumber > originalNodeInfo.Allocatable.AllowedPodNumber {
|
||||
return true
|
||||
}
|
||||
|
||||
if podRequest.MilliCPU == 0 &&
|
||||
podRequest.Memory == 0 &&
|
||||
podRequest.EphemeralStorage == 0 &&
|
||||
len(podRequest.ScalarResources) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
if (podRequest.MilliCPU > 0 && modifiedNodeInfo.Allocatable.MilliCPU > originalNodeInfo.Allocatable.MilliCPU) ||
|
||||
(podRequest.Memory > 0 && modifiedNodeInfo.Allocatable.Memory > originalNodeInfo.Allocatable.Memory) ||
|
||||
(podRequest.EphemeralStorage > 0 && modifiedNodeInfo.Allocatable.EphemeralStorage > originalNodeInfo.Allocatable.EphemeralStorage) {
|
||||
return true
|
||||
}
|
||||
|
||||
for rName, rQuant := range podRequest.ScalarResources {
|
||||
// Skip in case request quantity is zero
|
||||
if rQuant == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if modifiedNodeInfo.Allocatable.ScalarResources[rName] > originalNodeInfo.Allocatable.ScalarResources[rName] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// isFit checks if the pod fits the node. If the node is nil, it returns false.
|
||||
// It constructs a fake NodeInfo object for the node and checks if the pod fits the node.
|
||||
func isFit(pod *v1.Pod, node *v1.Node, opts ResourceRequestsOptions) bool {
|
||||
if node == nil {
|
||||
return false
|
||||
}
|
||||
nodeInfo := framework.NewNodeInfo()
|
||||
nodeInfo.SetNode(node)
|
||||
return len(Fits(pod, nodeInfo, opts)) == 0
|
||||
}
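// Editorial sketch (not part of the upstream source): the queueing hint builds a
// throwaway NodeInfo for just the updated node and reuses Fits, e.g.:
//
//	ni := framework.NewNodeInfo()
//	ni.SetNode(modifiedNode) // the *v1.Node from the update event
//	fits := len(Fits(pod, ni, ResourceRequestsOptions{})) == 0
//	// fits reports whether the pod would fit this node in isolation,
//	// ignoring the pods already running on it.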
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
// Checks if a node has sufficient resources, such as cpu, memory, gpu, opaque int resources etc to run a pod.
|
||||
// It returns a list of insufficient resources; if the list is empty, the node has all the resources requested by the pod.
|
||||
func (f *Fit) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
s, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
insufficientResources := fitsRequest(s, nodeInfo, f.ignoredResources, f.ignoredResourceGroups)
|
||||
|
||||
if len(insufficientResources) != 0 {
|
||||
// We will keep all failure reasons.
|
||||
failureReasons := make([]string, 0, len(insufficientResources))
|
||||
for i := range insufficientResources {
|
||||
failureReasons = append(failureReasons, insufficientResources[i].Reason)
|
||||
}
|
||||
return framework.NewStatus(framework.Unschedulable, failureReasons...)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func hasRestartableInitContainer(pod *v1.Pod) bool {
|
||||
for _, c := range pod.Spec.InitContainers {
|
||||
if c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// InsufficientResource describes what kind of resource limit is hit and caused the pod to not fit the node.
|
||||
type InsufficientResource struct {
|
||||
ResourceName v1.ResourceName
|
||||
// We explicitly have a parameter for reason to avoid formatting a message on the fly
|
||||
// for common resources, which is expensive for cluster autoscaler simulations.
|
||||
Reason string
|
||||
Requested int64
|
||||
Used int64
|
||||
Capacity int64
|
||||
}
|
||||
|
||||
// Fits checks if the node has enough resources to host the pod.
|
||||
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo, opts ResourceRequestsOptions) []InsufficientResource {
|
||||
return fitsRequest(computePodResourceRequest(pod, opts), nodeInfo, nil, nil)
|
||||
}
|
||||
|
||||
func fitsRequest(podRequest *preFilterState, nodeInfo *framework.NodeInfo, ignoredExtendedResources, ignoredResourceGroups sets.Set[string]) []InsufficientResource {
|
||||
insufficientResources := make([]InsufficientResource, 0, 4)
|
||||
|
||||
allowedPodNumber := nodeInfo.Allocatable.AllowedPodNumber
|
||||
if len(nodeInfo.Pods)+1 > allowedPodNumber {
|
||||
insufficientResources = append(insufficientResources, InsufficientResource{
|
||||
ResourceName: v1.ResourcePods,
|
||||
Reason: "Too many pods",
|
||||
Requested: 1,
|
||||
Used: int64(len(nodeInfo.Pods)),
|
||||
Capacity: int64(allowedPodNumber),
|
||||
})
|
||||
}
|
||||
|
||||
if podRequest.MilliCPU == 0 &&
|
||||
podRequest.Memory == 0 &&
|
||||
podRequest.EphemeralStorage == 0 &&
|
||||
len(podRequest.ScalarResources) == 0 {
|
||||
return insufficientResources
|
||||
}
|
||||
|
||||
if podRequest.MilliCPU > 0 && podRequest.MilliCPU > (nodeInfo.Allocatable.MilliCPU-nodeInfo.Requested.MilliCPU) {
|
||||
insufficientResources = append(insufficientResources, InsufficientResource{
|
||||
ResourceName: v1.ResourceCPU,
|
||||
Reason: "Insufficient cpu",
|
||||
Requested: podRequest.MilliCPU,
|
||||
Used: nodeInfo.Requested.MilliCPU,
|
||||
Capacity: nodeInfo.Allocatable.MilliCPU,
|
||||
})
|
||||
}
|
||||
if podRequest.Memory > 0 && podRequest.Memory > (nodeInfo.Allocatable.Memory-nodeInfo.Requested.Memory) {
|
||||
insufficientResources = append(insufficientResources, InsufficientResource{
|
||||
ResourceName: v1.ResourceMemory,
|
||||
Reason: "Insufficient memory",
|
||||
Requested: podRequest.Memory,
|
||||
Used: nodeInfo.Requested.Memory,
|
||||
Capacity: nodeInfo.Allocatable.Memory,
|
||||
})
|
||||
}
|
||||
if podRequest.EphemeralStorage > 0 &&
|
||||
podRequest.EphemeralStorage > (nodeInfo.Allocatable.EphemeralStorage-nodeInfo.Requested.EphemeralStorage) {
|
||||
insufficientResources = append(insufficientResources, InsufficientResource{
|
||||
ResourceName: v1.ResourceEphemeralStorage,
|
||||
Reason: "Insufficient ephemeral-storage",
|
||||
Requested: podRequest.EphemeralStorage,
|
||||
Used: nodeInfo.Requested.EphemeralStorage,
|
||||
Capacity: nodeInfo.Allocatable.EphemeralStorage,
|
||||
})
|
||||
}
|
||||
|
||||
for rName, rQuant := range podRequest.ScalarResources {
|
||||
// Skip in case request quantity is zero
|
||||
if rQuant == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if v1helper.IsExtendedResourceName(rName) {
|
||||
// If this resource is one of the extended resources that should be ignored, we will skip checking it.
|
||||
// rName is guaranteed to have a slash due to API validation.
|
||||
var rNamePrefix string
|
||||
if ignoredResourceGroups.Len() > 0 {
|
||||
rNamePrefix = strings.Split(string(rName), "/")[0]
|
||||
}
|
||||
if ignoredExtendedResources.Has(string(rName)) || ignoredResourceGroups.Has(rNamePrefix) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if rQuant > (nodeInfo.Allocatable.ScalarResources[rName] - nodeInfo.Requested.ScalarResources[rName]) {
|
||||
insufficientResources = append(insufficientResources, InsufficientResource{
|
||||
ResourceName: rName,
|
||||
Reason: fmt.Sprintf("Insufficient %v", rName),
|
||||
Requested: podRequest.ScalarResources[rName],
|
||||
Used: nodeInfo.Requested.ScalarResources[rName],
|
||||
Capacity: nodeInfo.Allocatable.ScalarResources[rName],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return insufficientResources
|
||||
}
|
||||
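As an editor's aside: each branch of fitsRequest reduces to the comparison "request > allocatable - alreadyRequested" for one resource. The framework-free sketch below reproduces that arithmetic with hypothetical numbers and struct names; it is an illustration, not the vendored implementation.

package main

import "fmt"

type simpleResources struct {
	MilliCPU int64
	Memory   int64
}

// fits reports which resources are insufficient, mirroring the shape of fitsRequest above.
func fits(req, allocatable, requested simpleResources) []string {
	var reasons []string
	if req.MilliCPU > allocatable.MilliCPU-requested.MilliCPU {
		reasons = append(reasons, "Insufficient cpu")
	}
	if req.Memory > allocatable.Memory-requested.Memory {
		reasons = append(reasons, "Insufficient memory")
	}
	return reasons
}

func main() {
	// Node with 4 CPU / 8Gi allocatable, of which 3.5 CPU / 2Gi is already requested.
	alloc := simpleResources{MilliCPU: 4000, Memory: 8 << 30}
	used := simpleResources{MilliCPU: 3500, Memory: 2 << 30}
	pod := simpleResources{MilliCPU: 1000, Memory: 1 << 30}
	fmt.Println(fits(pod, alloc, used)) // [Insufficient cpu]
}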
|
||||
// Score invoked at the Score extension point.
|
||||
func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := f.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
s, err := getPreScoreState(state)
|
||||
if err != nil {
|
||||
s = &preScoreState{
|
||||
podRequests: f.calculatePodResourceRequestList(pod, f.resources),
|
||||
}
|
||||
}
|
||||
|
||||
return f.score(ctx, pod, nodeInfo, s.podRequests)
|
||||
}
|
61
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/least_allocated.go
generated
vendored
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package noderesources
|
||||
|
||||
import (
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
)
|
||||
|
||||
// leastResourceScorer favors nodes with fewer requested resources.
|
||||
// It calculates the percentage of memory, CPU and other resources requested by pods scheduled on the node, and
|
||||
// prioritizes based on the minimum of the average of the fraction of requested to capacity.
|
||||
//
|
||||
// Details:
|
||||
// (cpu((capacity-requested)*MaxNodeScore*cpuWeight/capacity) + memory((capacity-requested)*MaxNodeScore*memoryWeight/capacity) + ...)/weightSum
|
||||
func leastResourceScorer(resources []config.ResourceSpec) func([]int64, []int64) int64 {
|
||||
return func(requested, allocable []int64) int64 {
|
||||
var nodeScore, weightSum int64
|
||||
for i := range requested {
|
||||
if allocable[i] == 0 {
|
||||
continue
|
||||
}
|
||||
weight := resources[i].Weight
|
||||
resourceScore := leastRequestedScore(requested[i], allocable[i])
|
||||
nodeScore += resourceScore * weight
|
||||
weightSum += weight
|
||||
}
|
||||
if weightSum == 0 {
|
||||
return 0
|
||||
}
|
||||
return nodeScore / weightSum
|
||||
}
|
||||
}
|
||||
|
||||
// The unused capacity is calculated on a scale of 0-MaxNodeScore
|
||||
// 0 being the lowest priority and `MaxNodeScore` being the highest.
|
||||
// The more unused resources the higher the score is.
|
||||
func leastRequestedScore(requested, capacity int64) int64 {
|
||||
if capacity == 0 {
|
||||
return 0
|
||||
}
|
||||
if requested > capacity {
|
||||
return 0
|
||||
}
|
||||
|
||||
return ((capacity - requested) * framework.MaxNodeScore) / capacity
|
||||
}
|
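A quick numeric illustration of leastRequestedScore: with the maximum score assumed here to be 100 (matching framework.MaxNodeScore), a node with 4000m CPU allocatable of which 3000m is requested scores (4000-3000)*100/4000 = 25. The standalone sketch below reproduces the formula without importing the scheduler framework.

package main

import "fmt"

const maxNodeScore = 100 // assumed to mirror framework.MaxNodeScore

// leastRequestedScore mirrors the function above: more free capacity => higher score.
func leastRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 || requested > capacity {
		return 0
	}
	return ((capacity - requested) * maxNodeScore) / capacity
}

func main() {
	fmt.Println(leastRequestedScore(3000, 4000)) // 25
	fmt.Println(leastRequestedScore(0, 4000))    // 100 (idle node)
}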
65
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/most_allocated.go
generated
vendored
Normal file
@ -0,0 +1,65 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package noderesources
|
||||
|
||||
import (
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
)
|
||||
|
||||
// mostResourceScorer favors nodes with most requested resources.
|
||||
// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
|
||||
// based on the maximum of the average of the fraction of requested to capacity.
|
||||
//
|
||||
// Details:
|
||||
// (cpu(MaxNodeScore * requested * cpuWeight / capacity) + memory(MaxNodeScore * requested * memoryWeight / capacity) + ...) / weightSum
|
||||
func mostResourceScorer(resources []config.ResourceSpec) func(requested, allocable []int64) int64 {
|
||||
return func(requested, allocable []int64) int64 {
|
||||
var nodeScore, weightSum int64
|
||||
for i := range requested {
|
||||
if allocable[i] == 0 {
|
||||
continue
|
||||
}
|
||||
weight := resources[i].Weight
|
||||
resourceScore := mostRequestedScore(requested[i], allocable[i])
|
||||
nodeScore += resourceScore * weight
|
||||
weightSum += weight
|
||||
}
|
||||
if weightSum == 0 {
|
||||
return 0
|
||||
}
|
||||
return nodeScore / weightSum
|
||||
}
|
||||
}
|
||||
|
||||
// The used capacity is calculated on a scale of 0-MaxNodeScore (MaxNodeScore is
|
||||
// constant with value set to 100).
|
||||
// 0 being the lowest priority and 100 being the highest.
|
||||
// The more resources are used the higher the score is. This function
|
||||
// is almost a reversed version of noderesources.leastRequestedScore.
|
||||
func mostRequestedScore(requested, capacity int64) int64 {
|
||||
if capacity == 0 {
|
||||
return 0
|
||||
}
|
||||
if requested > capacity {
|
||||
// `requested` might be greater than `capacity` because pods with no
|
||||
// requests get minimum values.
|
||||
requested = capacity
|
||||
}
|
||||
|
||||
return (requested * framework.MaxNodeScore) / capacity
|
||||
}
|
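The mirror-image calculation for the most-allocated strategy, again as a standalone sketch with the maximum score assumed to be 100: a node with 3000m of 4000m CPU requested scores 3000*100/4000 = 75, and requests above capacity are clamped before scoring.

package main

import "fmt"

const maxNodeScore = 100 // assumed to mirror framework.MaxNodeScore

// mostRequestedScore mirrors the function above: higher utilization => higher score.
func mostRequestedScore(requested, capacity int64) int64 {
	if capacity == 0 {
		return 0
	}
	if requested > capacity {
		requested = capacity // clamp, e.g. when pods without requests get default values
	}
	return (requested * maxNodeScore) / capacity
}

func main() {
	fmt.Println(mostRequestedScore(3000, 4000)) // 75
	fmt.Println(mostRequestedScore(5000, 4000)) // 100 (clamped)
}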
73
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/requested_to_capacity_ratio.go
generated
vendored
Normal file
@ -0,0 +1,73 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package noderesources
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
|
||||
)
|
||||
|
||||
const maxUtilization = 100
|
||||
|
||||
// buildRequestedToCapacityRatioScorerFunction allows users to apply bin packing
|
||||
// on core resources like CPU, Memory as well as extended resources like accelerators.
|
||||
func buildRequestedToCapacityRatioScorerFunction(scoringFunctionShape helper.FunctionShape, resources []config.ResourceSpec) func([]int64, []int64) int64 {
|
||||
rawScoringFunction := helper.BuildBrokenLinearFunction(scoringFunctionShape)
|
||||
resourceScoringFunction := func(requested, capacity int64) int64 {
|
||||
if capacity == 0 || requested > capacity {
|
||||
return rawScoringFunction(maxUtilization)
|
||||
}
|
||||
|
||||
return rawScoringFunction(requested * maxUtilization / capacity)
|
||||
}
|
||||
return func(requested, allocable []int64) int64 {
|
||||
var nodeScore, weightSum int64
|
||||
for i := range requested {
|
||||
if allocable[i] == 0 {
|
||||
continue
|
||||
}
|
||||
weight := resources[i].Weight
|
||||
resourceScore := resourceScoringFunction(requested[i], allocable[i])
|
||||
if resourceScore > 0 {
|
||||
nodeScore += resourceScore * weight
|
||||
weightSum += weight
|
||||
}
|
||||
}
|
||||
if weightSum == 0 {
|
||||
return 0
|
||||
}
|
||||
return int64(math.Round(float64(nodeScore) / float64(weightSum)))
|
||||
}
|
||||
}
|
||||
|
||||
func requestedToCapacityRatioScorer(resources []config.ResourceSpec, shape []config.UtilizationShapePoint) func([]int64, []int64) int64 {
|
||||
shapes := make([]helper.FunctionShapePoint, 0, len(shape))
|
||||
for _, point := range shape {
|
||||
shapes = append(shapes, helper.FunctionShapePoint{
|
||||
Utilization: int64(point.Utilization),
|
||||
// MaxCustomPriorityScore may diverge from the max score used in the scheduler and defined by MaxNodeScore,
|
||||
// therefore we need to scale the score returned by requested to capacity ratio to the score range
|
||||
// used by the scheduler.
|
||||
Score: int64(point.Score) * (framework.MaxNodeScore / config.MaxCustomPriorityScore),
|
||||
})
|
||||
}
|
||||
|
||||
return buildRequestedToCapacityRatioScorerFunction(shapes, resources)
|
||||
}
|
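helper.BuildBrokenLinearFunction (defined elsewhere in the vendored tree) interpolates linearly between the configured shape points. The sketch below is an illustrative re-implementation of that idea, not the vendored helper itself; the two-point shape shown gives classic bin-packing behaviour, where higher utilization yields a higher score.

package main

import "fmt"

type point struct{ utilization, score int64 }

// brokenLinear returns a function that linearly interpolates between shape points and
// clamps outside the first and last point. Illustrative only.
func brokenLinear(shape []point) func(int64) int64 {
	return func(u int64) int64 {
		if u <= shape[0].utilization {
			return shape[0].score
		}
		last := shape[len(shape)-1]
		if u >= last.utilization {
			return last.score
		}
		for i := 1; i < len(shape); i++ {
			if u <= shape[i].utilization {
				p, q := shape[i-1], shape[i]
				return p.score + (q.score-p.score)*(u-p.utilization)/(q.utilization-p.utilization)
			}
		}
		return last.score
	}
}

func main() {
	// Bin-packing shape: 0% utilization -> score 0, 100% utilization -> score 100.
	score := brokenLinear([]point{{0, 0}, {100, 100}})
	fmt.Println(score(30)) // 30
	fmt.Println(score(80)) // 80
}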
148
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/resource_allocation.go
generated
vendored
Normal file
@ -0,0 +1,148 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package noderesources
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
resourcehelper "k8s.io/component-helpers/resource"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// scorer is a decorator for resourceAllocationScorer.
|
||||
type scorer func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer
|
||||
|
||||
// resourceAllocationScorer contains information to calculate resource allocation score.
|
||||
type resourceAllocationScorer struct {
|
||||
Name string
|
||||
// used to decide whether to use Requested or NonZeroRequested for
|
||||
// cpu and memory.
|
||||
useRequested bool
|
||||
scorer func(requested, allocable []int64) int64
|
||||
resources []config.ResourceSpec
|
||||
}
|
||||
|
||||
// score will use `scorer` function to calculate the score.
|
||||
func (r *resourceAllocationScorer) score(
|
||||
ctx context.Context,
|
||||
pod *v1.Pod,
|
||||
nodeInfo *framework.NodeInfo,
|
||||
podRequests []int64) (int64, *framework.Status) {
|
||||
logger := klog.FromContext(ctx)
|
||||
node := nodeInfo.Node()
|
||||
|
||||
// No resources are configured for scoring; report it as an error.
|
||||
if len(r.resources) == 0 {
|
||||
return 0, framework.NewStatus(framework.Error, "resources not found")
|
||||
}
|
||||
|
||||
requested := make([]int64, len(r.resources))
|
||||
allocatable := make([]int64, len(r.resources))
|
||||
for i := range r.resources {
|
||||
alloc, req := r.calculateResourceAllocatableRequest(logger, nodeInfo, v1.ResourceName(r.resources[i].Name), podRequests[i])
|
||||
// Only fill the extended resource entry when it's non-zero.
|
||||
if alloc == 0 {
|
||||
continue
|
||||
}
|
||||
allocatable[i] = alloc
|
||||
requested[i] = req
|
||||
}
|
||||
|
||||
score := r.scorer(requested, allocatable)
|
||||
|
||||
if loggerV := logger.V(10); loggerV.Enabled() { // Serializing these maps is costly.
|
||||
loggerV.Info("Listed internal info for allocatable resources, requested resources and score", "pod",
|
||||
klog.KObj(pod), "node", klog.KObj(node), "resourceAllocationScorer", r.Name,
|
||||
"allocatableResource", allocatable, "requestedResource", requested, "resourceScore", score,
|
||||
)
|
||||
}
|
||||
|
||||
return score, nil
|
||||
}
|
||||
|
||||
// calculateResourceAllocatableRequest returns 2 parameters:
|
||||
// - 1st param: quantity of allocatable resource on the node.
|
||||
// - 2nd param: aggregated quantity of requested resource on the node.
|
||||
// Note: if it's an extended resource, and the pod doesn't request it, (0, 0) is returned.
|
||||
func (r *resourceAllocationScorer) calculateResourceAllocatableRequest(logger klog.Logger, nodeInfo *framework.NodeInfo, resource v1.ResourceName, podRequest int64) (int64, int64) {
|
||||
requested := nodeInfo.NonZeroRequested
|
||||
if r.useRequested {
|
||||
requested = nodeInfo.Requested
|
||||
}
|
||||
|
||||
// If it's an extended resource, and the pod doesn't request it. We return (0, 0)
|
||||
// as an implication to bypass scoring on this resource.
|
||||
if podRequest == 0 && schedutil.IsScalarResourceName(resource) {
|
||||
return 0, 0
|
||||
}
|
||||
switch resource {
|
||||
case v1.ResourceCPU:
|
||||
return nodeInfo.Allocatable.MilliCPU, (requested.MilliCPU + podRequest)
|
||||
case v1.ResourceMemory:
|
||||
return nodeInfo.Allocatable.Memory, (requested.Memory + podRequest)
|
||||
case v1.ResourceEphemeralStorage:
|
||||
return nodeInfo.Allocatable.EphemeralStorage, (nodeInfo.Requested.EphemeralStorage + podRequest)
|
||||
default:
|
||||
if _, exists := nodeInfo.Allocatable.ScalarResources[resource]; exists {
|
||||
return nodeInfo.Allocatable.ScalarResources[resource], (nodeInfo.Requested.ScalarResources[resource] + podRequest)
|
||||
}
|
||||
}
|
||||
logger.V(10).Info("Requested resource is omitted for node score calculation", "resourceName", resource)
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
// calculatePodResourceRequest returns the total non-zero requests. If Overhead is defined for the pod
|
||||
// the Overhead is added to the result.
|
||||
func (r *resourceAllocationScorer) calculatePodResourceRequest(pod *v1.Pod, resourceName v1.ResourceName) int64 {
|
||||
|
||||
opts := resourcehelper.PodResourcesOptions{
|
||||
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
|
||||
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
|
||||
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
|
||||
}
|
||||
|
||||
if !r.useRequested {
|
||||
opts.NonMissingContainerRequests = v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(schedutil.DefaultMilliCPURequest, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(schedutil.DefaultMemoryRequest, resource.DecimalSI),
|
||||
}
|
||||
}
|
||||
|
||||
requests := resourcehelper.PodRequests(pod, opts)
|
||||
|
||||
quantity := requests[resourceName]
|
||||
if resourceName == v1.ResourceCPU {
|
||||
return quantity.MilliValue()
|
||||
}
|
||||
return quantity.Value()
|
||||
}
|
||||
|
||||
func (r *resourceAllocationScorer) calculatePodResourceRequestList(pod *v1.Pod, resources []config.ResourceSpec) []int64 {
|
||||
podRequests := make([]int64, len(resources))
|
||||
for i := range resources {
|
||||
podRequests[i] = r.calculatePodResourceRequest(pod, v1.ResourceName(resources[i].Name))
|
||||
}
|
||||
return podRequests
|
||||
}
|
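One practical consequence of the NonMissingContainerRequests branch above: for the least/most-allocated scorers (useRequested == false), a container that declares no requests is still scored as if it asked for the scheduler's defaults, commonly 100m CPU and 200Mi memory (schedutil.DefaultMilliCPURequest / DefaultMemoryRequest). The standalone sketch below shows that summing behaviour for CPU; the default value is hard-coded here as an assumption.

package main

import "fmt"

// Assumed default, mirroring schedutil.DefaultMilliCPURequest.
const defaultMilliCPU = 100 // 100m

type containerRequest struct{ milliCPU int64 }

// podCPURequest sums container CPU requests, substituting the default for containers
// that declare none - the same effect NonMissingContainerRequests has above.
func podCPURequest(containers []containerRequest) int64 {
	var total int64
	for _, c := range containers {
		if c.milliCPU == 0 {
			total += defaultMilliCPU
			continue
		}
		total += c.milliCPU
	}
	return total
}

func main() {
	containers := []containerRequest{
		{milliCPU: 250}, // explicit request
		{},              // no request: scored as the 100m default
	}
	fmt.Println(podCPURequest(containers)) // 350
}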
57
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/test_util.go
generated
vendored
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package noderesources
|
||||
|
||||
import (
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/validation/field"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
)
|
||||
|
||||
var (
|
||||
ignoreBadValueDetail = cmpopts.IgnoreFields(field.Error{}, "BadValue", "Detail")
|
||||
defaultResources = []config.ResourceSpec{
|
||||
{Name: string(v1.ResourceCPU), Weight: 1},
|
||||
{Name: string(v1.ResourceMemory), Weight: 1},
|
||||
}
|
||||
extendedRes = "abc.com/xyz"
|
||||
extendedResourceSet = []config.ResourceSpec{
|
||||
{Name: string(v1.ResourceCPU), Weight: 1},
|
||||
{Name: string(v1.ResourceMemory), Weight: 1},
|
||||
{Name: extendedRes, Weight: 1},
|
||||
}
|
||||
)
|
||||
|
||||
func makeNode(node string, milliCPU, memory int64, extendedResource map[string]int64) *v1.Node {
|
||||
resourceList := make(map[v1.ResourceName]resource.Quantity)
|
||||
for res, quantity := range extendedResource {
|
||||
resourceList[v1.ResourceName(res)] = *resource.NewQuantity(quantity, resource.DecimalSI)
|
||||
}
|
||||
resourceList[v1.ResourceCPU] = *resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
|
||||
resourceList[v1.ResourceMemory] = *resource.NewQuantity(memory, resource.BinarySI)
|
||||
return &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{Name: node},
|
||||
Status: v1.NodeStatus{
|
||||
Capacity: resourceList,
|
||||
Allocatable: resourceList,
|
||||
},
|
||||
}
|
||||
}
|
154
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable/node_unschedulable.go
generated
vendored
Normal file
@ -0,0 +1,154 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nodeunschedulable
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
v1helper "k8s.io/component-helpers/scheduling/corev1"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// NodeUnschedulable plugin filters nodes that set node.Spec.Unschedulable=true unless
|
||||
// the pod tolerates {key=node.kubernetes.io/unschedulable, effect:NoSchedule} taint.
|
||||
type NodeUnschedulable struct {
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.FilterPlugin = &NodeUnschedulable{}
|
||||
var _ framework.EnqueueExtensions = &NodeUnschedulable{}
|
||||
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
const Name = names.NodeUnschedulable
|
||||
|
||||
const (
|
||||
// ErrReasonUnknownCondition is used for NodeUnknownCondition predicate error.
|
||||
ErrReasonUnknownCondition = "node(s) had unknown conditions"
|
||||
// ErrReasonUnschedulable is used for NodeUnschedulable predicate error.
|
||||
ErrReasonUnschedulable = "node(s) were unschedulable"
|
||||
)
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *NodeUnschedulable) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
if !pl.enableSchedulingQueueHint {
|
||||
return []framework.ClusterEventWithHint{
|
||||
// A note about UpdateNodeLabel event:
|
||||
// Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
// When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeLabel event.
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
|
||||
// When the QueueingHint feature is enabled,
|
||||
// the scheduling queue uses Pod/Update Queueing Hint
|
||||
// to determine whether a Pod's update makes the Pod schedulable or not.
|
||||
// https://github.com/kubernetes/kubernetes/pull/122234
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPodTolerationChange is invoked whenever a pod's toleration changed.
|
||||
func (pl *NodeUnschedulable) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
if pod.UID == modifiedPod.UID {
|
||||
// Note: we don't need to check whether oldPod tolerates the taint because:
|
||||
// - Taint can be added, but can't be modified nor removed.
|
||||
// - If the Pod already had the toleration, it wouldn't have been rejected by this plugin in the first place.
|
||||
// Meaning, this Pod has been rejected by this plugin, so it cannot have had the toleration yet.
|
||||
if v1helper.TolerationsTolerateTaint(modifiedPod.Spec.Tolerations, &v1.Taint{
|
||||
Key: v1.TaintNodeUnschedulable,
|
||||
Effect: v1.TaintEffectNoSchedule,
|
||||
}) {
|
||||
// This update makes the pod tolerate the unschedulable taint.
|
||||
logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
logger.V(5).Info("a new toleration is added for the unschedulable Pod, but it's an unrelated toleration", "pod", klog.KObj(modifiedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod))
|
||||
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterNodeChange is invoked for all node events reported by
|
||||
// an informer. It checks whether that change made a previously unschedulable
|
||||
// pod schedulable.
|
||||
func (pl *NodeUnschedulable) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
// We queue this Pod when -
|
||||
// 1. the node is updated from unschedulable to schedulable.
|
||||
// 2. the node is added and is schedulable.
|
||||
if (originalNode != nil && originalNode.Spec.Unschedulable && !modifiedNode.Spec.Unschedulable) ||
|
||||
(originalNode == nil && !modifiedNode.Spec.Unschedulable) {
|
||||
logger.V(5).Info("node was created or updated, pod may be schedulable now", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *NodeUnschedulable) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
func (pl *NodeUnschedulable) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
node := nodeInfo.Node()
|
||||
|
||||
if !node.Spec.Unschedulable {
|
||||
return nil
|
||||
}
|
||||
|
||||
// If the pod tolerates the unschedulable taint, it also tolerates `node.Spec.Unschedulable`.
|
||||
podToleratesUnschedulable := v1helper.TolerationsTolerateTaint(pod.Spec.Tolerations, &v1.Taint{
|
||||
Key: v1.TaintNodeUnschedulable,
|
||||
Effect: v1.TaintEffectNoSchedule,
|
||||
})
|
||||
if !podToleratesUnschedulable {
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonUnschedulable)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
return &NodeUnschedulable{enableSchedulingQueueHint: fts.EnableSchedulingQueueHint}, nil
|
||||
}
|
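To make the Filter logic concrete: a pod can still land on a cordoned node (spec.unschedulable: true) if it tolerates the node.kubernetes.io/unschedulable:NoSchedule taint. The sketch below uses the same helper the plugin calls above; the toleration shown is illustrative, not prescribed by the plugin.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	v1helper "k8s.io/component-helpers/scheduling/corev1"
)

func main() {
	// The taint the plugin checks against, identical to the one in Filter above.
	unschedulableTaint := &v1.Taint{
		Key:    v1.TaintNodeUnschedulable,
		Effect: v1.TaintEffectNoSchedule,
	}

	// A toleration that lets a pod (e.g. a node maintenance helper) schedule onto cordoned nodes.
	tolerations := []v1.Toleration{{
		Key:      v1.TaintNodeUnschedulable,
		Operator: v1.TolerationOpExists,
		Effect:   v1.TaintEffectNoSchedule,
	}}

	fmt.Println(v1helper.TolerationsTolerateTaint(tolerations, unschedulableTaint)) // true
}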
10
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits/OWNERS
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
# See the OWNERS docs at https://go.k8s.io/owners
|
||||
|
||||
approvers:
|
||||
- sig-storage-approvers
|
||||
- cofyc
|
||||
reviewers:
|
||||
- sig-storage-reviewers
|
||||
- cofyc
|
||||
labels:
|
||||
- sig/storage
|
539
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go
generated
vendored
Normal file
@ -0,0 +1,539 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nodevolumelimits
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
storagev1 "k8s.io/api/storage/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/util/rand"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
storagelisters "k8s.io/client-go/listers/storage/v1"
|
||||
ephemeral "k8s.io/component-helpers/storage/ephemeral"
|
||||
storagehelpers "k8s.io/component-helpers/storage/volume"
|
||||
csitrans "k8s.io/csi-translation-lib"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
const (
|
||||
// ErrReasonMaxVolumeCountExceeded is used for MaxVolumeCount predicate error.
|
||||
ErrReasonMaxVolumeCountExceeded = "node(s) exceed max volume count"
|
||||
)
|
||||
|
||||
// InTreeToCSITranslator contains methods required to check migratable status
|
||||
// and perform translations from InTree PV's to CSI
|
||||
type InTreeToCSITranslator interface {
|
||||
IsPVMigratable(pv *v1.PersistentVolume) bool
|
||||
IsInlineMigratable(vol *v1.Volume) bool
|
||||
IsMigratableIntreePluginByName(inTreePluginName string) bool
|
||||
GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
|
||||
GetCSINameFromInTreeName(pluginName string) (string, error)
|
||||
TranslateInTreePVToCSI(logger klog.Logger, pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
|
||||
TranslateInTreeInlineVolumeToCSI(logger klog.Logger, volume *v1.Volume, podNamespace string) (*v1.PersistentVolume, error)
|
||||
}
|
||||
|
||||
// CSILimits is a plugin that checks node volume limits.
|
||||
type CSILimits struct {
|
||||
csiNodeLister storagelisters.CSINodeLister
|
||||
pvLister corelisters.PersistentVolumeLister
|
||||
pvcLister corelisters.PersistentVolumeClaimLister
|
||||
scLister storagelisters.StorageClassLister
|
||||
vaLister storagelisters.VolumeAttachmentLister
|
||||
|
||||
randomVolumeIDPrefix string
|
||||
|
||||
translator InTreeToCSITranslator
|
||||
}
|
||||
|
||||
var _ framework.PreFilterPlugin = &CSILimits{}
|
||||
var _ framework.FilterPlugin = &CSILimits{}
|
||||
var _ framework.EnqueueExtensions = &CSILimits{}
|
||||
|
||||
// CSIName is the name of the plugin used in the plugin registry and configurations.
|
||||
const CSIName = names.NodeVolumeLimits
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *CSILimits) Name() string {
|
||||
return CSIName
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *CSILimits) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
return []framework.ClusterEventWithHint{
|
||||
// We don't register any `QueueingHintFn` intentionally
|
||||
// because any new CSINode could make pods that were rejected by CSI volumes schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add}},
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
|
||||
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}, QueueingHintFn: pl.isSchedulableAfterPVCAdded},
|
||||
{Event: framework.ClusterEvent{Resource: framework.VolumeAttachment, ActionType: framework.Delete}},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (pl *CSILimits) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
deletedPod, _, err := util.As[*v1.Pod](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPodDeleted: %w", err)
|
||||
}
|
||||
|
||||
if len(deletedPod.Spec.Volumes) == 0 {
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
if deletedPod.Spec.NodeName == "" {
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
for _, vol := range deletedPod.Spec.Volumes {
|
||||
if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(&vol) {
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
|
||||
logger.V(5).Info("The deleted pod does not impact the scheduling of the unscheduled pod", "deletedPod", klog.KObj(pod), "pod", klog.KObj(deletedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
func (pl *CSILimits) isSchedulableAfterPVCAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, addedPvc, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPVCAdded: %w", err)
|
||||
}
|
||||
|
||||
if addedPvc.Namespace != pod.Namespace {
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
for _, volumes := range pod.Spec.Volumes {
|
||||
var pvcName string
|
||||
switch {
|
||||
case volumes.PersistentVolumeClaim != nil:
|
||||
pvcName = volumes.PersistentVolumeClaim.ClaimName
|
||||
case volumes.Ephemeral != nil:
|
||||
pvcName = ephemeral.VolumeClaimName(pod, &volumes)
|
||||
default:
|
||||
// Volume is not using a PVC, ignore
|
||||
continue
|
||||
}
|
||||
|
||||
if pvcName == addedPvc.Name {
|
||||
logger.V(5).Info("PVC that is referred from the pod was created, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(addedPvc))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
|
||||
logger.V(5).Info("PVC irrelevant to the Pod was created, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(addedPvc))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// PreFilter invoked at the prefilter extension point
|
||||
//
|
||||
// If the pod doesn't have any of those volume types, we'll skip the Filter phase.
|
||||
func (pl *CSILimits) PreFilter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
volumes := pod.Spec.Volumes
|
||||
for i := range volumes {
|
||||
vol := &volumes[i]
|
||||
if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(vol) {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
// PreFilterExtensions returns prefilter extensions, pod add and remove.
|
||||
func (pl *CSILimits) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
func (pl *CSILimits) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
// If the new pod doesn't have any volume attached to it, the predicate will always be true
|
||||
if len(pod.Spec.Volumes) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
node := nodeInfo.Node()
|
||||
|
||||
logger := klog.FromContext(ctx)
|
||||
|
||||
csiNode, err := pl.csiNodeLister.Get(node.Name)
|
||||
if err != nil {
|
||||
// TODO: return the error once CSINode is created by default (2 releases)
|
||||
logger.V(5).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
|
||||
}
|
||||
|
||||
// Count CSI volumes from the new pod
|
||||
newVolumes := make(map[string]string)
|
||||
if err := pl.filterAttachableVolumes(logger, pod, csiNode, true /* new pod */, newVolumes); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
// PVC is not found. This Pod will never be schedulable until PVC is created.
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
|
||||
}
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
// If the pod doesn't have any new CSI volumes, the predicate will always be true
|
||||
if len(newVolumes) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// If the node doesn't have volume limits, the predicate will always be true
|
||||
nodeVolumeLimits := getVolumeLimits(csiNode)
|
||||
if len(nodeVolumeLimits) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Count CSI volumes from existing pods
|
||||
attachedVolumes := make(map[string]string)
|
||||
for _, existingPod := range nodeInfo.Pods {
|
||||
if err := pl.filterAttachableVolumes(logger, existingPod.Pod, csiNode, false /* existing pod */, attachedVolumes); err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
}
|
||||
|
||||
attachedVolumeCount := map[string]int{}
|
||||
for volumeUniqueName, driverName := range attachedVolumes {
|
||||
// Don't count a single volume used by multiple pods more than once
|
||||
delete(newVolumes, volumeUniqueName)
|
||||
attachedVolumeCount[driverName]++
|
||||
}
|
||||
|
||||
// Count CSI volumes from VolumeAttachments
|
||||
volumeAttachments, err := pl.getNodeVolumeAttachmentInfo(logger, node.Name)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
for volumeUniqueName, driverName := range volumeAttachments {
|
||||
// Avoid double-counting volumes already used by existing pods
|
||||
if _, exists := attachedVolumes[volumeUniqueName]; !exists {
|
||||
attachedVolumeCount[driverName]++
|
||||
}
|
||||
}
|
||||
|
||||
// Count the new volumes count per driver
|
||||
newVolumeCount := map[string]int{}
|
||||
for _, driverName := range newVolumes {
|
||||
newVolumeCount[driverName]++
|
||||
}
|
||||
|
||||
for driverName, count := range newVolumeCount {
|
||||
maxVolumeLimit, ok := nodeVolumeLimits[driverName]
|
||||
if ok {
|
||||
currentVolumeCount := attachedVolumeCount[driverName]
|
||||
logger.V(5).Info("Found plugin volume limits", "node", node.Name, "driverName", driverName,
|
||||
"maxLimits", maxVolumeLimit, "currentVolumeCount", currentVolumeCount, "newVolumeCount", count,
|
||||
"pod", klog.KObj(pod))
|
||||
if currentVolumeCount+count > int(maxVolumeLimit) {
|
||||
return framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
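The counting in Filter can be summarised as: build "driverName/volumeHandle" keys so a volume shared by several pods is only counted once, then compare the per-driver totals (existing attachments plus the new pod's volumes) against the CSINode limit. A framework-free sketch with made-up driver and volume names:

package main

import "fmt"

func main() {
	// Volumes already attached via existing pods / VolumeAttachments, keyed by
	// "driver/volumeHandle" so duplicates collapse, valued by driver name.
	attached := map[string]string{
		"ebs.csi.example.com/vol-1": "ebs.csi.example.com",
		"ebs.csi.example.com/vol-2": "ebs.csi.example.com",
	}
	// Volumes the incoming pod would add.
	newVolumes := map[string]string{
		"ebs.csi.example.com/vol-2": "ebs.csi.example.com", // already attached: not counted twice
		"ebs.csi.example.com/vol-3": "ebs.csi.example.com",
	}
	limits := map[string]int{"ebs.csi.example.com": 3} // from CSINode allocatable count

	attachedCount := map[string]int{}
	for key, driver := range attached {
		delete(newVolumes, key) // shared volume: only count it once
		attachedCount[driver]++
	}
	newCount := map[string]int{}
	for _, driver := range newVolumes {
		newCount[driver]++
	}

	for driver, n := range newCount {
		if limit, ok := limits[driver]; ok && attachedCount[driver]+n > limit {
			fmt.Println("unschedulable:", driver, "would exceed its volume limit")
			return
		}
	}
	fmt.Println("fits") // 2 attached + 1 new = 3, which is within the limit of 3
}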
|
||||
// filterAttachableVolumes filters the attachable volumes from the pod and adds them to the result map.
|
||||
// The result map is a map of volumeUniqueName to driver name. The volumeUniqueName is a unique name for
|
||||
// the volume in the format of "driverName/volumeHandle". And driver name is the CSI driver name.
|
||||
func (pl *CSILimits) filterAttachableVolumes(
|
||||
logger klog.Logger, pod *v1.Pod, csiNode *storagev1.CSINode, newPod bool, result map[string]string) error {
|
||||
for _, vol := range pod.Spec.Volumes {
|
||||
pvcName := ""
|
||||
isEphemeral := false
|
||||
switch {
|
||||
case vol.PersistentVolumeClaim != nil:
|
||||
// A normal CSI volume can only be used through a PVC.
|
||||
pvcName = vol.PersistentVolumeClaim.ClaimName
|
||||
case vol.Ephemeral != nil:
|
||||
// Generic ephemeral inline volumes also use a PVC,
|
||||
// just with a computed name and certain ownership.
|
||||
// That is checked below once the pvc object is
|
||||
// retrieved.
|
||||
pvcName = ephemeral.VolumeClaimName(pod, &vol)
|
||||
isEphemeral = true
|
||||
default:
|
||||
// Inline Volume does not have PVC.
|
||||
// Need to check if CSI migration is enabled for this inline volume.
|
||||
// - If the volume is migratable and CSI migration is enabled, need to count it
|
||||
// as well.
|
||||
// - If the volume is not migratable, it will be counted by the non-CSI filter.
|
||||
if err := pl.checkAttachableInlineVolume(logger, &vol, csiNode, pod, result); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
if pvcName == "" {
|
||||
return fmt.Errorf("PersistentVolumeClaim had no name")
|
||||
}
|
||||
|
||||
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
|
||||
|
||||
if err != nil {
|
||||
if newPod {
|
||||
// The PVC is required to proceed with
|
||||
// scheduling of a new pod because it cannot
|
||||
// run without it. Bail out immediately.
|
||||
return fmt.Errorf("looking up PVC %s/%s: %w", pod.Namespace, pvcName, err)
|
||||
}
|
||||
// If the PVC is invalid, we don't count the volume because
|
||||
// there's no guarantee that it belongs to the running predicate.
|
||||
logger.V(5).Info("Unable to look up PVC info", "pod", klog.KObj(pod), "PVC", klog.KRef(pod.Namespace, pvcName))
|
||||
continue
|
||||
}
|
||||
|
||||
// The PVC for an ephemeral volume must be owned by the pod.
|
||||
if isEphemeral {
|
||||
if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
driverName, volumeHandle := pl.getCSIDriverInfo(logger, csiNode, pvc)
|
||||
if driverName == "" || volumeHandle == "" {
|
||||
logger.V(5).Info("Could not find a CSI driver name or volume handle, not counting volume")
|
||||
continue
|
||||
}
|
||||
|
||||
volumeUniqueName := getVolumeUniqueName(driverName, volumeHandle)
|
||||
result[volumeUniqueName] = driverName
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// checkAttachableInlineVolume takes an inline volume and add to the result map if the
|
||||
// volume is migratable and CSI migration for this plugin has been enabled.
|
||||
func (pl *CSILimits) checkAttachableInlineVolume(logger klog.Logger, vol *v1.Volume, csiNode *storagev1.CSINode,
|
||||
pod *v1.Pod, result map[string]string) error {
|
||||
if !pl.translator.IsInlineMigratable(vol) {
|
||||
return nil
|
||||
}
|
||||
// Check if the intree provisioner CSI migration has been enabled.
|
||||
inTreeProvisionerName, err := pl.translator.GetInTreePluginNameFromSpec(nil, vol)
|
||||
if err != nil {
|
||||
return fmt.Errorf("looking up provisioner name for volume %s: %w", vol.Name, err)
|
||||
}
|
||||
if !isCSIMigrationOn(csiNode, inTreeProvisionerName) {
|
||||
csiNodeName := ""
|
||||
if csiNode != nil {
|
||||
csiNodeName = csiNode.Name
|
||||
}
|
||||
logger.V(5).Info("CSI Migration is not enabled for provisioner", "provisioner", inTreeProvisionerName,
|
||||
"pod", klog.KObj(pod), "csiNode", csiNodeName)
|
||||
return nil
|
||||
}
|
||||
// Do translation for the in-tree volume.
|
||||
translatedPV, err := pl.translator.TranslateInTreeInlineVolumeToCSI(logger, vol, pod.Namespace)
|
||||
if err != nil || translatedPV == nil {
|
||||
return fmt.Errorf("converting volume(%s) from inline to csi: %w", vol.Name, err)
|
||||
}
|
||||
driverName, err := pl.translator.GetCSINameFromInTreeName(inTreeProvisionerName)
|
||||
if err != nil {
|
||||
return fmt.Errorf("looking up CSI driver name for provisioner %s: %w", inTreeProvisionerName, err)
|
||||
}
|
||||
// TranslateInTreeInlineVolumeToCSI should translate inline volume to CSI. If it is not set,
|
||||
// the volume does not support inline. Skip the count.
|
||||
if translatedPV.Spec.PersistentVolumeSource.CSI == nil {
|
||||
return nil
|
||||
}
|
||||
volumeUniqueName := getVolumeUniqueName(driverName, translatedPV.Spec.PersistentVolumeSource.CSI.VolumeHandle)
|
||||
result[volumeUniqueName] = driverName
|
||||
return nil
|
||||
}
|
||||
|
||||
// getCSIDriverInfo returns the CSI driver name and volume ID of a given PVC.
|
||||
// If the PVC is from a migrated in-tree plugin, this function will return
|
||||
// the information of the CSI driver that the plugin has been migrated to.
|
||||
func (pl *CSILimits) getCSIDriverInfo(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
|
||||
pvName := pvc.Spec.VolumeName
|
||||
|
||||
if pvName == "" {
|
||||
logger.V(5).Info("Persistent volume had no name for claim", "PVC", klog.KObj(pvc))
|
||||
return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
|
||||
}
|
||||
|
||||
pv, err := pl.pvLister.Get(pvName)
|
||||
if err != nil {
|
||||
logger.V(5).Info("Unable to look up PV info for PVC and PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvName))
|
||||
// If we can't fetch the PV associated with the PVC, it may have been deleted,
|
||||
// or the PVC may have been prebound to a PV that hasn't been created yet.
|
||||
// Fall back to using the StorageClass for volume counting.
|
||||
return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
|
||||
}
|
||||
|
||||
csiSource := pv.Spec.PersistentVolumeSource.CSI
|
||||
if csiSource == nil {
|
||||
// We make a fast path for non-CSI volumes that aren't migratable
|
||||
if !pl.translator.IsPVMigratable(pv) {
|
||||
return "", ""
|
||||
}
|
||||
|
||||
pluginName, err := pl.translator.GetInTreePluginNameFromSpec(pv, nil)
|
||||
if err != nil {
|
||||
logger.V(5).Info("Unable to look up plugin name from PV spec", "err", err)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
if !isCSIMigrationOn(csiNode, pluginName) {
|
||||
logger.V(5).Info("CSI Migration of plugin is not enabled", "plugin", pluginName)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
csiPV, err := pl.translator.TranslateInTreePVToCSI(logger, pv)
|
||||
if err != nil {
|
||||
logger.V(5).Info("Unable to translate in-tree volume to CSI", "err", err)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
if csiPV.Spec.PersistentVolumeSource.CSI == nil {
|
||||
logger.V(5).Info("Unable to get a valid volume source for translated PV", "PV", pvName)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
csiSource = csiPV.Spec.PersistentVolumeSource.CSI
|
||||
}
|
||||
|
||||
return csiSource.Driver, csiSource.VolumeHandle
|
||||
}
|
||||
|
||||
// getCSIDriverInfoFromSC returns the CSI driver name and a random volume ID of a given PVC's StorageClass.
|
||||
func (pl *CSILimits) getCSIDriverInfoFromSC(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
|
||||
namespace := pvc.Namespace
|
||||
pvcName := pvc.Name
|
||||
scName := storagehelpers.GetPersistentVolumeClaimClass(pvc)
|
||||
|
||||
// If StorageClass is not set or not found, then PVC must be using immediate binding mode
|
||||
// and hence it must be bound before scheduling. So it is safe to not count it.
|
||||
if scName == "" {
|
||||
logger.V(5).Info("PVC has no StorageClass", "PVC", klog.KObj(pvc))
|
||||
return "", ""
|
||||
}
|
||||
|
||||
storageClass, err := pl.scLister.Get(scName)
|
||||
if err != nil {
|
||||
logger.V(5).Info("Could not get StorageClass for PVC", "PVC", klog.KObj(pvc), "err", err)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
// We use a random prefix to avoid conflicts with real volume IDs. If the PVC is bound during the execution of the
|
||||
// predicate and another pod on the same node uses the same volume, then we will overcount
|
||||
// the volume and consider both volumes as different.
|
||||
volumeHandle := fmt.Sprintf("%s-%s/%s", pl.randomVolumeIDPrefix, namespace, pvcName)
|
||||
|
||||
provisioner := storageClass.Provisioner
|
||||
if pl.translator.IsMigratableIntreePluginByName(provisioner) {
|
||||
if !isCSIMigrationOn(csiNode, provisioner) {
|
||||
logger.V(5).Info("CSI Migration of provisioner is not enabled", "provisioner", provisioner)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
driverName, err := pl.translator.GetCSINameFromInTreeName(provisioner)
|
||||
if err != nil {
|
||||
logger.V(5).Info("Unable to look up driver name from provisioner name", "provisioner", provisioner, "err", err)
|
||||
return "", ""
|
||||
}
|
||||
return driverName, volumeHandle
|
||||
}
|
||||
|
||||
return provisioner, volumeHandle
|
||||
}
|
||||
|
||||
// NewCSI initializes a new plugin and returns it.
|
||||
func NewCSI(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
informerFactory := handle.SharedInformerFactory()
|
||||
pvLister := informerFactory.Core().V1().PersistentVolumes().Lister()
|
||||
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
|
||||
csiNodesLister := informerFactory.Storage().V1().CSINodes().Lister()
|
||||
scLister := informerFactory.Storage().V1().StorageClasses().Lister()
|
||||
vaLister := informerFactory.Storage().V1().VolumeAttachments().Lister()
|
||||
csiTranslator := csitrans.New()
|
||||
|
||||
return &CSILimits{
|
||||
csiNodeLister: csiNodesLister,
|
||||
pvLister: pvLister,
|
||||
pvcLister: pvcLister,
|
||||
scLister: scLister,
|
||||
vaLister: vaLister,
|
||||
randomVolumeIDPrefix: rand.String(32),
|
||||
translator: csiTranslator,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// getVolumeLimits reads the volume limits from CSINode object and returns a map of volume limits.
|
||||
// The key is the driver name and the value is the maximum number of volumes that can be attached to the node.
|
||||
// If a key is not found in the map, it means there is no limit for the driver on the node.
|
||||
func getVolumeLimits(csiNode *storagev1.CSINode) map[string]int64 {
|
||||
nodeVolumeLimits := make(map[string]int64)
|
||||
if csiNode == nil {
|
||||
return nodeVolumeLimits
|
||||
}
|
||||
for _, d := range csiNode.Spec.Drivers {
|
||||
if d.Allocatable != nil && d.Allocatable.Count != nil {
|
||||
nodeVolumeLimits[d.Name] = int64(*d.Allocatable.Count)
|
||||
}
|
||||
}
|
||||
return nodeVolumeLimits
|
||||
}
|
||||
|
||||
// getNodeVolumeAttachmentInfo returns a map of volumeID to driver name for the given node.
|
||||
func (pl *CSILimits) getNodeVolumeAttachmentInfo(logger klog.Logger, nodeName string) (map[string]string, error) {
|
||||
volumeAttachments := make(map[string]string)
|
||||
vas, err := pl.vaLister.List(labels.Everything())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, va := range vas {
|
||||
if va.Spec.NodeName == nodeName {
|
||||
if va.Spec.Attacher == "" {
|
||||
logger.V(5).Info("VolumeAttachment has no attacher", "VolumeAttachment", klog.KObj(va))
|
||||
continue
|
||||
}
|
||||
if va.Spec.Source.PersistentVolumeName == nil {
|
||||
logger.V(5).Info("VolumeAttachment has no PV name", "VolumeAttachment", klog.KObj(va))
|
||||
continue
|
||||
}
|
||||
pv, err := pl.pvLister.Get(*va.Spec.Source.PersistentVolumeName)
|
||||
if err != nil {
|
||||
logger.V(5).Info("Unable to get PV for VolumeAttachment", "VolumeAttachment", klog.KObj(va), "err", err)
|
||||
continue
|
||||
}
|
||||
if pv.Spec.CSI == nil {
|
||||
logger.V(5).Info("PV is not a CSI volume", "PV", klog.KObj(pv))
|
||||
continue
|
||||
}
|
||||
volumeID := getVolumeUniqueName(va.Spec.Attacher, pv.Spec.CSI.VolumeHandle)
|
||||
volumeAttachments[volumeID] = va.Spec.Attacher
|
||||
}
|
||||
}
|
||||
return volumeAttachments, nil
|
||||
}
|
||||
|
||||
func getVolumeUniqueName(driverName, volumeHandle string) string {
|
||||
return fmt.Sprintf("%s/%s", driverName, volumeHandle)
|
||||
}
|
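For completeness, the CSINode object that getVolumeLimits reads looks roughly like the sketch below (the driver name and count are made up); ptr.To comes from k8s.io/utils/ptr, which the vendored tree already uses elsewhere. The extraction loop repeats the logic of getVolumeLimits, since that function is unexported.

package main

import (
	"fmt"

	storagev1 "k8s.io/api/storage/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"
)

func main() {
	csiNode := &storagev1.CSINode{
		ObjectMeta: metav1.ObjectMeta{Name: "node-1"},
		Spec: storagev1.CSINodeSpec{
			Drivers: []storagev1.CSINodeDriver{{
				Name:   "ebs.csi.example.com",
				NodeID: "node-1",
				// Allocatable.Count is the per-node attach limit the Filter enforces.
				Allocatable: &storagev1.VolumeNodeResources{Count: ptr.To[int32](8)},
			}},
		},
	}

	// Same extraction as getVolumeLimits above.
	limits := map[string]int64{}
	for _, d := range csiNode.Spec.Drivers {
		if d.Allocatable != nil && d.Allocatable.Count != nil {
			limits[d.Name] = int64(*d.Allocatable.Count)
		}
	}
	fmt.Println(limits) // map[ebs.csi.example.com:8]
}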
73
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits/utils.go
generated
vendored
Normal file
@ -0,0 +1,73 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nodevolumelimits
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
storagev1 "k8s.io/api/storage/v1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
csilibplugins "k8s.io/csi-translation-lib/plugins"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
)
|
||||
|
||||
// isCSIMigrationOn returns a boolean value indicating whether
|
||||
// the CSI migration has been enabled for a particular storage plugin.
|
||||
func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string) bool {
|
||||
if csiNode == nil || len(pluginName) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// In-tree storage to CSI driver migration feature should be enabled,
|
||||
// along with the plugin-specific one
|
||||
switch pluginName {
|
||||
case csilibplugins.AWSEBSInTreePluginName:
|
||||
return true
|
||||
case csilibplugins.PortworxVolumePluginName:
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx) {
|
||||
return false
|
||||
}
|
||||
case csilibplugins.GCEPDInTreePluginName:
|
||||
return true
|
||||
case csilibplugins.AzureDiskInTreePluginName:
|
||||
return true
|
||||
case csilibplugins.CinderInTreePluginName:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
|
||||
// The plugin name should be listed in the CSINode object annotation.
|
||||
// This indicates that the plugin has been migrated to a CSI driver in the node.
|
||||
csiNodeAnn := csiNode.GetAnnotations()
|
||||
if csiNodeAnn == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var mpaSet sets.Set[string]
|
||||
mpa := csiNodeAnn[v1.MigratedPluginsAnnotationKey]
|
||||
if len(mpa) == 0 {
|
||||
mpaSet = sets.New[string]()
|
||||
} else {
|
||||
tok := strings.Split(mpa, ",")
|
||||
mpaSet = sets.New(tok...)
|
||||
}
|
||||
|
||||
return mpaSet.Has(pluginName)
|
||||
}
|
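In practice the decision above hinges on the v1.MigratedPluginsAnnotationKey annotation that the kubelet sets on the CSINode object. A small sketch of the parse-and-check step, using the same sets helper the function imports; the annotation value shown is hypothetical.

package main

import (
	"fmt"
	"strings"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
)

func main() {
	// Annotation as it might appear on a CSINode object (value is illustrative).
	annotations := map[string]string{
		v1.MigratedPluginsAnnotationKey: "kubernetes.io/aws-ebs,kubernetes.io/gce-pd",
	}

	// Same parse as isCSIMigrationOn above: split the comma-separated list into a set.
	mpaSet := sets.New[string]()
	if mpa := annotations[v1.MigratedPluginsAnnotationKey]; len(mpa) > 0 {
		mpaSet = sets.New(strings.Split(mpa, ",")...)
	}

	fmt.Println(mpaSet.Has("kubernetes.io/aws-ebs"))          // true
	fmt.Println(mpaSet.Has("kubernetes.io/portworx-volume"))  // false
}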
174
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/common.go
generated
vendored
Normal file
@ -0,0 +1,174 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package podtopologyspread
|
||||
|
||||
import (
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
v1helper "k8s.io/component-helpers/scheduling/corev1"
|
||||
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
|
||||
"k8s.io/utils/ptr"
|
||||
)
|
||||
|
||||
type topologyPair struct {
|
||||
key string
|
||||
value string
|
||||
}
|
||||
|
||||
// topologySpreadConstraint is an internal version of v1.TopologySpreadConstraint
// in which the label selector has already been parsed.
|
||||
// Fields are exported for comparison during testing.
|
||||
type topologySpreadConstraint struct {
|
||||
MaxSkew int32
|
||||
TopologyKey string
|
||||
Selector labels.Selector
|
||||
MinDomains int32
|
||||
NodeAffinityPolicy v1.NodeInclusionPolicy
|
||||
NodeTaintsPolicy v1.NodeInclusionPolicy
|
||||
}
|
||||
|
||||
func (tsc *topologySpreadConstraint) matchNodeInclusionPolicies(pod *v1.Pod, node *v1.Node, require nodeaffinity.RequiredNodeAffinity) bool {
|
||||
if tsc.NodeAffinityPolicy == v1.NodeInclusionPolicyHonor {
|
||||
// We ignore parsing errors here for backwards compatibility.
|
||||
if match, _ := require.Match(node); !match {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if tsc.NodeTaintsPolicy == v1.NodeInclusionPolicyHonor {
|
||||
if _, untolerated := v1helper.FindMatchingUntoleratedTaint(node.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc()); untolerated {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// buildDefaultConstraints builds the constraints for a pod using
|
||||
// .DefaultConstraints and the selectors from the services, replication
|
||||
// controllers, replica sets and stateful sets that match the pod.
|
||||
func (pl *PodTopologySpread) buildDefaultConstraints(p *v1.Pod, action v1.UnsatisfiableConstraintAction) ([]topologySpreadConstraint, error) {
|
||||
constraints, err := pl.filterTopologySpreadConstraints(pl.defaultConstraints, p.Labels, action)
|
||||
if err != nil || len(constraints) == 0 {
|
||||
return nil, err
|
||||
}
|
||||
selector := helper.DefaultSelector(p, pl.services, pl.replicationCtrls, pl.replicaSets, pl.statefulSets)
|
||||
if selector.Empty() {
|
||||
return nil, nil
|
||||
}
|
||||
for i := range constraints {
|
||||
constraints[i].Selector = selector
|
||||
}
|
||||
return constraints, nil
|
||||
}
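
// For illustration (hypothetical owner and selector): with the system default
// constraints and action v1.ScheduleAnyway, a pod owned by a ReplicaSet whose pods are
// selected by "app=web" gets both default constraints back with Selector set to
// "app=web"; a pod that matches no service, replication controller, replica set or
// stateful set yields an empty selector, and buildDefaultConstraints returns nil so no
// default spreading applies.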
|
||||
|
||||
// nodeLabelsMatchSpreadConstraints checks if ALL topology keys in spread Constraints are present in node labels.
|
||||
func nodeLabelsMatchSpreadConstraints(nodeLabels map[string]string, constraints []topologySpreadConstraint) bool {
|
||||
for _, c := range constraints {
|
||||
if _, ok := nodeLabels[c.TopologyKey]; !ok {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (pl *PodTopologySpread) filterTopologySpreadConstraints(constraints []v1.TopologySpreadConstraint, podLabels map[string]string, action v1.UnsatisfiableConstraintAction) ([]topologySpreadConstraint, error) {
|
||||
var result []topologySpreadConstraint
|
||||
for _, c := range constraints {
|
||||
if c.WhenUnsatisfiable == action {
|
||||
selector, err := metav1.LabelSelectorAsSelector(c.LabelSelector)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if pl.enableMatchLabelKeysInPodTopologySpread && len(c.MatchLabelKeys) > 0 {
|
||||
matchLabels := make(labels.Set)
|
||||
for _, labelKey := range c.MatchLabelKeys {
|
||||
if value, ok := podLabels[labelKey]; ok {
|
||||
matchLabels[labelKey] = value
|
||||
}
|
||||
}
|
||||
if len(matchLabels) > 0 {
|
||||
selector = mergeLabelSetWithSelector(matchLabels, selector)
|
||||
}
|
||||
}
|
||||
|
||||
tsc := topologySpreadConstraint{
|
||||
MaxSkew: c.MaxSkew,
|
||||
TopologyKey: c.TopologyKey,
|
||||
Selector: selector,
|
||||
MinDomains: ptr.Deref(c.MinDomains, 1), // If MinDomains is nil, we treat MinDomains as 1.
|
||||
NodeAffinityPolicy: v1.NodeInclusionPolicyHonor, // If NodeAffinityPolicy is nil, we treat NodeAffinityPolicy as "Honor".
|
||||
NodeTaintsPolicy: v1.NodeInclusionPolicyIgnore, // If NodeTaintsPolicy is nil, we treat NodeTaintsPolicy as "Ignore".
|
||||
}
|
||||
if pl.enableNodeInclusionPolicyInPodTopologySpread {
|
||||
if c.NodeAffinityPolicy != nil {
|
||||
tsc.NodeAffinityPolicy = *c.NodeAffinityPolicy
|
||||
}
|
||||
if c.NodeTaintsPolicy != nil {
|
||||
tsc.NodeTaintsPolicy = *c.NodeTaintsPolicy
|
||||
}
|
||||
}
|
||||
result = append(result, tsc)
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func mergeLabelSetWithSelector(matchLabels labels.Set, s labels.Selector) labels.Selector {
|
||||
mergedSelector := labels.SelectorFromSet(matchLabels)
|
||||
|
||||
requirements, ok := s.Requirements()
|
||||
if !ok {
|
||||
return s
|
||||
}
|
||||
|
||||
for _, r := range requirements {
|
||||
mergedSelector = mergedSelector.Add(r)
|
||||
}
|
||||
|
||||
return mergedSelector
|
||||
}
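
// For illustration (hypothetical labels): merging matchLabels
// {"pod-template-hash": "abc123"} with a parsed selector "app=web" produces a selector
// equivalent to "app=web,pod-template-hash=abc123", i.e. the matchLabelKeys values are
// AND-ed onto the constraint's own label selector.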
|
||||
|
||||
func countPodsMatchSelector(podInfos []*framework.PodInfo, selector labels.Selector, ns string) int {
|
||||
if selector.Empty() {
|
||||
return 0
|
||||
}
|
||||
count := 0
|
||||
for _, p := range podInfos {
|
||||
// Bypass terminating Pod (see #87621).
|
||||
if p.Pod.DeletionTimestamp != nil || p.Pod.Namespace != ns {
|
||||
continue
|
||||
}
|
||||
if selector.Matches(labels.Set(p.Pod.Labels)) {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// podLabelsMatchSpreadConstraints returns whether the labels match the selector of any of the topologySpreadConstraints.
|
||||
func podLabelsMatchSpreadConstraints(constraints []topologySpreadConstraint, labels labels.Set) bool {
|
||||
for _, c := range constraints {
|
||||
if c.Selector.Matches(labels) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
371
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/filtering.go
generated
vendored
Normal file
@ -0,0 +1,371 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package podtopologyspread
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
)
|
||||
|
||||
const preFilterStateKey = "PreFilter" + Name
|
||||
|
||||
// preFilterState computed at PreFilter and used at Filter.
|
||||
// It combines TpKeyToCriticalPaths and TpPairToMatchNum to represent:
|
||||
// (1) critical paths where the fewest pods are matched for each spread constraint.
|
||||
// (2) number of pods matched on each spread constraint.
|
||||
// A nil preFilterState denotes it's not set at all (in PreFilter phase);
|
||||
// An empty preFilterState object denotes it's a legit state and is set in PreFilter phase.
|
||||
// Fields are exported for comparison during testing.
|
||||
type preFilterState struct {
|
||||
Constraints []topologySpreadConstraint
|
||||
// We record 2 critical paths instead of all critical paths here.
|
||||
// criticalPaths[0].MatchNum always holds the minimum matching number.
|
||||
// criticalPaths[1].MatchNum is always greater or equal to criticalPaths[0].MatchNum, but
|
||||
// it's not guaranteed to be the 2nd minimum match number.
|
||||
TpKeyToCriticalPaths map[string]*criticalPaths
|
||||
// TpKeyToDomainsNum is keyed with topologyKey, and valued with the number of domains.
|
||||
TpKeyToDomainsNum map[string]int
|
||||
// TpPairToMatchNum is keyed with topologyPair, and valued with the number of matching pods.
|
||||
TpPairToMatchNum map[topologyPair]int
|
||||
}
|
||||
|
||||
// minMatchNum returns the global minimum for the calculation of skew while taking MinDomains into account.
|
||||
func (s *preFilterState) minMatchNum(tpKey string, minDomains int32) (int, error) {
|
||||
paths, ok := s.TpKeyToCriticalPaths[tpKey]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("failed to retrieve path by topology key")
|
||||
}
|
||||
|
||||
minMatchNum := paths[0].MatchNum
|
||||
domainsNum, ok := s.TpKeyToDomainsNum[tpKey]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("failed to retrieve the number of domains by topology key")
|
||||
}
|
||||
|
||||
if domainsNum < int(minDomains) {
|
||||
// When the number of eligible domains with matching topology keys is less than `minDomains`,
|
||||
// it treats "global minimum" as 0.
|
||||
minMatchNum = 0
|
||||
}
|
||||
|
||||
return minMatchNum, nil
|
||||
}
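
// For illustration (hypothetical numbers): with minDomains=3 but only 2 domains
// recorded for the topology key, minMatchNum returns 0 even if the least-loaded
// existing domain already has 5 matching pods; skew is then computed against a global
// minimum of 0 until a third eligible domain appears.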
|
||||
|
||||
// Clone makes a copy of the given state.
|
||||
func (s *preFilterState) Clone() framework.StateData {
|
||||
if s == nil {
|
||||
return nil
|
||||
}
|
||||
copy := preFilterState{
|
||||
// Constraints are shared because they don't change.
|
||||
Constraints: s.Constraints,
|
||||
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(s.TpKeyToCriticalPaths)),
|
||||
// The number of domains does not change as a result of AddPod/RemovePod methods on PreFilter Extensions
|
||||
TpKeyToDomainsNum: s.TpKeyToDomainsNum,
|
||||
TpPairToMatchNum: make(map[topologyPair]int, len(s.TpPairToMatchNum)),
|
||||
}
|
||||
for tpKey, paths := range s.TpKeyToCriticalPaths {
|
||||
copy.TpKeyToCriticalPaths[tpKey] = &criticalPaths{paths[0], paths[1]}
|
||||
}
|
||||
for tpPair, matchNum := range s.TpPairToMatchNum {
|
||||
copy.TpPairToMatchNum[tpPair] = matchNum
|
||||
}
|
||||
return ©
|
||||
}
|
||||
|
||||
// CAVEAT: the reason that `[2]criticalPath` can work is based on the implementation of current
|
||||
// preemption algorithm, in particular the following 2 facts:
|
||||
// Fact 1: we only preempt pods on the same node, instead of pods on multiple nodes.
|
||||
// Fact 2: each node is evaluated on a separate copy of the preFilterState during its preemption cycle.
|
||||
// If we plan to turn to a more complex algorithm like "arbitrary pods on multiple nodes", this
|
||||
// structure needs to be revisited.
|
||||
// Fields are exported for comparison during testing.
|
||||
type criticalPaths [2]struct {
|
||||
// TopologyValue denotes the topology value mapping to topology key.
|
||||
TopologyValue string
|
||||
// MatchNum denotes the number of matching pods.
|
||||
MatchNum int
|
||||
}
|
||||
|
||||
func newCriticalPaths() *criticalPaths {
|
||||
return &criticalPaths{{MatchNum: math.MaxInt32}, {MatchNum: math.MaxInt32}}
|
||||
}
|
||||
|
||||
func (p *criticalPaths) update(tpVal string, num int) {
|
||||
// first verify if `tpVal` exists or not
|
||||
i := -1
|
||||
if tpVal == p[0].TopologyValue {
|
||||
i = 0
|
||||
} else if tpVal == p[1].TopologyValue {
|
||||
i = 1
|
||||
}
|
||||
|
||||
if i >= 0 {
|
||||
// `tpVal` exists
|
||||
p[i].MatchNum = num
|
||||
if p[0].MatchNum > p[1].MatchNum {
|
||||
// swap paths[0] and paths[1]
|
||||
p[0], p[1] = p[1], p[0]
|
||||
}
|
||||
} else {
|
||||
// `tpVal` doesn't exist
|
||||
if num < p[0].MatchNum {
|
||||
// update paths[1] with paths[0]
|
||||
p[1] = p[0]
|
||||
// update paths[0]
|
||||
p[0].TopologyValue, p[0].MatchNum = tpVal, num
|
||||
} else if num < p[1].MatchNum {
|
||||
// update paths[1]
|
||||
p[1].TopologyValue, p[1].MatchNum = tpVal, num
|
||||
}
|
||||
}
|
||||
}
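
// For illustration (hypothetical values), starting from newCriticalPaths():
//
//	p.update("zoneA", 3) // paths: [{zoneA 3} {"" MaxInt32}]
//	p.update("zoneB", 1) // paths: [{zoneB 1} {zoneA 3}]
//	p.update("zoneA", 0) // paths: [{zoneA 0} {zoneB 1}] (slots swapped)
//
// paths[0] always tracks the current minimum; paths[1] is only a candidate and not
// necessarily the true second minimum.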
|
||||
|
||||
// PreFilter invoked at the prefilter extension point.
|
||||
func (pl *PodTopologySpread) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
s, err := pl.calPreFilterState(ctx, pod)
|
||||
if err != nil {
|
||||
return nil, framework.AsStatus(err)
|
||||
} else if s != nil && len(s.Constraints) == 0 {
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
cycleState.Write(preFilterStateKey, s)
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions returns prefilter extensions, pod add and remove.
|
||||
func (pl *PodTopologySpread) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return pl
|
||||
}
|
||||
|
||||
// AddPod from pre-computed data in cycleState.
|
||||
func (pl *PodTopologySpread) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
s, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
pl.updateWithPod(s, podInfoToAdd.Pod, podToSchedule, nodeInfo.Node(), 1)
|
||||
return nil
|
||||
}
|
||||
|
||||
// RemovePod from pre-computed data in cycleState.
|
||||
func (pl *PodTopologySpread) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
s, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
pl.updateWithPod(s, podInfoToRemove.Pod, podToSchedule, nodeInfo.Node(), -1)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (pl *PodTopologySpread) updateWithPod(s *preFilterState, updatedPod, preemptorPod *v1.Pod, node *v1.Node, delta int) {
|
||||
if s == nil || updatedPod.Namespace != preemptorPod.Namespace || node == nil {
|
||||
return
|
||||
}
|
||||
if !nodeLabelsMatchSpreadConstraints(node.Labels, s.Constraints) {
|
||||
return
|
||||
}
|
||||
|
||||
requiredSchedulingTerm := nodeaffinity.GetRequiredNodeAffinity(preemptorPod)
|
||||
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
|
||||
// Spreading only applies to nodes that satisfy the pod's NodeAffinity/NodeSelector filters.
|
||||
// Ignore parsing errors for backwards compatibility.
|
||||
if match, _ := requiredSchedulingTerm.Match(node); !match {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
podLabelSet := labels.Set(updatedPod.Labels)
|
||||
for _, constraint := range s.Constraints {
|
||||
if !constraint.Selector.Matches(podLabelSet) {
|
||||
continue
|
||||
}
|
||||
|
||||
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
|
||||
!constraint.matchNodeInclusionPolicies(preemptorPod, node, requiredSchedulingTerm) {
|
||||
continue
|
||||
}
|
||||
|
||||
k, v := constraint.TopologyKey, node.Labels[constraint.TopologyKey]
|
||||
pair := topologyPair{key: k, value: v}
|
||||
s.TpPairToMatchNum[pair] += delta
|
||||
s.TpKeyToCriticalPaths[k].update(v, s.TpPairToMatchNum[pair])
|
||||
}
|
||||
}
|
||||
|
||||
// getPreFilterState fetches a pre-computed preFilterState.
|
||||
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
|
||||
c, err := cycleState.Read(preFilterStateKey)
|
||||
if err != nil {
|
||||
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
|
||||
return nil, fmt.Errorf("reading %q from cycleState: %w", preFilterStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(*preFilterState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%+v convert to podtopologyspread.preFilterState error", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// calPreFilterState computes preFilterState describing how pods are spread on topologies.
|
||||
func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod) (*preFilterState, error) {
|
||||
constraints, err := pl.getConstraints(pod)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get constraints from pod: %w", err)
|
||||
}
|
||||
if len(constraints) == 0 {
|
||||
return &preFilterState{}, nil
|
||||
}
|
||||
|
||||
allNodes, err := pl.sharedLister.NodeInfos().List()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("listing NodeInfos: %w", err)
|
||||
}
|
||||
|
||||
s := preFilterState{
|
||||
Constraints: constraints,
|
||||
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(constraints)),
|
||||
TpPairToMatchNum: make(map[topologyPair]int, sizeHeuristic(len(allNodes), constraints)),
|
||||
}
|
||||
|
||||
tpCountsByNode := make([]map[topologyPair]int, len(allNodes))
|
||||
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
|
||||
processNode := func(i int) {
|
||||
nodeInfo := allNodes[i]
|
||||
node := nodeInfo.Node()
|
||||
|
||||
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
|
||||
// Spreading only applies to nodes that satisfy the pod's NodeAffinity/NodeSelector filters.
|
||||
// Ignore parsing errors for backwards compatibility.
|
||||
if match, _ := requiredNodeAffinity.Match(node); !match {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure current node's labels contains all topologyKeys in 'Constraints'.
|
||||
if !nodeLabelsMatchSpreadConstraints(node.Labels, constraints) {
|
||||
return
|
||||
}
|
||||
|
||||
tpCounts := make(map[topologyPair]int, len(constraints))
|
||||
for _, c := range constraints {
|
||||
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
|
||||
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
|
||||
continue
|
||||
}
|
||||
|
||||
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
|
||||
count := countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace)
|
||||
tpCounts[pair] = count
|
||||
}
|
||||
tpCountsByNode[i] = tpCounts
|
||||
}
|
||||
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
|
||||
|
||||
for _, tpCounts := range tpCountsByNode {
|
||||
for tp, count := range tpCounts {
|
||||
s.TpPairToMatchNum[tp] += count
|
||||
}
|
||||
}
|
||||
s.TpKeyToDomainsNum = make(map[string]int, len(constraints))
|
||||
for tp := range s.TpPairToMatchNum {
|
||||
s.TpKeyToDomainsNum[tp.key]++
|
||||
}
|
||||
|
||||
// calculate min match for each topology pair
|
||||
for i := 0; i < len(constraints); i++ {
|
||||
key := constraints[i].TopologyKey
|
||||
s.TpKeyToCriticalPaths[key] = newCriticalPaths()
|
||||
}
|
||||
for pair, num := range s.TpPairToMatchNum {
|
||||
s.TpKeyToCriticalPaths[pair.key].update(pair.value, num)
|
||||
}
|
||||
|
||||
return &s, nil
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
node := nodeInfo.Node()
|
||||
|
||||
s, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
// However, "empty" preFilterState is legit which tolerates every toSchedule Pod.
|
||||
if len(s.Constraints) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
logger := klog.FromContext(ctx)
|
||||
podLabelSet := labels.Set(pod.Labels)
|
||||
for _, c := range s.Constraints {
|
||||
tpKey := c.TopologyKey
|
||||
tpVal, ok := node.Labels[c.TopologyKey]
|
||||
if !ok {
|
||||
logger.V(5).Info("Node doesn't have required label", "node", klog.KObj(node), "label", tpKey)
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonNodeLabelNotMatch)
|
||||
}
|
||||
|
||||
// judging criteria:
|
||||
// 'existing matching num' + 'if self-match (1 or 0)' - 'global minimum' <= 'maxSkew'
|
||||
minMatchNum, err := s.minMatchNum(tpKey, c.MinDomains)
|
||||
if err != nil {
|
||||
logger.Error(err, "Internal error occurred while retrieving value precalculated in PreFilter", "topologyKey", tpKey, "paths", s.TpKeyToCriticalPaths)
|
||||
continue
|
||||
}
|
||||
|
||||
selfMatchNum := 0
|
||||
if c.Selector.Matches(podLabelSet) {
|
||||
selfMatchNum = 1
|
||||
}
|
||||
|
||||
pair := topologyPair{key: tpKey, value: tpVal}
|
||||
matchNum := 0
|
||||
if tpCount, ok := s.TpPairToMatchNum[pair]; ok {
|
||||
matchNum = tpCount
|
||||
}
|
||||
skew := matchNum + selfMatchNum - minMatchNum
|
||||
if skew > int(c.MaxSkew) {
|
||||
logger.V(5).Info("Node failed spreadConstraint: matchNum + selfMatchNum - minMatchNum > maxSkew", "node", klog.KObj(node), "topologyKey", tpKey, "matchNum", matchNum, "selfMatchNum", selfMatchNum, "minMatchNum", minMatchNum, "maxSkew", c.MaxSkew)
|
||||
return framework.NewStatus(framework.Unschedulable, ErrReasonConstraintsNotMatch)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
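
// For illustration (hypothetical numbers) of the judging criteria above: with
// maxSkew=1, matchNum=3 pods already in the candidate node's topology domain,
// minMatchNum=1 and an incoming pod that matches its own constraint selector
// (selfMatchNum=1), the skew is 3+1-1 = 3 > 1, so the node is rejected with
// ErrReasonConstraintsNotMatch.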
|
||||
|
||||
func sizeHeuristic(nodes int, constraints []topologySpreadConstraint) int {
|
||||
for _, c := range constraints {
|
||||
if c.TopologyKey == v1.LabelHostname {
|
||||
return nodes
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
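
// Sizing note: with a kubernetes.io/hostname constraint every node forms its own
// topology domain, so TpPairToMatchNum is pre-sized to one entry per node; for other
// topology keys the number of domains is unknown up front and the map starts empty.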
|
351
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/plugin.go
generated
vendored
Normal file
@ -0,0 +1,351 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package podtopologyspread
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/equality"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/client-go/informers"
|
||||
appslisters "k8s.io/client-go/listers/apps/v1"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
const (
|
||||
// ErrReasonConstraintsNotMatch is used for PodTopologySpread filter error.
|
||||
ErrReasonConstraintsNotMatch = "node(s) didn't match pod topology spread constraints"
|
||||
// ErrReasonNodeLabelNotMatch is used when the node doesn't hold the required label.
|
||||
ErrReasonNodeLabelNotMatch = ErrReasonConstraintsNotMatch + " (missing required label)"
|
||||
)
|
||||
|
||||
var systemDefaultConstraints = []v1.TopologySpreadConstraint{
|
||||
{
|
||||
TopologyKey: v1.LabelHostname,
|
||||
WhenUnsatisfiable: v1.ScheduleAnyway,
|
||||
MaxSkew: 3,
|
||||
},
|
||||
{
|
||||
TopologyKey: v1.LabelTopologyZone,
|
||||
WhenUnsatisfiable: v1.ScheduleAnyway,
|
||||
MaxSkew: 5,
|
||||
},
|
||||
}
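
// For illustration (assumed pod-spec equivalent): the system defaults above roughly
// correspond to every pod declaring soft constraints such as
//
//	topologySpreadConstraints:
//	- maxSkew: 3
//	  topologyKey: kubernetes.io/hostname
//	  whenUnsatisfiable: ScheduleAnyway
//	- maxSkew: 5
//	  topologyKey: topology.kubernetes.io/zone
//	  whenUnsatisfiable: ScheduleAnyway
//
// with the label selector defaulted from the pod's matching services, replication
// controllers, replica sets and stateful sets (see buildDefaultConstraints).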
|
||||
|
||||
// PodTopologySpread is a plugin that ensures pod's topologySpreadConstraints is satisfied.
|
||||
type PodTopologySpread struct {
|
||||
systemDefaulted bool
|
||||
parallelizer parallelize.Parallelizer
|
||||
defaultConstraints []v1.TopologySpreadConstraint
|
||||
sharedLister framework.SharedLister
|
||||
services corelisters.ServiceLister
|
||||
replicationCtrls corelisters.ReplicationControllerLister
|
||||
replicaSets appslisters.ReplicaSetLister
|
||||
statefulSets appslisters.StatefulSetLister
|
||||
enableNodeInclusionPolicyInPodTopologySpread bool
|
||||
enableMatchLabelKeysInPodTopologySpread bool
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.PreFilterPlugin = &PodTopologySpread{}
|
||||
var _ framework.FilterPlugin = &PodTopologySpread{}
|
||||
var _ framework.PreScorePlugin = &PodTopologySpread{}
|
||||
var _ framework.ScorePlugin = &PodTopologySpread{}
|
||||
var _ framework.EnqueueExtensions = &PodTopologySpread{}
|
||||
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
const Name = names.PodTopologySpread
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *PodTopologySpread) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
if h.SnapshotSharedLister() == nil {
|
||||
return nil, fmt.Errorf("SnapshotSharedlister is nil")
|
||||
}
|
||||
args, err := getArgs(plArgs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := validation.ValidatePodTopologySpreadArgs(nil, &args); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pl := &PodTopologySpread{
|
||||
parallelizer: h.Parallelizer(),
|
||||
sharedLister: h.SnapshotSharedLister(),
|
||||
defaultConstraints: args.DefaultConstraints,
|
||||
enableNodeInclusionPolicyInPodTopologySpread: fts.EnableNodeInclusionPolicyInPodTopologySpread,
|
||||
enableMatchLabelKeysInPodTopologySpread: fts.EnableMatchLabelKeysInPodTopologySpread,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
}
|
||||
if args.DefaultingType == config.SystemDefaulting {
|
||||
pl.defaultConstraints = systemDefaultConstraints
|
||||
pl.systemDefaulted = true
|
||||
}
|
||||
if len(pl.defaultConstraints) != 0 {
|
||||
if h.SharedInformerFactory() == nil {
|
||||
return nil, fmt.Errorf("SharedInformerFactory is nil")
|
||||
}
|
||||
pl.setListers(h.SharedInformerFactory())
|
||||
}
|
||||
return pl, nil
|
||||
}
|
||||
|
||||
func getArgs(obj runtime.Object) (config.PodTopologySpreadArgs, error) {
|
||||
ptr, ok := obj.(*config.PodTopologySpreadArgs)
|
||||
if !ok {
|
||||
return config.PodTopologySpreadArgs{}, fmt.Errorf("want args to be of type PodTopologySpreadArgs, got %T", obj)
|
||||
}
|
||||
return *ptr, nil
|
||||
}
|
||||
|
||||
func (pl *PodTopologySpread) setListers(factory informers.SharedInformerFactory) {
|
||||
pl.services = factory.Core().V1().Services().Lister()
|
||||
pl.replicationCtrls = factory.Core().V1().ReplicationControllers().Lister()
|
||||
pl.replicaSets = factory.Apps().V1().ReplicaSets().Lister()
|
||||
pl.statefulSets = factory.Apps().V1().StatefulSets().Lister()
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
// rejected by this plugin schedulable.
|
||||
func (pl *PodTopologySpread) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
podActionType := framework.Add | framework.UpdatePodLabel | framework.Delete
|
||||
if pl.enableSchedulingQueueHint {
|
||||
// When the QueueingHint feature is enabled, the scheduling queue uses Pod/Update Queueing Hint
|
||||
// to determine whether a Pod's update makes the Pod schedulable or not.
|
||||
// https://github.com/kubernetes/kubernetes/pull/122234
|
||||
// (If not, the scheduling queue always retries the unschedulable Pods when they're updated.)
|
||||
//
|
||||
// The Pod rejected by this plugin can be schedulable when the Pod has a spread constraint with NodeTaintsPolicy:Honor
|
||||
// and has got a new toleration.
|
||||
// So, we add UpdatePodTolerations here only when QHint is enabled.
|
||||
podActionType = framework.Add | framework.UpdatePodLabel | framework.UpdatePodTolerations | framework.Delete
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
// All ActionType includes the following events:
|
||||
// - Add. An unschedulable Pod may fail due to violating topology spread constraints,
|
||||
// adding an assigned Pod may make it schedulable.
|
||||
// - UpdatePodLabel. Updating on an existing Pod's labels (e.g., removal) may make
|
||||
// an unschedulable Pod schedulable.
|
||||
// - Delete. An unschedulable Pod may fail due to violating an existing Pod's topology spread constraints,
|
||||
// deleting an existing Pod may make it schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: podActionType}, QueueingHintFn: pl.isSchedulableAfterPodChange},
|
||||
// Node add|delete|update may lead to a topology key change,
// and make the pods being scheduled schedulable or unschedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.Delete | framework.UpdateNodeLabel | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// involvedInTopologySpreading returns true if the incomingPod is involved in the topology spreading of podWithSpreading.
|
||||
func involvedInTopologySpreading(incomingPod, podWithSpreading *v1.Pod) bool {
|
||||
return incomingPod.UID == podWithSpreading.UID ||
|
||||
(incomingPod.Spec.NodeName != "" && incomingPod.Namespace == podWithSpreading.Namespace)
|
||||
}
|
||||
|
||||
// hasConstraintWithNodeTaintsPolicyHonor returns true if any constraint has `NodeTaintsPolicy: Honor`.
|
||||
func hasConstraintWithNodeTaintsPolicyHonor(constraints []topologySpreadConstraint) bool {
|
||||
for _, c := range constraints {
|
||||
if c.NodeTaintsPolicy == v1.NodeInclusionPolicyHonor {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (pl *PodTopologySpread) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalPod, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
if (modifiedPod != nil && !involvedInTopologySpreading(modifiedPod, pod)) || (originalPod != nil && !involvedInTopologySpreading(originalPod, pod)) {
|
||||
logger.V(5).Info("the added/updated/deleted pod is unscheduled or has different namespace with target pod, so it doesn't make the target pod schedulable",
|
||||
"pod", klog.KObj(pod), "originalPod", klog.KObj(originalPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
constraints, err := pl.getConstraints(pod)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
// Pod is modified. Return Queue when the label(s) matching topologySpread's selector is added, changed, or deleted.
|
||||
if modifiedPod != nil && originalPod != nil {
|
||||
if pod.UID == modifiedPod.UID && !equality.Semantic.DeepEqual(modifiedPod.Spec.Tolerations, originalPod.Spec.Tolerations) && hasConstraintWithNodeTaintsPolicyHonor(constraints) {
|
||||
// If any constraint has `NodeTaintsPolicy: Honor`, we can return Queue when the target Pod has got a new toleration.
|
||||
logger.V(5).Info("the unschedulable pod has got a new toleration, which could make it schedulable",
|
||||
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
if equality.Semantic.DeepEqual(modifiedPod.Labels, originalPod.Labels) {
|
||||
logger.V(5).Info("the pod's update doesn't include the label update, which doesn't make the target pod schedulable",
|
||||
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
for _, c := range constraints {
|
||||
if c.Selector.Matches(labels.Set(originalPod.Labels)) != c.Selector.Matches(labels.Set(modifiedPod.Labels)) {
|
||||
// This modification makes this Pod match(or not match) with this constraint.
|
||||
// Maybe now the scheduling result of topology spread gets changed by this change.
|
||||
logger.V(5).Info("a scheduled pod's label was updated and it makes the updated pod match or unmatch the pod's topology spread constraints",
|
||||
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
// This label modification doesn't change whether this Pod matches the selector of any constraint.
|
||||
logger.V(5).Info("a scheduled pod's label was updated, but it's a change unrelated to the pod's topology spread constraints",
|
||||
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// Pod is added. Return Queue when the added Pod has a label that matches with topologySpread's selector.
|
||||
if modifiedPod != nil {
|
||||
if podLabelsMatchSpreadConstraints(constraints, modifiedPod.Labels) {
|
||||
logger.V(5).Info("a scheduled pod was created and it matches with the pod's topology spread constraints",
|
||||
"pod", klog.KObj(pod), "createdPod", klog.KObj(modifiedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
logger.V(5).Info("a scheduled pod was created, but it doesn't matches with the pod's topology spread constraints",
|
||||
"pod", klog.KObj(pod), "createdPod", klog.KObj(modifiedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// Pod is deleted. Return Queue when the deleted Pod has a label that matches with topologySpread's selector.
|
||||
if podLabelsMatchSpreadConstraints(constraints, originalPod.Labels) {
|
||||
logger.V(5).Info("a scheduled pod which matches with the pod's topology spread constraints was deleted, and the pod may be schedulable now",
|
||||
"pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
logger.V(5).Info("a scheduled pod was deleted, but it's unrelated to the pod's topology spread constraints",
|
||||
"pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
|
||||
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// getConstraints extracts topologySpreadConstraint(s) from the Pod spec.
|
||||
// If the Pod doesn't have any topologySpreadConstraint, it returns default constraints.
|
||||
func (pl *PodTopologySpread) getConstraints(pod *v1.Pod) ([]topologySpreadConstraint, error) {
|
||||
var constraints []topologySpreadConstraint
|
||||
var err error
|
||||
if len(pod.Spec.TopologySpreadConstraints) > 0 {
|
||||
// The API server's feature gating already strips the field from the spec,
// so there is no need to re-check the feature gate here; checking the length of the constraints is enough.
|
||||
constraints, err = pl.filterTopologySpreadConstraints(
|
||||
pod.Spec.TopologySpreadConstraints,
|
||||
pod.Labels,
|
||||
v1.DoNotSchedule,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("obtaining pod's hard topology spread constraints: %w", err)
|
||||
}
|
||||
} else {
|
||||
constraints, err = pl.buildDefaultConstraints(pod, v1.DoNotSchedule)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("setting default hard topology spread constraints: %w", err)
|
||||
}
|
||||
}
|
||||
return constraints, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterNodeChange returns Queue when the node has a topologyKey in its labels; otherwise it returns QueueSkip.
|
||||
func (pl *PodTopologySpread) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
constraints, err := pl.getConstraints(pod)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
var originalNodeMatching, modifiedNodeMatching bool
|
||||
if originalNode != nil {
|
||||
originalNodeMatching = nodeLabelsMatchSpreadConstraints(originalNode.Labels, constraints)
|
||||
}
|
||||
if modifiedNode != nil {
|
||||
modifiedNodeMatching = nodeLabelsMatchSpreadConstraints(modifiedNode.Labels, constraints)
|
||||
}
|
||||
|
||||
// We return Queue in the following cases:
|
||||
// 1. Node/UpdateNodeLabel:
|
||||
// - The original node matched the pod's topology spread constraints, but the modified node does not.
|
||||
// - The modified node matches the pod's topology spread constraints, but the original node does not.
|
||||
// - The modified node matches the pod's topology spread constraints, and the original node and the modified node have different label values for any topologyKey.
|
||||
// 2. Node/UpdateNodeTaint:
|
||||
// - The modified node matches the pod's topology spread constraints, and the original node and the modified node have different taints.
|
||||
// 3. Node/Add: The created node matches the pod's topology spread constraints.
|
||||
// 4. Node/Delete: The original node matched the pod's topology spread constraints.
|
||||
if originalNode != nil && modifiedNode != nil {
|
||||
if originalNodeMatching != modifiedNodeMatching {
|
||||
logger.V(5).Info("the node is updated and now pod topology spread constraints has changed, and the pod may be schedulable now",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode), "originalMatching", originalNodeMatching, "newMatching", modifiedNodeMatching)
|
||||
return framework.Queue, nil
|
||||
}
|
||||
if modifiedNodeMatching && (checkTopologyKeyLabelsChanged(originalNode.Labels, modifiedNode.Labels, constraints) || !equality.Semantic.DeepEqual(originalNode.Spec.Taints, modifiedNode.Spec.Taints)) {
|
||||
logger.V(5).Info("the node is updated and now has different taints or labels, and the pod may be schedulable now",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
if modifiedNode != nil {
|
||||
if !modifiedNodeMatching {
|
||||
logger.V(5).Info("the created node doesn't match pod topology spread constraints",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
logger.V(5).Info("the created node matches topology spread constraints, and the pod may be schedulable now",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
if !originalNodeMatching {
|
||||
logger.V(5).Info("the deleted node doesn't match pod topology spread constraints", "pod", klog.KObj(pod), "node", klog.KObj(originalNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
logger.V(5).Info("the deleted node matches topology spread constraints, and the pod may be schedulable now",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(originalNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// checkTopologyKeyLabelsChanged checks if any of the labels specified as topologyKey in the constraints have changed.
|
||||
func checkTopologyKeyLabelsChanged(originalLabels, modifiedLabels map[string]string, constraints []topologySpreadConstraint) bool {
|
||||
for _, constraint := range constraints {
|
||||
topologyKey := constraint.TopologyKey
|
||||
if originalLabels[topologyKey] != modifiedLabels[topologyKey] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
305
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/scoring.go
generated
vendored
Normal file
@ -0,0 +1,305 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package podtopologyspread
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"sync/atomic"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
)
|
||||
|
||||
const preScoreStateKey = "PreScore" + Name
|
||||
const invalidScore = -1
|
||||
|
||||
// preScoreState computed at PreScore and used at Score.
|
||||
// Fields are exported for comparison during testing.
|
||||
type preScoreState struct {
|
||||
Constraints []topologySpreadConstraint
|
||||
// IgnoredNodes is a set of node names which miss some Constraints[*].topologyKey.
|
||||
IgnoredNodes sets.Set[string]
|
||||
// TopologyPairToPodCounts is keyed with topologyPair, and valued with the number of matching pods.
|
||||
TopologyPairToPodCounts map[topologyPair]*int64
|
||||
// TopologyNormalizingWeight is the weight we give to the counts per topology.
|
||||
// This allows the pod counts of smaller topologies to not be watered down by
|
||||
// bigger ones.
|
||||
TopologyNormalizingWeight []float64
|
||||
}
|
||||
|
||||
// Clone implements the mandatory Clone interface. We don't really copy the data since
|
||||
// there is no need for that.
|
||||
func (s *preScoreState) Clone() framework.StateData {
|
||||
return s
|
||||
}
|
||||
|
||||
// initPreScoreState iterates "filteredNodes" to filter out the nodes which
|
||||
// don't have the required topologyKey(s), and initializes:
|
||||
// 1) s.TopologyPairToPodCounts: keyed with both eligible topology pair and node names.
|
||||
// 2) s.IgnoredNodes: the set of nodes that shouldn't be scored.
|
||||
// 3) s.TopologyNormalizingWeight: The weight to be given to each constraint based on the number of values in a topology.
|
||||
func (pl *PodTopologySpread) initPreScoreState(s *preScoreState, pod *v1.Pod, filteredNodes []*framework.NodeInfo, requireAllTopologies bool) error {
|
||||
var err error
|
||||
if len(pod.Spec.TopologySpreadConstraints) > 0 {
|
||||
s.Constraints, err = pl.filterTopologySpreadConstraints(
|
||||
pod.Spec.TopologySpreadConstraints,
|
||||
pod.Labels,
|
||||
v1.ScheduleAnyway,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("obtaining pod's soft topology spread constraints: %w", err)
|
||||
}
|
||||
} else {
|
||||
s.Constraints, err = pl.buildDefaultConstraints(pod, v1.ScheduleAnyway)
|
||||
if err != nil {
|
||||
return fmt.Errorf("setting default soft topology spread constraints: %w", err)
|
||||
}
|
||||
}
|
||||
if len(s.Constraints) == 0 {
|
||||
return nil
|
||||
}
|
||||
topoSize := make([]int, len(s.Constraints))
|
||||
for _, node := range filteredNodes {
|
||||
if requireAllTopologies && !nodeLabelsMatchSpreadConstraints(node.Node().Labels, s.Constraints) {
|
||||
// Nodes which don't have all required topologyKeys present are ignored
|
||||
// when scoring later.
|
||||
s.IgnoredNodes.Insert(node.Node().Name)
|
||||
continue
|
||||
}
|
||||
for i, constraint := range s.Constraints {
|
||||
// per-node counts are calculated during Score.
|
||||
if constraint.TopologyKey == v1.LabelHostname {
|
||||
continue
|
||||
}
|
||||
pair := topologyPair{key: constraint.TopologyKey, value: node.Node().Labels[constraint.TopologyKey]}
|
||||
if s.TopologyPairToPodCounts[pair] == nil {
|
||||
s.TopologyPairToPodCounts[pair] = new(int64)
|
||||
topoSize[i]++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
s.TopologyNormalizingWeight = make([]float64, len(s.Constraints))
|
||||
for i, c := range s.Constraints {
|
||||
sz := topoSize[i]
|
||||
if c.TopologyKey == v1.LabelHostname {
|
||||
sz = len(filteredNodes) - len(s.IgnoredNodes)
|
||||
}
|
||||
s.TopologyNormalizingWeight[i] = topologyNormalizingWeight(sz)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// PreScore builds and writes cycle state used by Score and NormalizeScore.
|
||||
func (pl *PodTopologySpread) PreScore(
|
||||
ctx context.Context,
|
||||
cycleState *framework.CycleState,
|
||||
pod *v1.Pod,
|
||||
filteredNodes []*framework.NodeInfo,
|
||||
) *framework.Status {
|
||||
allNodes, err := pl.sharedLister.NodeInfos().List()
|
||||
if err != nil {
|
||||
return framework.AsStatus(fmt.Errorf("getting all nodes: %w", err))
|
||||
}
|
||||
|
||||
if len(allNodes) == 0 {
|
||||
// No need to score.
|
||||
return framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
state := &preScoreState{
|
||||
IgnoredNodes: sets.New[string](),
|
||||
TopologyPairToPodCounts: make(map[topologyPair]*int64),
|
||||
}
|
||||
// Only require that nodes have all the topology labels if using
|
||||
// non-system-default spreading rules. This allows nodes that don't have a
|
||||
// zone label to still have hostname spreading.
|
||||
requireAllTopologies := len(pod.Spec.TopologySpreadConstraints) > 0 || !pl.systemDefaulted
|
||||
err = pl.initPreScoreState(state, pod, filteredNodes, requireAllTopologies)
|
||||
if err != nil {
|
||||
return framework.AsStatus(fmt.Errorf("calculating preScoreState: %w", err))
|
||||
}
|
||||
|
||||
// return Skip if incoming pod doesn't have soft topology spread Constraints.
|
||||
if len(state.Constraints) == 0 {
|
||||
return framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
// Ignore parsing errors for backwards compatibility.
|
||||
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
|
||||
processAllNode := func(i int) {
|
||||
nodeInfo := allNodes[i]
|
||||
node := nodeInfo.Node()
|
||||
|
||||
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
|
||||
// `node` should satisfy incoming pod's NodeSelector/NodeAffinity
|
||||
if match, _ := requiredNodeAffinity.Match(node); !match {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// All topologyKeys need to be present in `node`
|
||||
if requireAllTopologies && !nodeLabelsMatchSpreadConstraints(node.Labels, state.Constraints) {
|
||||
return
|
||||
}
|
||||
|
||||
for _, c := range state.Constraints {
|
||||
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
|
||||
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
|
||||
continue
|
||||
}
|
||||
|
||||
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
|
||||
// If current topology pair is not associated with any candidate node,
|
||||
// continue to avoid unnecessary calculation.
|
||||
// Per-node counts are also skipped, as they are done during Score.
|
||||
tpCount := state.TopologyPairToPodCounts[pair]
|
||||
if tpCount == nil {
|
||||
continue
|
||||
}
|
||||
count := countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace)
|
||||
atomic.AddInt64(tpCount, int64(count))
|
||||
}
|
||||
}
|
||||
pl.parallelizer.Until(ctx, len(allNodes), processAllNode, pl.Name())
|
||||
|
||||
cycleState.Write(preScoreStateKey, state)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Score invoked at the Score extension point.
|
||||
// The "score" returned in this function is the matching number of pods on the `nodeName`,
|
||||
// it is normalized later.
|
||||
func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
node := nodeInfo.Node()
|
||||
s, err := getPreScoreState(cycleState)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(err)
|
||||
}
|
||||
|
||||
// Return if the node is not qualified.
|
||||
if s.IgnoredNodes.Has(node.Name) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// For each present <pair>, current node gets a credit of <matchSum>.
|
||||
// And we sum up <matchSum> and return it as this node's score.
|
||||
var score float64
|
||||
for i, c := range s.Constraints {
|
||||
if tpVal, ok := node.Labels[c.TopologyKey]; ok {
|
||||
var cnt int64
|
||||
if c.TopologyKey == v1.LabelHostname {
|
||||
cnt = int64(countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace))
|
||||
} else {
|
||||
pair := topologyPair{key: c.TopologyKey, value: tpVal}
|
||||
cnt = *s.TopologyPairToPodCounts[pair]
|
||||
}
|
||||
score += scoreForCount(cnt, c.MaxSkew, s.TopologyNormalizingWeight[i])
|
||||
}
|
||||
}
|
||||
return int64(math.Round(score)), nil
|
||||
}
|
||||
|
||||
// NormalizeScore invoked after scoring all nodes.
|
||||
func (pl *PodTopologySpread) NormalizeScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
|
||||
s, err := getPreScoreState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
if s == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Calculate <minScore> and <maxScore>
|
||||
var minScore int64 = math.MaxInt64
|
||||
var maxScore int64
|
||||
for i, score := range scores {
|
||||
// it's mandatory to check if <score.Name> is present in s.IgnoredNodes
|
||||
if s.IgnoredNodes.Has(score.Name) {
|
||||
scores[i].Score = invalidScore
|
||||
continue
|
||||
}
|
||||
if score.Score < minScore {
|
||||
minScore = score.Score
|
||||
}
|
||||
if score.Score > maxScore {
|
||||
maxScore = score.Score
|
||||
}
|
||||
}
|
||||
|
||||
for i := range scores {
|
||||
if scores[i].Score == invalidScore {
|
||||
scores[i].Score = 0
|
||||
continue
|
||||
}
|
||||
if maxScore == 0 {
|
||||
scores[i].Score = framework.MaxNodeScore
|
||||
continue
|
||||
}
|
||||
s := scores[i].Score
|
||||
scores[i].Score = framework.MaxNodeScore * (maxScore + minScore - s) / maxScore
|
||||
}
|
||||
return nil
|
||||
}
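
// For illustration (hypothetical raw scores, assuming framework.MaxNodeScore is 100):
// with raw scores 2, 4 and 6, minScore=2 and maxScore=6, so the normalized scores are
// 100*(6+2-2)/6=100, 100*(6+2-4)/6=66 and 100*(6+2-6)/6=33; the node with the fewest
// matching pods ends up with the highest score.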
|
||||
|
||||
// ScoreExtensions of the Score plugin.
|
||||
func (pl *PodTopologySpread) ScoreExtensions() framework.ScoreExtensions {
|
||||
return pl
|
||||
}
|
||||
|
||||
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
|
||||
c, err := cycleState.Read(preScoreStateKey)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading %q from cycleState: %w", preScoreStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(*preScoreState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%+v convert to podtopologyspread.preScoreState error", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// topologyNormalizingWeight calculates the weight for the topology, based on
|
||||
// the number of values that exist for a topology.
|
||||
// Since <size> is at least 1 (all nodes that passed the Filters are in the
|
||||
// same topology), and k8s supports 5k nodes, the result is in the interval
|
||||
// <1.09, 8.52>.
|
||||
//
|
||||
// Note: <size> could also be zero when no nodes have the required topologies,
|
||||
// however we don't care about topology weight in this case as we return a 0
|
||||
// score for all nodes.
|
||||
func topologyNormalizingWeight(size int) float64 {
|
||||
return math.Log(float64(size + 2))
|
||||
}
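
// For illustration: size=1 gives math.Log(3) ≈ 1.099 and size=5000 gives
// math.Log(5002) ≈ 8.52, which is where the <1.09, 8.52> interval quoted above
// comes from.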
|
||||
|
||||
// scoreForCount calculates the score based on number of matching pods in a
|
||||
// topology domain, the constraint's maxSkew and the topology weight.
|
||||
// `maxSkew-1` is added to the score so that differences between topology
|
||||
// domains get watered down, controlling the tolerance of the score to skews.
|
||||
func scoreForCount(cnt int64, maxSkew int32, tpWeight float64) float64 {
|
||||
return float64(cnt)*tpWeight + float64(maxSkew-1)
|
||||
}
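
// For illustration (hypothetical numbers): with cnt=4 matching pods in a domain,
// maxSkew=2 and a topology weight of math.Log(5) ≈ 1.61 (three domains, so size+2=5),
// the contribution is 4*1.61 + (2-1) ≈ 7.44 before normalization.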
|
53
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort/priority_sort.go
generated
vendored
Normal file
@ -0,0 +1,53 @@
|
||||
/*
|
||||
Copyright 2020 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package queuesort
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
)
|
||||
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
const Name = names.PrioritySort
|
||||
|
||||
// PrioritySort is a plugin that implements Priority based sorting.
|
||||
type PrioritySort struct{}
|
||||
|
||||
var _ framework.QueueSortPlugin = &PrioritySort{}
|
||||
|
||||
// Name returns name of the plugin.
|
||||
func (pl *PrioritySort) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// Less is the function used by the activeQ heap algorithm to sort pods.
|
||||
// It sorts pods based on their priority. When priorities are equal, it uses
|
||||
// QueuedPodInfo.Timestamp.
|
||||
func (pl *PrioritySort) Less(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
|
||||
p1 := corev1helpers.PodPriority(pInfo1.Pod)
|
||||
p2 := corev1helpers.PodPriority(pInfo2.Pod)
|
||||
return (p1 > p2) || (p1 == p2 && pInfo1.Timestamp.Before(pInfo2.Timestamp))
|
||||
}
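
// For illustration (hypothetical pods): a pod with priority 100 sorts ahead of one
// with priority 50; when both have priority 50, the pod whose QueuedPodInfo.Timestamp
// is earlier comes first.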
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, _ runtime.Object, handle framework.Handle) (framework.Plugin, error) {
|
||||
return &PrioritySort{}, nil
|
||||
}
|
84
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/registry.go
generated
vendored
Normal file
@ -0,0 +1,84 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package plugins
|
||||
|
||||
import (
|
||||
"k8s.io/apiserver/pkg/util/feature"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources"
|
||||
plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/imagelocality"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/runtime"
|
||||
)
|
||||
|
||||
// NewInTreeRegistry builds the registry with all the in-tree plugins.
|
||||
// A scheduler that runs out of tree plugins can register additional plugins
|
||||
// through the WithFrameworkOutOfTreeRegistry option.
|
||||
func NewInTreeRegistry() runtime.Registry {
|
||||
fts := plfeature.Features{
|
||||
EnableDRAAdminAccess: feature.DefaultFeatureGate.Enabled(features.DRAAdminAccess),
|
||||
EnableDynamicResourceAllocation: feature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation),
|
||||
EnableVolumeCapacityPriority: feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
|
||||
EnableNodeInclusionPolicyInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.NodeInclusionPolicyInPodTopologySpread),
|
||||
EnableMatchLabelKeysInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.MatchLabelKeysInPodTopologySpread),
|
||||
EnableInPlacePodVerticalScaling: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
|
||||
EnableSidecarContainers: feature.DefaultFeatureGate.Enabled(features.SidecarContainers),
|
||||
EnableSchedulingQueueHint: feature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
|
||||
EnableAsyncPreemption: feature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption),
|
||||
EnablePodLevelResources: feature.DefaultFeatureGate.Enabled(features.PodLevelResources),
|
||||
}
|
||||
|
||||
registry := runtime.Registry{
|
||||
dynamicresources.Name: runtime.FactoryAdapter(fts, dynamicresources.New),
|
||||
imagelocality.Name: imagelocality.New,
|
||||
tainttoleration.Name: runtime.FactoryAdapter(fts, tainttoleration.New),
|
||||
nodename.Name: runtime.FactoryAdapter(fts, nodename.New),
|
||||
nodeports.Name: runtime.FactoryAdapter(fts, nodeports.New),
|
||||
nodeaffinity.Name: runtime.FactoryAdapter(fts, nodeaffinity.New),
|
||||
podtopologyspread.Name: runtime.FactoryAdapter(fts, podtopologyspread.New),
|
||||
nodeunschedulable.Name: runtime.FactoryAdapter(fts, nodeunschedulable.New),
|
||||
noderesources.Name: runtime.FactoryAdapter(fts, noderesources.NewFit),
|
||||
noderesources.BalancedAllocationName: runtime.FactoryAdapter(fts, noderesources.NewBalancedAllocation),
|
||||
volumebinding.Name: runtime.FactoryAdapter(fts, volumebinding.New),
|
||||
volumerestrictions.Name: runtime.FactoryAdapter(fts, volumerestrictions.New),
|
||||
volumezone.Name: runtime.FactoryAdapter(fts, volumezone.New),
|
||||
nodevolumelimits.CSIName: runtime.FactoryAdapter(fts, nodevolumelimits.NewCSI),
|
||||
interpodaffinity.Name: runtime.FactoryAdapter(fts, interpodaffinity.New),
|
||||
queuesort.Name: queuesort.New,
|
||||
defaultbinder.Name: defaultbinder.New,
|
||||
defaultpreemption.Name: runtime.FactoryAdapter(fts, defaultpreemption.New),
|
||||
schedulinggates.Name: runtime.FactoryAdapter(fts, schedulinggates.New),
|
||||
}
|
||||
|
||||
return registry
|
||||
}
|
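Illustrative aside, not part of the vendored diff: the doc comment on NewInTreeRegistry above notes that out-of-tree plugins are registered through the WithFrameworkOutOfTreeRegistry option. A minimal sketch of what such a registration could look like, assuming a hypothetical plugin name "MyPlugin" and a hypothetical factory myPluginNew; only the registry plumbing shown above is relied on.

// Hedged sketch: combining a hypothetical out-of-tree plugin with the in-tree registry.
package main

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	frameworkplugins "k8s.io/kubernetes/pkg/scheduler/framework/plugins"
	frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
)

// myPluginNew is a stand-in for an out-of-tree plugin factory (an assumption for this example).
func myPluginNew(_ context.Context, _ runtime.Object, _ framework.Handle) (framework.Plugin, error) {
	return nil, nil // a real factory would return its plugin implementation
}

func buildRegistry() (frameworkruntime.Registry, error) {
	// Start from the in-tree registry and add the extra factory;
	// Register returns an error if the plugin name is already taken.
	registry := frameworkplugins.NewInTreeRegistry()
	if err := registry.Register("MyPlugin", myPluginNew); err != nil {
		return nil, err
	}
	return registry, nil
}

func main() {
	registry, err := buildRegistry()
	if err != nil {
		panic(err)
	}
	fmt.Println(len(registry), "plugin factories registered")
}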
94
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates/scheduling_gates.go
generated
vendored
Normal file
@ -0,0 +1,94 @@
|
||||
/*
|
||||
Copyright 2022 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package schedulinggates
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// Name of the plugin used in the plugin registry and configurations.
|
||||
const Name = names.SchedulingGates
|
||||
|
||||
// SchedulingGates checks if a Pod carries .spec.schedulingGates.
|
||||
type SchedulingGates struct {
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.PreEnqueuePlugin = &SchedulingGates{}
|
||||
var _ framework.EnqueueExtensions = &SchedulingGates{}
|
||||
|
||||
func (pl *SchedulingGates) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
func (pl *SchedulingGates) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
|
||||
if len(p.Spec.SchedulingGates) == 0 {
|
||||
return nil
|
||||
}
|
||||
gates := make([]string, 0, len(p.Spec.SchedulingGates))
|
||||
for _, gate := range p.Spec.SchedulingGates {
|
||||
gates = append(gates, gate.Name)
|
||||
}
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("waiting for scheduling gates: %v", gates))
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *SchedulingGates) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
if !pl.enableSchedulingQueueHint {
|
||||
return nil, nil
|
||||
}
|
||||
// When the QueueingHint feature is enabled,
|
||||
// the scheduling queue uses Pod/Update Queueing Hint
|
||||
// to determine whether a Pod's update makes the Pod schedulable or not.
|
||||
// https://github.com/kubernetes/kubernetes/pull/122234
|
||||
return []framework.ClusterEventWithHint{
|
||||
// Pods can become schedulable once their gates are removed
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodSchedulingGatesEliminated}, QueueingHintFn: pl.isSchedulableAfterUpdatePodSchedulingGatesEliminated},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
return &SchedulingGates{
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (pl *SchedulingGates) isSchedulableAfterUpdatePodSchedulingGatesEliminated(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
if modifiedPod.UID != pod.UID {
|
||||
// If the update event is not for targetPod, it wouldn't make targetPod schedulable.
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
return framework.Queue, nil
|
||||
}
|
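Illustrative aside, not part of the vendored diff: a minimal sketch of what PreEnqueue above returns for a gated Pod. The pod name and gate name are assumptions; the expectation is an UnschedulableAndUnresolvable status listing the pending gate.

// Hedged sketch: exercising SchedulingGates.PreEnqueue with a gated Pod.
package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates"
)

func main() {
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "demo"},
		Spec: v1.PodSpec{
			// "example.com/quota-check" is an assumed gate name for this example.
			SchedulingGates: []v1.PodSchedulingGate{{Name: "example.com/quota-check"}},
		},
	}
	// The zero-value plugin is enough to exercise PreEnqueue.
	pl := &schedulinggates.SchedulingGates{}
	status := pl.PreEnqueue(context.Background(), pod)
	// Expect UnschedulableAndUnresolvable with the pending gate listed in the message.
	fmt.Println(status.Code(), status.Message())
}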
236
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration/taint_toleration.go
generated
vendored
Normal file
@ -0,0 +1,236 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package tainttoleration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
v1helper "k8s.io/component-helpers/scheduling/corev1"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// TaintToleration is a plugin that checks if a pod tolerates a node's taints.
|
||||
type TaintToleration struct {
|
||||
handle framework.Handle
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.FilterPlugin = &TaintToleration{}
|
||||
var _ framework.PreScorePlugin = &TaintToleration{}
|
||||
var _ framework.ScorePlugin = &TaintToleration{}
|
||||
var _ framework.EnqueueExtensions = &TaintToleration{}
|
||||
|
||||
const (
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
Name = names.TaintToleration
|
||||
// preScoreStateKey is the key in CycleState to TaintToleration pre-computed data for Scoring.
|
||||
preScoreStateKey = "PreScore" + Name
|
||||
// ErrReasonNotMatch is the Filter reason status when not matching.
|
||||
ErrReasonNotMatch = "node(s) had taints that the pod didn't tolerate"
|
||||
)
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *TaintToleration) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *TaintToleration) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
if pl.enableSchedulingQueueHint {
|
||||
return []framework.ClusterEventWithHint{
|
||||
// When the QueueingHint feature is enabled, preCheck is eliminated and we don't need additional UpdateNodeLabel.
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
|
||||
// When the QueueingHint feature is enabled,
|
||||
// the scheduling queue uses Pod/Update Queueing Hint
|
||||
// to determine whether a Pod's update makes the Pod schedulable or not.
|
||||
// https://github.com/kubernetes/kubernetes/pull/122234
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
// A note about UpdateNodeLabel event:
|
||||
// Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
|
||||
// No need to register the Pod event; the update to the unschedulable Pods already triggers the scheduling retry when QHint is disabled.
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterNodeChange is invoked for all node events reported by
|
||||
// an informer. It checks whether that change made a previously unschedulable
|
||||
// pod schedulable.
|
||||
func (pl *TaintToleration) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
wasUntolerated := true
|
||||
if originalNode != nil {
|
||||
_, wasUntolerated = v1helper.FindMatchingUntoleratedTaint(originalNode.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
|
||||
}
|
||||
|
||||
_, isUntolerated := v1helper.FindMatchingUntoleratedTaint(modifiedNode.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
|
||||
|
||||
if wasUntolerated && !isUntolerated {
|
||||
logger.V(5).Info("node was created or updated, and this may make the Pod rejected by TaintToleration plugin in the previous scheduling cycle schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("node was created or updated, but it doesn't change the TaintToleration plugin's decision", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
func (pl *TaintToleration) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
node := nodeInfo.Node()
|
||||
|
||||
taint, isUntolerated := v1helper.FindMatchingUntoleratedTaint(node.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
|
||||
if !isUntolerated {
|
||||
return nil
|
||||
}
|
||||
|
||||
errReason := fmt.Sprintf("node(s) had untolerated taint {%s: %s}", taint.Key, taint.Value)
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, errReason)
|
||||
}
|
||||
|
||||
// preScoreState computed at PreScore and used at Score.
|
||||
type preScoreState struct {
|
||||
tolerationsPreferNoSchedule []v1.Toleration
|
||||
}
|
||||
|
||||
// Clone implements the mandatory Clone interface. We don't really copy the data since
|
||||
// there is no need for that.
|
||||
func (s *preScoreState) Clone() framework.StateData {
|
||||
return s
|
||||
}
|
||||
|
||||
// getAllTolerationPreferNoSchedule gets the list of all Tolerations with Effect PreferNoSchedule or with no effect.
|
||||
func getAllTolerationPreferNoSchedule(tolerations []v1.Toleration) (tolerationList []v1.Toleration) {
|
||||
for _, toleration := range tolerations {
|
||||
// Empty effect means all effects which includes PreferNoSchedule, so we need to collect it as well.
|
||||
if len(toleration.Effect) == 0 || toleration.Effect == v1.TaintEffectPreferNoSchedule {
|
||||
tolerationList = append(tolerationList, toleration)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// PreScore builds and writes cycle state used by Score and NormalizeScore.
|
||||
func (pl *TaintToleration) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
|
||||
if len(nodes) == 0 {
|
||||
return nil
|
||||
}
|
||||
tolerationsPreferNoSchedule := getAllTolerationPreferNoSchedule(pod.Spec.Tolerations)
|
||||
state := &preScoreState{
|
||||
tolerationsPreferNoSchedule: tolerationsPreferNoSchedule,
|
||||
}
|
||||
cycleState.Write(preScoreStateKey, state)
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
|
||||
c, err := cycleState.Read(preScoreStateKey)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read %q from cycleState: %w", preScoreStateKey, err)
|
||||
}
|
||||
|
||||
s, ok := c.(*preScoreState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%+v convert to tainttoleration.preScoreState error", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// countIntolerableTaintsPreferNoSchedule gives the count of intolerable taints of a pod with effect PreferNoSchedule.
|
||||
func countIntolerableTaintsPreferNoSchedule(taints []v1.Taint, tolerations []v1.Toleration) (intolerableTaints int) {
|
||||
for _, taint := range taints {
|
||||
// check only on taints that have effect PreferNoSchedule
|
||||
if taint.Effect != v1.TaintEffectPreferNoSchedule {
|
||||
continue
|
||||
}
|
||||
|
||||
if !v1helper.TolerationsTolerateTaint(tolerations, &taint) {
|
||||
intolerableTaints++
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Score invoked at the Score extension point.
|
||||
func (pl *TaintToleration) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
node := nodeInfo.Node()
|
||||
|
||||
s, err := getPreScoreState(state)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(err)
|
||||
}
|
||||
|
||||
score := int64(countIntolerableTaintsPreferNoSchedule(node.Spec.Taints, s.tolerationsPreferNoSchedule))
|
||||
return score, nil
|
||||
}
|
||||
|
||||
// NormalizeScore invoked after scoring all nodes.
|
||||
func (pl *TaintToleration) NormalizeScore(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
|
||||
return helper.DefaultNormalizeScore(framework.MaxNodeScore, true, scores)
|
||||
}
|
||||
|
||||
// ScoreExtensions of the Score plugin.
|
||||
func (pl *TaintToleration) ScoreExtensions() framework.ScoreExtensions {
|
||||
return pl
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, _ runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
return &TaintToleration{
|
||||
handle: h,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPodTolerationChange is invoked whenever a pod's toleration changed.
|
||||
func (pl *TaintToleration) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
if pod.UID == modifiedPod.UID {
|
||||
// The updated Pod is the unschedulable Pod.
|
||||
logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod))
|
||||
|
||||
return framework.QueueSkip, nil
|
||||
}
|
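Illustrative aside, not part of the vendored diff: a worked example of the PreferNoSchedule counting that Score performs, reproduced in plain Go rather than through the unexported countIntolerableTaintsPreferNoSchedule helper. The taint keys, values, and toleration are assumptions.

// Hedged sketch: counting intolerable PreferNoSchedule taints the way TaintToleration.Score does.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	v1helper "k8s.io/component-helpers/scheduling/corev1"
)

func main() {
	taints := []v1.Taint{
		{Key: "disk", Value: "slow", Effect: v1.TaintEffectPreferNoSchedule},
		{Key: "zone", Value: "edge", Effect: v1.TaintEffectPreferNoSchedule},
		{Key: "gpu", Value: "none", Effect: v1.TaintEffectNoSchedule}, // ignored by Score
	}
	tolerations := []v1.Toleration{
		{Key: "disk", Operator: v1.TolerationOpEqual, Value: "slow", Effect: v1.TaintEffectPreferNoSchedule},
	}

	intolerable := 0
	for i := range taints {
		// Only PreferNoSchedule taints contribute to the score.
		if taints[i].Effect != v1.TaintEffectPreferNoSchedule {
			continue
		}
		if !v1helper.TolerationsTolerateTaint(tolerations, &taints[i]) {
			intolerable++
		}
	}
	// Prints 1: only the "zone" taint is intolerable. NormalizeScore later inverts
	// these counts so nodes with fewer intolerable taints end up with higher scores.
	fmt.Println(intolerable)
}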
10
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/OWNERS
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
# See the OWNERS docs at https://go.k8s.io/owners
|
||||
|
||||
approvers:
|
||||
- sig-storage-approvers
|
||||
- cofyc
|
||||
reviewers:
|
||||
- sig-storage-reviewers
|
||||
- cofyc
|
||||
labels:
|
||||
- sig/storage
|
131
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/assume_cache.go
generated
vendored
Normal file
@ -0,0 +1,131 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package volumebinding
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
storagehelpers "k8s.io/component-helpers/storage/volume"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
|
||||
)
|
||||
|
||||
// PVAssumeCache is an AssumeCache for PersistentVolume objects
|
||||
type PVAssumeCache struct {
|
||||
*assumecache.AssumeCache
|
||||
logger klog.Logger
|
||||
}
|
||||
|
||||
func pvStorageClassIndexFunc(obj interface{}) ([]string, error) {
|
||||
if pv, ok := obj.(*v1.PersistentVolume); ok {
|
||||
return []string{storagehelpers.GetPersistentVolumeClass(pv)}, nil
|
||||
}
|
||||
return []string{""}, fmt.Errorf("object is not a v1.PersistentVolume: %v", obj)
|
||||
}
|
||||
|
||||
// NewPVAssumeCache creates a PV assume cache.
|
||||
func NewPVAssumeCache(logger klog.Logger, informer assumecache.Informer) *PVAssumeCache {
|
||||
logger = klog.LoggerWithName(logger, "PV Cache")
|
||||
return &PVAssumeCache{
|
||||
AssumeCache: assumecache.NewAssumeCache(logger, informer, "v1.PersistentVolume", "storageclass", pvStorageClassIndexFunc),
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *PVAssumeCache) GetPV(pvName string) (*v1.PersistentVolume, error) {
|
||||
obj, err := c.Get(pvName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pv, ok := obj.(*v1.PersistentVolume)
|
||||
if !ok {
|
||||
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}
|
||||
}
|
||||
return pv, nil
|
||||
}
|
||||
|
||||
func (c *PVAssumeCache) GetAPIPV(pvName string) (*v1.PersistentVolume, error) {
|
||||
obj, err := c.GetAPIObj(pvName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pv, ok := obj.(*v1.PersistentVolume)
|
||||
if !ok {
|
||||
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}
|
||||
}
|
||||
return pv, nil
|
||||
}
|
||||
|
||||
func (c *PVAssumeCache) ListPVs(storageClassName string) []*v1.PersistentVolume {
|
||||
objs := c.List(&v1.PersistentVolume{
|
||||
Spec: v1.PersistentVolumeSpec{
|
||||
StorageClassName: storageClassName,
|
||||
},
|
||||
})
|
||||
pvs := []*v1.PersistentVolume{}
|
||||
for _, obj := range objs {
|
||||
pv, ok := obj.(*v1.PersistentVolume)
|
||||
if !ok {
|
||||
c.logger.Error(&assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}, "ListPVs")
|
||||
continue
|
||||
}
|
||||
pvs = append(pvs, pv)
|
||||
}
|
||||
return pvs
|
||||
}
|
||||
|
||||
// PVCAssumeCache is an AssumeCache for PersistentVolumeClaim objects
|
||||
type PVCAssumeCache struct {
|
||||
*assumecache.AssumeCache
|
||||
logger klog.Logger
|
||||
}
|
||||
|
||||
// NewPVCAssumeCache creates a PVC assume cache.
|
||||
func NewPVCAssumeCache(logger klog.Logger, informer assumecache.Informer) *PVCAssumeCache {
|
||||
logger = klog.LoggerWithName(logger, "PVC Cache")
|
||||
return &PVCAssumeCache{
|
||||
AssumeCache: assumecache.NewAssumeCache(logger, informer, "v1.PersistentVolumeClaim", "", nil),
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *PVCAssumeCache) GetPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
|
||||
obj, err := c.Get(pvcKey)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pvc, ok := obj.(*v1.PersistentVolumeClaim)
|
||||
if !ok {
|
||||
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolumeClaim", Object: obj}
|
||||
}
|
||||
return pvc, nil
|
||||
}
|
||||
|
||||
func (c *PVCAssumeCache) GetAPIPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
|
||||
obj, err := c.GetAPIObj(pvcKey)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pvc, ok := obj.(*v1.PersistentVolumeClaim)
|
||||
if !ok {
|
||||
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolumeClaim", Object: obj}
|
||||
}
|
||||
return pvc, nil
|
||||
}
|
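Illustrative aside, not part of the vendored diff: a sketch of how a PVAssumeCache could be wired from a client-go shared informer factory and queried by storage class. The fake clientset and the "fast-ssd" class name are assumptions.

// Hedged sketch: building a PVAssumeCache from an informer and listing PVs by class.
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	kfake "k8s.io/client-go/kubernetes/fake"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding"
)

func listPVNames(client kubernetes.Interface, logger klog.Logger) []string {
	factory := informers.NewSharedInformerFactory(client, 30*time.Second)
	pvInformer := factory.Core().V1().PersistentVolumes().Informer()

	// The assume cache indexes PVs by storage class via pvStorageClassIndexFunc.
	cache := volumebinding.NewPVAssumeCache(logger, pvInformer)
	var names []string
	for _, pv := range cache.ListPVs("fast-ssd") {
		names = append(names, pv.Name)
	}
	return names
}

func main() {
	// With an empty fake clientset the cache is simply empty.
	fmt.Println(listPVNames(kfake.NewSimpleClientset(), klog.Background()))
}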
1100
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/binder.go
generated
vendored
Normal file
File diff suppressed because it is too large
75
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/fake_binder.go
generated
vendored
Normal file
@ -0,0 +1,75 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package volumebinding
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
// FakeVolumeBinderConfig holds configurations for fake volume binder.
|
||||
type FakeVolumeBinderConfig struct {
|
||||
AllBound bool
|
||||
FindReasons ConflictReasons
|
||||
FindErr error
|
||||
AssumeErr error
|
||||
BindErr error
|
||||
}
|
||||
|
||||
// NewFakeVolumeBinder sets up all the caches needed for the scheduler to make
|
||||
// topology-aware volume binding decisions.
|
||||
func NewFakeVolumeBinder(config *FakeVolumeBinderConfig) *FakeVolumeBinder {
|
||||
return &FakeVolumeBinder{
|
||||
config: config,
|
||||
}
|
||||
}
|
||||
|
||||
// FakeVolumeBinder represents a fake volume binder for testing.
|
||||
type FakeVolumeBinder struct {
|
||||
config *FakeVolumeBinderConfig
|
||||
AssumeCalled bool
|
||||
BindCalled bool
|
||||
}
|
||||
|
||||
var _ SchedulerVolumeBinder = &FakeVolumeBinder{}
|
||||
|
||||
// GetPodVolumeClaims implements SchedulerVolumeBinder.GetPodVolumeClaims.
|
||||
func (b *FakeVolumeBinder) GetPodVolumeClaims(_ klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error) {
|
||||
return &PodVolumeClaims{}, nil
|
||||
}
|
||||
|
||||
// FindPodVolumes implements SchedulerVolumeBinder.FindPodVolumes.
|
||||
func (b *FakeVolumeBinder) FindPodVolumes(_ klog.Logger, pod *v1.Pod, _ *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) {
|
||||
return nil, b.config.FindReasons, b.config.FindErr
|
||||
}
|
||||
|
||||
// AssumePodVolumes implements SchedulerVolumeBinder.AssumePodVolumes.
|
||||
func (b *FakeVolumeBinder) AssumePodVolumes(_ klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (bool, error) {
|
||||
b.AssumeCalled = true
|
||||
return b.config.AllBound, b.config.AssumeErr
|
||||
}
|
||||
|
||||
// RevertAssumedPodVolumes implements SchedulerVolumeBinder.RevertAssumedPodVolumes
|
||||
func (b *FakeVolumeBinder) RevertAssumedPodVolumes(_ *PodVolumes) {}
|
||||
|
||||
// BindPodVolumes implements SchedulerVolumeBinder.BindPodVolumes.
|
||||
func (b *FakeVolumeBinder) BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) error {
|
||||
b.BindCalled = true
|
||||
return b.config.BindErr
|
||||
}
|
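Illustrative aside, not part of the vendored diff: a sketch of how a test might drive FakeVolumeBinder to simulate a binding conflict. ErrReasonBindConflict is assumed to be exported from the (suppressed) binder.go in this package; the node name is an assumption.

// Hedged sketch: using FakeVolumeBinder to simulate a conflict without real PV/PVC objects.
package main

import (
	"fmt"

	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding"
)

func main() {
	binder := volumebinding.NewFakeVolumeBinder(&volumebinding.FakeVolumeBinderConfig{
		AllBound:    false,
		FindReasons: volumebinding.ConflictReasons{volumebinding.ErrReasonBindConflict},
	})

	// FindPodVolumes reports the configured conflict reasons.
	_, reasons, _ := binder.FindPodVolumes(klog.Background(), nil, nil, nil)
	// AssumePodVolumes flips the AssumeCalled flag and returns the configured AllBound value.
	allBound, _ := binder.AssumePodVolumes(klog.Background(), nil, "node-a", nil)

	fmt.Println(reasons, binder.AssumeCalled, allBound)
}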
55
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics/metrics.go
generated
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
/*
|
||||
Copyright 2018 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"k8s.io/component-base/metrics"
|
||||
"k8s.io/component-base/metrics/legacyregistry"
|
||||
)
|
||||
|
||||
// VolumeSchedulerSubsystem - subsystem name used by scheduler
|
||||
const VolumeSchedulerSubsystem = "scheduler_volume"
|
||||
|
||||
var (
|
||||
// VolumeBindingRequestSchedulerBinderCache tracks the number of volume binder cache operations.
|
||||
VolumeBindingRequestSchedulerBinderCache = metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Subsystem: VolumeSchedulerSubsystem,
|
||||
Name: "binder_cache_requests_total",
|
||||
Help: "Total number for request volume binding cache",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
[]string{"operation"},
|
||||
)
|
||||
// VolumeSchedulingStageFailed tracks the number of failed volume scheduling operations.
|
||||
VolumeSchedulingStageFailed = metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Subsystem: VolumeSchedulerSubsystem,
|
||||
Name: "scheduling_stage_error_total",
|
||||
Help: "Volume scheduling stage error count",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
[]string{"operation"},
|
||||
)
|
||||
)
|
||||
|
||||
// RegisterVolumeSchedulingMetrics is used by the scheduler, because the volume binding cache is a library
// used by the scheduler process.
|
||||
func RegisterVolumeSchedulingMetrics() {
|
||||
legacyregistry.MustRegister(VolumeBindingRequestSchedulerBinderCache)
|
||||
legacyregistry.MustRegister(VolumeSchedulingStageFailed)
|
||||
}
|
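Illustrative aside, not part of the vendored diff: registering these metrics once and bumping the counters. The "assume" and "bind" operation label values are assumptions for the example.

// Hedged sketch: registering and incrementing the volume scheduling counters.
package main

import "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"

func main() {
	// Register once with the legacy registry, as the scheduler process does.
	metrics.RegisterVolumeSchedulingMetrics()

	// Counters are labeled by operation; values here are assumed label names.
	metrics.VolumeBindingRequestSchedulerBinderCache.WithLabelValues("assume").Inc()
	metrics.VolumeSchedulingStageFailed.WithLabelValues("bind").Inc()
}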
54
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/scorer.go
generated
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package volumebinding
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
|
||||
)
|
||||
|
||||
// classResourceMap holds a map of storage class to resource.
|
||||
type classResourceMap map[string]*StorageResource
|
||||
|
||||
// volumeCapacityScorer calculates the score based on class storage resource information.
|
||||
type volumeCapacityScorer func(classResourceMap) int64
|
||||
|
||||
// buildScorerFunction builds volumeCapacityScorer from the scoring function shape.
|
||||
func buildScorerFunction(scoringFunctionShape helper.FunctionShape) volumeCapacityScorer {
|
||||
rawScoringFunction := helper.BuildBrokenLinearFunction(scoringFunctionShape)
|
||||
f := func(requested, capacity int64) int64 {
|
||||
if capacity == 0 || requested > capacity {
|
||||
return rawScoringFunction(maxUtilization)
|
||||
}
|
||||
|
||||
return rawScoringFunction(requested * maxUtilization / capacity)
|
||||
}
|
||||
return func(classResources classResourceMap) int64 {
|
||||
var nodeScore int64
|
||||
// in alpha stage, all classes have the same weight
|
||||
weightSum := len(classResources)
|
||||
if weightSum == 0 {
|
||||
return 0
|
||||
}
|
||||
for _, resource := range classResources {
|
||||
classScore := f(resource.Requested, resource.Capacity)
|
||||
nodeScore += classScore
|
||||
}
|
||||
return int64(math.Round(float64(nodeScore) / float64(weightSum)))
|
||||
}
|
||||
}
|
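Illustrative aside, not part of the vendored diff: the per-class arithmetic behind buildScorerFunction, reproduced standalone because the function is unexported. A simple linear shape (0% utilization -> 0, 100% -> 100) is assumed in place of helper.BuildBrokenLinearFunction, and the requested/capacity numbers are assumptions.

// Hedged sketch: how the volume capacity scorer averages per-class utilization scores.
package main

import (
	"fmt"
	"math"
)

func main() {
	const maxUtilization = 100
	// Linear stand-in for the broken-linear scoring function.
	raw := func(utilization int64) int64 { return utilization }

	classScore := func(requested, capacity int64) int64 {
		// Zero capacity or over-request is treated as maximum utilization.
		if capacity == 0 || requested > capacity {
			return raw(maxUtilization)
		}
		return raw(requested * maxUtilization / capacity)
	}

	// Two storage classes on one node: 50 of 100 requested, and 30 of 40.
	scores := []int64{classScore(50, 100), classScore(30, 40)}
	var sum int64
	for _, s := range scores {
		sum += s
	}
	// Node score is the rounded average across classes: (50 + 75) / 2 = 62.5 -> 63.
	fmt.Println(int64(math.Round(float64(sum) / float64(len(scores)))))
}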
217
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/test_utils.go
generated
vendored
Normal file
@ -0,0 +1,217 @@
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package volumebinding
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/component-helpers/storage/volume"
|
||||
"k8s.io/utils/ptr"
|
||||
)
|
||||
|
||||
type nodeBuilder struct {
|
||||
*v1.Node
|
||||
}
|
||||
|
||||
func makeNode(name string) nodeBuilder {
|
||||
return nodeBuilder{Node: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
Labels: map[string]string{
|
||||
v1.LabelHostname: name,
|
||||
},
|
||||
},
|
||||
}}
|
||||
}
|
||||
|
||||
func (nb nodeBuilder) withLabel(key, value string) nodeBuilder {
|
||||
if nb.Node.ObjectMeta.Labels == nil {
|
||||
nb.Node.ObjectMeta.Labels = map[string]string{}
|
||||
}
|
||||
nb.Node.ObjectMeta.Labels[key] = value
|
||||
return nb
|
||||
}
|
||||
|
||||
type pvBuilder struct {
|
||||
*v1.PersistentVolume
|
||||
}
|
||||
|
||||
func makePV(name, className string) pvBuilder {
|
||||
return pvBuilder{PersistentVolume: &v1.PersistentVolume{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
},
|
||||
Spec: v1.PersistentVolumeSpec{
|
||||
StorageClassName: className,
|
||||
},
|
||||
}}
|
||||
}
|
||||
|
||||
func (pvb pvBuilder) withNodeAffinity(keyValues map[string][]string) pvBuilder {
|
||||
matchExpressions := make([]v1.NodeSelectorRequirement, 0)
|
||||
for key, values := range keyValues {
|
||||
matchExpressions = append(matchExpressions, v1.NodeSelectorRequirement{
|
||||
Key: key,
|
||||
Operator: v1.NodeSelectorOpIn,
|
||||
Values: values,
|
||||
})
|
||||
}
|
||||
pvb.PersistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
|
||||
Required: &v1.NodeSelector{
|
||||
NodeSelectorTerms: []v1.NodeSelectorTerm{
|
||||
{
|
||||
MatchExpressions: matchExpressions,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
return pvb
|
||||
}
|
||||
|
||||
func (pvb pvBuilder) withVersion(version string) pvBuilder {
|
||||
pvb.PersistentVolume.ObjectMeta.ResourceVersion = version
|
||||
return pvb
|
||||
}
|
||||
|
||||
func (pvb pvBuilder) withCapacity(capacity resource.Quantity) pvBuilder {
|
||||
pvb.PersistentVolume.Spec.Capacity = v1.ResourceList{
|
||||
v1.ResourceName(v1.ResourceStorage): capacity,
|
||||
}
|
||||
return pvb
|
||||
}
|
||||
|
||||
func (pvb pvBuilder) withPhase(phase v1.PersistentVolumePhase) pvBuilder {
|
||||
pvb.PersistentVolume.Status = v1.PersistentVolumeStatus{
|
||||
Phase: phase,
|
||||
}
|
||||
return pvb
|
||||
}
|
||||
|
||||
type pvcBuilder struct {
|
||||
*v1.PersistentVolumeClaim
|
||||
}
|
||||
|
||||
func makePVC(name string, storageClassName string) pvcBuilder {
|
||||
return pvcBuilder{PersistentVolumeClaim: &v1.PersistentVolumeClaim{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
Namespace: v1.NamespaceDefault,
|
||||
},
|
||||
Spec: v1.PersistentVolumeClaimSpec{
|
||||
StorageClassName: ptr.To(storageClassName),
|
||||
},
|
||||
}}
|
||||
}
|
||||
|
||||
func (pvcb pvcBuilder) withBoundPV(pvName string) pvcBuilder {
|
||||
pvcb.PersistentVolumeClaim.Spec.VolumeName = pvName
|
||||
metav1.SetMetaDataAnnotation(&pvcb.PersistentVolumeClaim.ObjectMeta, volume.AnnBindCompleted, "true")
|
||||
return pvcb
|
||||
}
|
||||
|
||||
func (pvcb pvcBuilder) withRequestStorage(request resource.Quantity) pvcBuilder {
|
||||
pvcb.PersistentVolumeClaim.Spec.Resources = v1.VolumeResourceRequirements{
|
||||
Requests: v1.ResourceList{
|
||||
v1.ResourceName(v1.ResourceStorage): request,
|
||||
},
|
||||
}
|
||||
return pvcb
|
||||
}
|
||||
|
||||
func (pvcb pvcBuilder) withPhase(phase v1.PersistentVolumeClaimPhase) pvcBuilder {
|
||||
pvcb.PersistentVolumeClaim.Status = v1.PersistentVolumeClaimStatus{
|
||||
Phase: phase,
|
||||
}
|
||||
return pvcb
|
||||
}
|
||||
|
||||
type podBuilder struct {
|
||||
*v1.Pod
|
||||
}
|
||||
|
||||
func makePod(name string) podBuilder {
|
||||
pb := podBuilder{Pod: &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
Namespace: v1.NamespaceDefault,
|
||||
},
|
||||
}}
|
||||
pb.Pod.Spec.Volumes = make([]v1.Volume, 0)
|
||||
return pb
|
||||
}
|
||||
|
||||
func (pb podBuilder) withNodeName(name string) podBuilder {
|
||||
pb.Pod.Spec.NodeName = name
|
||||
return pb
|
||||
}
|
||||
|
||||
func (pb podBuilder) withNamespace(name string) podBuilder {
|
||||
pb.Pod.ObjectMeta.Namespace = name
|
||||
return pb
|
||||
}
|
||||
|
||||
func (pb podBuilder) withPVCVolume(pvcName, name string) podBuilder {
|
||||
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
|
||||
Name: name,
|
||||
VolumeSource: v1.VolumeSource{
|
||||
PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
|
||||
ClaimName: pvcName,
|
||||
},
|
||||
},
|
||||
})
|
||||
return pb
|
||||
}
|
||||
|
||||
func (pb podBuilder) withPVCSVolume(pvcs []*v1.PersistentVolumeClaim) podBuilder {
|
||||
for i, pvc := range pvcs {
|
||||
pb.withPVCVolume(pvc.Name, fmt.Sprintf("vol%v", i))
|
||||
}
|
||||
return pb
|
||||
}
|
||||
|
||||
func (pb podBuilder) withEmptyDirVolume() podBuilder {
|
||||
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
|
||||
VolumeSource: v1.VolumeSource{
|
||||
EmptyDir: &v1.EmptyDirVolumeSource{},
|
||||
},
|
||||
})
|
||||
return pb
|
||||
}
|
||||
|
||||
func (pb podBuilder) withGenericEphemeralVolume(name string) podBuilder {
|
||||
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
|
||||
Name: name,
|
||||
VolumeSource: v1.VolumeSource{
|
||||
Ephemeral: &v1.EphemeralVolumeSource{},
|
||||
},
|
||||
})
|
||||
return pb
|
||||
}
|
||||
|
||||
func (pb podBuilder) withCSI(driver string) podBuilder {
|
||||
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
|
||||
VolumeSource: v1.VolumeSource{
|
||||
CSI: &v1.CSIVolumeSource{
|
||||
Driver: driver,
|
||||
},
|
||||
},
|
||||
})
|
||||
return pb
|
||||
}
|
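Illustrative aside, not part of the vendored diff: a sketch of how a test file inside this package might chain the unexported builders above. The object names, namespace, and 10Gi request are assumptions.

// Hedged sketch: chaining the test builders in a package-internal test.
package volumebinding

import (
	"testing"

	"k8s.io/apimachinery/pkg/api/resource"
)

func TestBuildersSketch(t *testing.T) {
	// Build a PVC requesting 10Gi from an assumed "fast-ssd" class.
	pvc := makePVC("data", "fast-ssd").
		withRequestStorage(resource.MustParse("10Gi")).
		PersistentVolumeClaim

	// Build a pod in an assumed namespace that mounts the claim.
	pod := makePod("demo").
		withNamespace("apps").
		withPVCVolume(pvc.Name, "vol0").
		Pod

	if len(pod.Spec.Volumes) != 1 {
		t.Fatalf("expected one PVC volume, got %d", len(pod.Spec.Volumes))
	}
}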
602
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/volume_binding.go
generated
vendored
Normal file
@ -0,0 +1,602 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package volumebinding
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
storagev1 "k8s.io/api/storage/v1"
|
||||
apiequality "k8s.io/apimachinery/pkg/api/equality"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
"k8s.io/component-helpers/storage/ephemeral"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
const (
|
||||
stateKey framework.StateKey = Name
|
||||
|
||||
maxUtilization = 100
|
||||
)
|
||||
|
||||
// The state is initialized in the PreFilter phase. Because we save the pointer in
// framework.CycleState, later phases don't need to call the Write method
// to update the value.
|
||||
type stateData struct {
|
||||
allBound bool
|
||||
// podVolumesByNode holds the pod's volume information found in the Filter
|
||||
// phase for each node
|
||||
// it's initialized in the PreFilter phase
|
||||
podVolumesByNode map[string]*PodVolumes
|
||||
podVolumeClaims *PodVolumeClaims
|
||||
// hasStaticBindings declares whether the pod contains one or more StaticBinding.
|
||||
// If not, volumeBinding will skip the score extension point.
|
||||
hasStaticBindings bool
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
func (d *stateData) Clone() framework.StateData {
|
||||
return d
|
||||
}
|
||||
|
||||
// VolumeBinding is a plugin that binds pod volumes in scheduling.
|
||||
// In the Filter phase, pod binding cache is created for the pod and used in
|
||||
// Reserve and PreBind phases.
|
||||
type VolumeBinding struct {
|
||||
Binder SchedulerVolumeBinder
|
||||
PVCLister corelisters.PersistentVolumeClaimLister
|
||||
scorer volumeCapacityScorer
|
||||
fts feature.Features
|
||||
}
|
||||
|
||||
var _ framework.PreFilterPlugin = &VolumeBinding{}
|
||||
var _ framework.FilterPlugin = &VolumeBinding{}
|
||||
var _ framework.ReservePlugin = &VolumeBinding{}
|
||||
var _ framework.PreBindPlugin = &VolumeBinding{}
|
||||
var _ framework.PreScorePlugin = &VolumeBinding{}
|
||||
var _ framework.ScorePlugin = &VolumeBinding{}
|
||||
var _ framework.EnqueueExtensions = &VolumeBinding{}
|
||||
|
||||
// Name is the name of the plugin used in Registry and configurations.
|
||||
const Name = names.VolumeBinding
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *VolumeBinding) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *VolumeBinding) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
// Pods may fail to find available PVs because the node labels do not
|
||||
// match the storage class's allowed topologies or PV's node affinity.
|
||||
// A new or updated node may make pods schedulable.
|
||||
//
|
||||
// A note about UpdateNodeTaint event:
|
||||
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
|
||||
if pl.fts.EnableSchedulingQueueHint {
|
||||
// When scheduling queue hint is enabled, we don't use the problematic preCheck and don't need to register UpdateNodeTaint event.
|
||||
nodeActionType = framework.Add | framework.UpdateNodeLabel
|
||||
}
|
||||
events := []framework.ClusterEventWithHint{
|
||||
// Pods may fail because of missing or mis-configured storage class
|
||||
// (e.g., allowedTopologies, volumeBindingMode), and hence may become
|
||||
// schedulable upon StorageClass Add or Update events.
|
||||
{Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterStorageClassChange},
|
||||
|
||||
// We bind PVCs with PVs, so any changes may make the pods schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimChange},
|
||||
{Event: framework.ClusterEvent{Resource: framework.PersistentVolume, ActionType: framework.Add | framework.Update}},
|
||||
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
|
||||
|
||||
// We rely on CSI node to translate in-tree PV to CSI.
|
||||
// TODO: kube-scheduler will unregister the CSINode events once all the volume plugins have completed their CSI migration.
|
||||
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSINodeChange},
|
||||
|
||||
// When CSIStorageCapacity is enabled, pods may become schedulable
|
||||
// on CSI driver & storage capacity changes.
|
||||
{Event: framework.ClusterEvent{Resource: framework.CSIDriver, ActionType: framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSIDriverChange},
|
||||
{Event: framework.ClusterEvent{Resource: framework.CSIStorageCapacity, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSIStorageCapacityChange},
|
||||
}
|
||||
return events, nil
|
||||
}
|
||||
|
||||
func (pl *VolumeBinding) isSchedulableAfterCSINodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
if oldObj == nil {
|
||||
logger.V(5).Info("CSINode creation could make the pod schedulable")
|
||||
return framework.Queue, nil
|
||||
}
|
||||
oldCSINode, modifiedCSINode, err := util.As[*storagev1.CSINode](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
logger = klog.LoggerWithValues(
|
||||
logger,
|
||||
"Pod", klog.KObj(pod),
|
||||
"CSINode", klog.KObj(modifiedCSINode),
|
||||
)
|
||||
|
||||
if oldCSINode.ObjectMeta.Annotations[v1.MigratedPluginsAnnotationKey] != modifiedCSINode.ObjectMeta.Annotations[v1.MigratedPluginsAnnotationKey] {
|
||||
logger.V(5).Info("CSINode's migrated plugins annotation is updated and that may make the pod schedulable")
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("CISNode was created or updated but it doesn't make this pod schedulable")
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
func (pl *VolumeBinding) isSchedulableAfterPersistentVolumeClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, newPVC, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
logger = klog.LoggerWithValues(
|
||||
logger,
|
||||
"Pod", klog.KObj(pod),
|
||||
"PersistentVolumeClaim", klog.KObj(newPVC),
|
||||
)
|
||||
|
||||
if pod.Namespace != newPVC.Namespace {
|
||||
logger.V(5).Info("PersistentVolumeClaim was created or updated, but it doesn't make this pod schedulable because the PVC belongs to a different namespace")
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
for _, vol := range pod.Spec.Volumes {
|
||||
var pvcName string
|
||||
switch {
|
||||
case vol.PersistentVolumeClaim != nil:
|
||||
pvcName = vol.PersistentVolumeClaim.ClaimName
|
||||
case vol.Ephemeral != nil:
|
||||
pvcName = ephemeral.VolumeClaimName(pod, &vol)
|
||||
default:
|
||||
continue
|
||||
}
|
||||
|
||||
if pvcName == newPVC.Name {
|
||||
// Return Queue because, in this case,
|
||||
// all PVC creations and almost all PVC updates could make the Pod schedulable.
|
||||
logger.V(5).Info("PersistentVolumeClaim the pod requires was created or updated, potentially making the target Pod schedulable")
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
|
||||
logger.V(5).Info("PersistentVolumeClaim was created or updated, but it doesn't make this pod schedulable")
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterStorageClassChange checks whether a StorageClass event might make a Pod schedulable or not.
|
||||
// Any StorageClass addition and a StorageClass update to allowedTopologies
|
||||
// might make a Pod schedulable.
|
||||
// Note that an update to volume binding mode is not allowed and we don't have to consider it while examining the update event.
|
||||
func (pl *VolumeBinding) isSchedulableAfterStorageClassChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
oldSC, newSC, err := util.As[*storagev1.StorageClass](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
logger = klog.LoggerWithValues(
|
||||
logger,
|
||||
"Pod", klog.KObj(pod),
|
||||
"StorageClass", klog.KObj(newSC),
|
||||
)
|
||||
|
||||
if oldSC == nil {
|
||||
// No further filtering can be made for a creation event,
|
||||
// and we just always return Queue.
|
||||
logger.V(5).Info("A new StorageClass was created, which could make a Pod schedulable")
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
if !apiequality.Semantic.DeepEqual(newSC.AllowedTopologies, oldSC.AllowedTopologies) {
|
||||
logger.V(5).Info("StorageClass got an update in AllowedTopologies", "AllowedTopologies", newSC.AllowedTopologies)
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("StorageClass was updated, but it doesn't make this pod schedulable")
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterCSIStorageCapacityChange checks whether a CSIStorageCapacity event
|
||||
// might make a Pod schedulable or not.
|
||||
// Any CSIStorageCapacity addition and a CSIStorageCapacity update to volume limit
|
||||
// (calculated based on capacity and maximumVolumeSize) might make a Pod schedulable.
|
||||
// Note that an update to nodeTopology and storageClassName is not allowed and
|
||||
// we don't have to consider them while examining the update event.
|
||||
func (pl *VolumeBinding) isSchedulableAfterCSIStorageCapacityChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
oldCap, newCap, err := util.As[*storagev1.CSIStorageCapacity](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
if oldCap == nil {
|
||||
logger.V(5).Info(
|
||||
"A new CSIStorageCapacity was created, which could make a Pod schedulable",
|
||||
"Pod", klog.KObj(pod),
|
||||
"CSIStorageCapacity", klog.KObj(newCap),
|
||||
)
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
oldLimit := volumeLimit(oldCap)
|
||||
newLimit := volumeLimit(newCap)
|
||||
|
||||
logger = klog.LoggerWithValues(
|
||||
logger,
|
||||
"Pod", klog.KObj(pod),
|
||||
"CSIStorageCapacity", klog.KObj(newCap),
|
||||
"volumeLimit(new)", newLimit,
|
||||
"volumeLimit(old)", oldLimit,
|
||||
)
|
||||
|
||||
if newLimit != nil && (oldLimit == nil || newLimit.Value() > oldLimit.Value()) {
|
||||
logger.V(5).Info("VolumeLimit was increased, which could make a Pod schedulable")
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("CSIStorageCapacity was updated, but it doesn't make this pod schedulable")
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
func (pl *VolumeBinding) isSchedulableAfterCSIDriverChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalCSIDriver, modifiedCSIDriver, err := util.As[*storagev1.CSIDriver](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
logger = klog.LoggerWithValues(
|
||||
logger,
|
||||
"Pod", klog.KObj(pod),
|
||||
"CSIDriver", klog.KObj(modifiedCSIDriver),
|
||||
)
|
||||
|
||||
for _, vol := range pod.Spec.Volumes {
|
||||
if vol.CSI == nil || vol.CSI.Driver != modifiedCSIDriver.Name {
|
||||
continue
|
||||
}
|
||||
if (originalCSIDriver.Spec.StorageCapacity != nil && *originalCSIDriver.Spec.StorageCapacity) &&
|
||||
(modifiedCSIDriver.Spec.StorageCapacity == nil || !*modifiedCSIDriver.Spec.StorageCapacity) {
|
||||
logger.V(5).Info("CSIDriver was updated and storage capacity got disabled, which may make the pod schedulable")
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
|
||||
logger.V(5).Info("CSIDriver was created or updated but it doesn't make this pod schedulable")
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// podHasPVCs returns 2 values:
|
||||
// - the first one to denote if the given "pod" has any PVC defined.
|
||||
// - the second one to return any error if the requested PVC is illegal.
|
||||
func (pl *VolumeBinding) podHasPVCs(pod *v1.Pod) (bool, error) {
|
||||
hasPVC := false
|
||||
for _, vol := range pod.Spec.Volumes {
|
||||
var pvcName string
|
||||
isEphemeral := false
|
||||
switch {
|
||||
case vol.PersistentVolumeClaim != nil:
|
||||
pvcName = vol.PersistentVolumeClaim.ClaimName
|
||||
case vol.Ephemeral != nil:
|
||||
pvcName = ephemeral.VolumeClaimName(pod, &vol)
|
||||
isEphemeral = true
|
||||
default:
|
||||
// Volume is not using a PVC, ignore
|
||||
continue
|
||||
}
|
||||
hasPVC = true
|
||||
pvc, err := pl.PVCLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
|
||||
if err != nil {
|
||||
// The error usually has already enough context ("persistentvolumeclaim "myclaim" not found"),
|
||||
// but we can do better for generic ephemeral inline volumes where that situation
|
||||
// is normal directly after creating a pod.
|
||||
if isEphemeral && apierrors.IsNotFound(err) {
|
||||
err = fmt.Errorf("waiting for ephemeral volume controller to create the persistentvolumeclaim %q", pvcName)
|
||||
}
|
||||
return hasPVC, err
|
||||
}
|
||||
|
||||
if pvc.Status.Phase == v1.ClaimLost {
|
||||
return hasPVC, fmt.Errorf("persistentvolumeclaim %q bound to non-existent persistentvolume %q", pvc.Name, pvc.Spec.VolumeName)
|
||||
}
|
||||
|
||||
if pvc.DeletionTimestamp != nil {
|
||||
return hasPVC, fmt.Errorf("persistentvolumeclaim %q is being deleted", pvc.Name)
|
||||
}
|
||||
|
||||
if isEphemeral {
|
||||
if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
|
||||
return hasPVC, err
|
||||
}
|
||||
}
|
||||
}
|
||||
return hasPVC, nil
|
||||
}
|
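Illustrative aside, not part of the vendored diff: how the PVC name is resolved for the two volume kinds podHasPVCs inspects, using ephemeral.VolumeClaimName for generic ephemeral volumes. The pod and volume names are assumptions.

// Hedged sketch: resolving claim names for PVC-backed and generic ephemeral volumes.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/component-helpers/storage/ephemeral"
)

func main() {
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "apps"},
		Spec: v1.PodSpec{Volumes: []v1.Volume{
			{Name: "data", VolumeSource: v1.VolumeSource{
				PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ClaimName: "data-claim"},
			}},
			{Name: "scratch", VolumeSource: v1.VolumeSource{
				Ephemeral: &v1.EphemeralVolumeSource{},
			}},
		}},
	}
	for i := range pod.Spec.Volumes {
		vol := &pod.Spec.Volumes[i]
		switch {
		case vol.PersistentVolumeClaim != nil:
			fmt.Println(vol.PersistentVolumeClaim.ClaimName) // "data-claim"
		case vol.Ephemeral != nil:
			// Generic ephemeral volumes derive the claim name from pod and volume names.
			fmt.Println(ephemeral.VolumeClaimName(pod, vol)) // "demo-scratch"
		}
	}
}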
||||
|
||||
// PreFilter invoked at the prefilter extension point to check if pod has all
|
||||
// immediate PVCs bound. If not all immediate PVCs are bound, an
|
||||
// UnschedulableAndUnresolvable is returned.
|
||||
func (pl *VolumeBinding) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
logger := klog.FromContext(ctx)
|
||||
// If pod does not reference any PVC, we don't need to do anything.
|
||||
if hasPVC, err := pl.podHasPVCs(pod); err != nil {
|
||||
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
|
||||
} else if !hasPVC {
|
||||
state.Write(stateKey, &stateData{})
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
podVolumeClaims, err := pl.Binder.GetPodVolumeClaims(logger, pod)
|
||||
if err != nil {
|
||||
return nil, framework.AsStatus(err)
|
||||
}
|
||||
if len(podVolumeClaims.unboundClaimsImmediate) > 0 {
|
||||
// Return UnschedulableAndUnresolvable error if immediate claims are
|
||||
// not bound. Pod will be moved to active/backoff queues once these
|
||||
// claims are bound by PV controller.
|
||||
status := framework.NewStatus(framework.UnschedulableAndUnresolvable)
|
||||
status.AppendReason("pod has unbound immediate PersistentVolumeClaims")
|
||||
return nil, status
|
||||
}
|
||||
state.Write(stateKey, &stateData{
|
||||
podVolumesByNode: make(map[string]*PodVolumes),
|
||||
podVolumeClaims: &PodVolumeClaims{
|
||||
boundClaims: podVolumeClaims.boundClaims,
|
||||
unboundClaimsDelayBinding: podVolumeClaims.unboundClaimsDelayBinding,
|
||||
unboundVolumesDelayBinding: podVolumeClaims.unboundVolumesDelayBinding,
|
||||
},
|
||||
})
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions returns prefilter extensions, pod add and remove.
|
||||
func (pl *VolumeBinding) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
func getStateData(cs *framework.CycleState) (*stateData, error) {
|
||||
state, err := cs.Read(stateKey)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s, ok := state.(*stateData)
|
||||
if !ok {
|
||||
return nil, errors.New("unable to convert state into stateData")
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
// It evaluates if a pod can fit due to the volumes it requests,
|
||||
// for both bound and unbound PVCs.
|
||||
//
|
||||
// For PVCs that are bound, then it checks that the corresponding PV's node affinity is
|
||||
// satisfied by the given node.
|
||||
//
|
||||
// For PVCs that are unbound, it tries to find available PVs that can satisfy the PVC requirements
|
||||
// and that the PV node affinity is satisfied by the given node.
|
||||
//
|
||||
// If storage capacity tracking is enabled, then enough space has to be available
|
||||
// for the node and volumes that still need to be created.
|
||||
//
|
||||
// The predicate returns true if all bound PVCs have compatible PVs with the node, and if all unbound
|
||||
// PVCs can be matched with an available and node-compatible PV.
|
||||
func (pl *VolumeBinding) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
logger := klog.FromContext(ctx)
|
||||
node := nodeInfo.Node()
|
||||
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
podVolumes, reasons, err := pl.Binder.FindPodVolumes(logger, pod, state.podVolumeClaims, node)
|
||||
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
|
||||
if len(reasons) > 0 {
|
||||
status := framework.NewStatus(framework.UnschedulableAndUnresolvable)
|
||||
for _, reason := range reasons {
|
||||
status.AppendReason(string(reason))
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
// multiple goroutines call `Filter` on different nodes simultaneously and the `CycleState` may be duplicated, so we must use a local lock here
|
||||
state.Lock()
|
||||
state.podVolumesByNode[node.Name] = podVolumes
|
||||
state.hasStaticBindings = state.hasStaticBindings || (podVolumes != nil && len(podVolumes.StaticBindings) > 0)
|
||||
state.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
// PreScore invoked at the preScore extension point. It checks whether volumeBinding can skip Score
|
||||
func (pl *VolumeBinding) PreScore(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
|
||||
if pl.scorer == nil {
|
||||
return framework.NewStatus(framework.Skip)
|
||||
}
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
if state.hasStaticBindings {
|
||||
return nil
|
||||
}
|
||||
return framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
// Score invoked at the score extension point.
|
||||
func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
if pl.scorer == nil {
|
||||
return 0, nil
|
||||
}
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(err)
|
||||
}
|
||||
podVolumes, ok := state.podVolumesByNode[nodeName]
|
||||
if !ok {
|
||||
return 0, nil
|
||||
}
|
||||
// group by storage class
|
||||
classResources := make(classResourceMap)
|
||||
for _, staticBinding := range podVolumes.StaticBindings {
|
||||
class := staticBinding.StorageClassName()
|
||||
storageResource := staticBinding.StorageResource()
|
||||
if _, ok := classResources[class]; !ok {
|
||||
classResources[class] = &StorageResource{
|
||||
Requested: 0,
|
||||
Capacity: 0,
|
||||
}
|
||||
}
|
||||
classResources[class].Requested += storageResource.Requested
|
||||
classResources[class].Capacity += storageResource.Capacity
|
||||
}
|
||||
return pl.scorer(classResources), nil
|
||||
}
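
// Editor's sketch (not part of the upstream file): the per-StorageClass aggregation that
// Score performs above, reduced to plain maps and integers so the shape of the data is
// easy to see. The class name and byte counts are hypothetical.
func exampleAggregateByClass() map[string][2]int64 {
	// Each entry is [requested, capacity] for one storage class.
	totals := map[string][2]int64{}
	add := func(class string, requested, capacity int64) {
		t := totals[class]
		t[0] += requested
		t[1] += capacity
		totals[class] = t
	}
	// Two hypothetical static bindings against the same class are summed together,
	// just as Score sums StorageResource values per class before calling the scorer.
	add("hypothetical-fast-ssd", 10<<30, 100<<30)
	add("hypothetical-fast-ssd", 20<<30, 100<<30)
	return totals
}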
|
||||
|
||||
// ScoreExtensions of the Score plugin.
|
||||
func (pl *VolumeBinding) ScoreExtensions() framework.ScoreExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Reserve reserves volumes of pod and saves binding status in cycle state.
|
||||
func (pl *VolumeBinding) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
// we don't need to hold the lock as only one node will be reserved for the given pod
|
||||
podVolumes, ok := state.podVolumesByNode[nodeName]
|
||||
if ok {
|
||||
allBound, err := pl.Binder.AssumePodVolumes(klog.FromContext(ctx), pod, nodeName, podVolumes)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
state.allBound = allBound
|
||||
} else {
|
||||
// may not exist if the pod does not reference any PVC
|
||||
state.allBound = true
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// PreBind will make the API update with the assumed bindings and wait until
|
||||
// the PV controller has completely finished the binding operation.
|
||||
//
|
||||
// If binding errors, times out or gets undone, then an error will be returned to
|
||||
// retry scheduling.
|
||||
func (pl *VolumeBinding) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
|
||||
s, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
if s.allBound {
|
||||
// no need to bind volumes
|
||||
return nil
|
||||
}
|
||||
// we don't need to hold the lock as only one node will be pre-bound for the given pod
|
||||
podVolumes, ok := s.podVolumesByNode[nodeName]
|
||||
if !ok {
|
||||
return framework.AsStatus(fmt.Errorf("no pod volumes found for node %q", nodeName))
|
||||
}
|
||||
logger := klog.FromContext(ctx)
|
||||
logger.V(5).Info("Trying to bind volumes for pod", "pod", klog.KObj(pod))
|
||||
err = pl.Binder.BindPodVolumes(ctx, pod, podVolumes)
|
||||
if err != nil {
|
||||
logger.V(5).Info("Failed to bind volumes for pod", "pod", klog.KObj(pod), "err", err)
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
logger.V(5).Info("Success binding volumes for pod", "pod", klog.KObj(pod))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Unreserve clears assumed PV and PVC cache.
|
||||
// It's idempotent, and does nothing if no cache found for the given pod.
|
||||
func (pl *VolumeBinding) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
|
||||
s, err := getStateData(cs)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
// we don't need to hold the lock as only one node may be unreserved
|
||||
podVolumes, ok := s.podVolumesByNode[nodeName]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
pl.Binder.RevertAssumedPodVolumes(podVolumes)
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
args, ok := plArgs.(*config.VolumeBindingArgs)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("want args to be of type VolumeBindingArgs, got %T", plArgs)
|
||||
}
|
||||
if err := validation.ValidateVolumeBindingArgsWithOptions(nil, args, validation.VolumeBindingArgsValidationOptions{
|
||||
AllowVolumeCapacityPriority: fts.EnableVolumeCapacityPriority,
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
podInformer := fh.SharedInformerFactory().Core().V1().Pods()
|
||||
nodeInformer := fh.SharedInformerFactory().Core().V1().Nodes()
|
||||
pvcInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumeClaims()
|
||||
pvInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumes()
|
||||
storageClassInformer := fh.SharedInformerFactory().Storage().V1().StorageClasses()
|
||||
csiNodeInformer := fh.SharedInformerFactory().Storage().V1().CSINodes()
|
||||
capacityCheck := CapacityCheck{
|
||||
CSIDriverInformer: fh.SharedInformerFactory().Storage().V1().CSIDrivers(),
|
||||
CSIStorageCapacityInformer: fh.SharedInformerFactory().Storage().V1().CSIStorageCapacities(),
|
||||
}
|
||||
binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second)
|
||||
|
||||
// build score function
|
||||
var scorer volumeCapacityScorer
|
||||
if fts.EnableVolumeCapacityPriority {
|
||||
shape := make(helper.FunctionShape, 0, len(args.Shape))
|
||||
for _, point := range args.Shape {
|
||||
shape = append(shape, helper.FunctionShapePoint{
|
||||
Utilization: int64(point.Utilization),
|
||||
Score: int64(point.Score) * (framework.MaxNodeScore / config.MaxCustomPriorityScore),
|
||||
})
|
||||
}
|
||||
scorer = buildScorerFunction(shape)
|
||||
}
|
||||
return &VolumeBinding{
|
||||
Binder: binder,
|
||||
PVCLister: pvcInformer.Lister(),
|
||||
scorer: scorer,
|
||||
fts: fts,
|
||||
}, nil
|
||||
}
|
10
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions/OWNERS
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
# See the OWNERS docs at https://go.k8s.io/owners
|
||||
|
||||
approvers:
|
||||
- sig-storage-approvers
|
||||
- cofyc
|
||||
reviewers:
|
||||
- sig-storage-reviewers
|
||||
- cofyc
|
||||
labels:
|
||||
- sig/storage
|
426
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions/volume_restrictions.go
generated
vendored
Normal file
@ -0,0 +1,426 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package volumerestrictions
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
"k8s.io/klog/v2"
|
||||
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// VolumeRestrictions is a plugin that checks volume restrictions.
|
||||
type VolumeRestrictions struct {
|
||||
pvcLister corelisters.PersistentVolumeClaimLister
|
||||
sharedLister framework.SharedLister
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.PreFilterPlugin = &VolumeRestrictions{}
|
||||
var _ framework.FilterPlugin = &VolumeRestrictions{}
|
||||
var _ framework.EnqueueExtensions = &VolumeRestrictions{}
|
||||
var _ framework.StateData = &preFilterState{}
|
||||
|
||||
const (
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
Name = names.VolumeRestrictions
|
||||
// preFilterStateKey is the key in CycleState to VolumeRestrictions pre-computed data for Filtering.
|
||||
// Using the name of the plugin will likely help us avoid collisions with other plugins.
|
||||
preFilterStateKey = "PreFilter" + Name
|
||||
|
||||
// ErrReasonDiskConflict is used for NoDiskConflict predicate error.
|
||||
ErrReasonDiskConflict = "node(s) had no available disk"
|
||||
// ErrReasonReadWriteOncePodConflict is used when a pod is found using the same PVC with the ReadWriteOncePod access mode.
|
||||
ErrReasonReadWriteOncePodConflict = "node has pod using PersistentVolumeClaim with the same name and ReadWriteOncePod access mode"
|
||||
)
|
||||
|
||||
// preFilterState computed at PreFilter and used at Filter.
|
||||
type preFilterState struct {
|
||||
// Names of the pod's volumes using the ReadWriteOncePod access mode.
|
||||
readWriteOncePodPVCs sets.Set[string]
|
||||
// The number of references to these ReadWriteOncePod volumes by scheduled pods.
|
||||
conflictingPVCRefCount int
|
||||
}
|
||||
|
||||
func (s *preFilterState) updateWithPod(podInfo *framework.PodInfo, multiplier int) {
|
||||
s.conflictingPVCRefCount += multiplier * s.conflictingPVCRefCountForPod(podInfo)
|
||||
}
|
||||
|
||||
func (s *preFilterState) conflictingPVCRefCountForPod(podInfo *framework.PodInfo) int {
|
||||
conflicts := 0
|
||||
for _, volume := range podInfo.Pod.Spec.Volumes {
|
||||
if volume.PersistentVolumeClaim == nil {
|
||||
continue
|
||||
}
|
||||
if s.readWriteOncePodPVCs.Has(volume.PersistentVolumeClaim.ClaimName) {
|
||||
conflicts += 1
|
||||
}
|
||||
}
|
||||
return conflicts
|
||||
}
|
||||
|
||||
// Clone the prefilter state.
|
||||
func (s *preFilterState) Clone() framework.StateData {
|
||||
if s == nil {
|
||||
return nil
|
||||
}
|
||||
return &preFilterState{
|
||||
readWriteOncePodPVCs: s.readWriteOncePodPVCs,
|
||||
conflictingPVCRefCount: s.conflictingPVCRefCount,
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *VolumeRestrictions) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
func isVolumeConflict(volume *v1.Volume, pod *v1.Pod) bool {
|
||||
for _, existingVolume := range pod.Spec.Volumes {
|
||||
// Same GCE disk mounted by multiple pods conflicts unless all pods mount it read-only.
|
||||
if volume.GCEPersistentDisk != nil && existingVolume.GCEPersistentDisk != nil {
|
||||
disk, existingDisk := volume.GCEPersistentDisk, existingVolume.GCEPersistentDisk
|
||||
if disk.PDName == existingDisk.PDName && !(disk.ReadOnly && existingDisk.ReadOnly) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
if volume.AWSElasticBlockStore != nil && existingVolume.AWSElasticBlockStore != nil {
|
||||
if volume.AWSElasticBlockStore.VolumeID == existingVolume.AWSElasticBlockStore.VolumeID {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
if volume.ISCSI != nil && existingVolume.ISCSI != nil {
|
||||
iqn := volume.ISCSI.IQN
|
||||
eiqn := existingVolume.ISCSI.IQN
|
||||
// Two iSCSI volumes are the same if they share the same IQN. As iSCSI volumes are of type
// RWO or ROX, only one read-write mount is permitted. The same iSCSI volume mounted by multiple pods
// conflicts unless all pods mount it read-only.
|
||||
if iqn == eiqn && !(volume.ISCSI.ReadOnly && existingVolume.ISCSI.ReadOnly) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
if volume.RBD != nil && existingVolume.RBD != nil {
|
||||
mon, pool, image := volume.RBD.CephMonitors, volume.RBD.RBDPool, volume.RBD.RBDImage
|
||||
emon, epool, eimage := existingVolume.RBD.CephMonitors, existingVolume.RBD.RBDPool, existingVolume.RBD.RBDImage
|
||||
// Two RBD images are the same if they share at least one Ceph monitor, are in the same RADOS pool, and have the same image name.
// Only one read-write mount is permitted for the same RBD image.
// The same RBD image mounted by multiple pods conflicts unless all pods mount the image read-only.
|
||||
if haveOverlap(mon, emon) && pool == epool && image == eimage && !(volume.RBD.ReadOnly && existingVolume.RBD.ReadOnly) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// haveOverlap searches two arrays and returns true if they have at least one common element; returns false otherwise.
|
||||
func haveOverlap(a1, a2 []string) bool {
|
||||
if len(a1) > len(a2) {
|
||||
a1, a2 = a2, a1
|
||||
}
|
||||
m := sets.New(a1...)
|
||||
for _, val := range a2 {
|
||||
if _, ok := m[val]; ok {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
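
// Editor's sketch (not part of the upstream file): how the overlap check above behaves
// for the RBD monitor lists it is used with. The monitor addresses are hypothetical.
func exampleMonitorOverlap() bool {
	existing := []string{"10.0.0.1:6789", "10.0.0.2:6789"}
	incoming := []string{"10.0.0.2:6789", "10.0.0.3:6789"}
	// They share "10.0.0.2:6789", so haveOverlap reports true and the two RBD volumes
	// are treated as referring to the same Ceph cluster for conflict purposes.
	return haveOverlap(existing, incoming)
}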
|
||||
|
||||
// needsRestrictionsCheck returns true if the volume is of a type that requires a conflict check.
|
||||
func needsRestrictionsCheck(v v1.Volume) bool {
|
||||
return v.GCEPersistentDisk != nil || v.AWSElasticBlockStore != nil || v.RBD != nil || v.ISCSI != nil
|
||||
}
|
||||
|
||||
// PreFilter computes and stores cycleState containing details for enforcing ReadWriteOncePod.
|
||||
func (pl *VolumeRestrictions) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
needsCheck := false
|
||||
for i := range pod.Spec.Volumes {
|
||||
if needsRestrictionsCheck(pod.Spec.Volumes[i]) {
|
||||
needsCheck = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
pvcs, err := pl.readWriteOncePodPVCsForPod(ctx, pod)
|
||||
if err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
|
||||
}
|
||||
return nil, framework.AsStatus(err)
|
||||
}
|
||||
|
||||
s, err := pl.calPreFilterState(ctx, pod, pvcs)
|
||||
if err != nil {
|
||||
return nil, framework.AsStatus(err)
|
||||
}
|
||||
|
||||
if !needsCheck && s.conflictingPVCRefCount == 0 {
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
cycleState.Write(preFilterStateKey, s)
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// AddPod from pre-computed data in cycleState.
|
||||
func (pl *VolumeRestrictions) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
state, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
state.updateWithPod(podInfoToAdd, 1)
|
||||
return nil
|
||||
}
|
||||
|
||||
// RemovePod from pre-computed data in cycleState.
|
||||
func (pl *VolumeRestrictions) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
state, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
state.updateWithPod(podInfoToRemove, -1)
|
||||
return nil
|
||||
}
|
||||
|
||||
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
|
||||
c, err := cycleState.Read(preFilterStateKey)
|
||||
if err != nil {
|
||||
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
|
||||
return nil, fmt.Errorf("cannot read %q from cycleState", preFilterStateKey)
|
||||
}
|
||||
|
||||
s, ok := c.(*preFilterState)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%+v convert to volumerestrictions.state error", c)
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// calPreFilterState computes preFilterState describing which PVCs use ReadWriteOncePod
|
||||
// and which pods in the cluster are in conflict.
|
||||
func (pl *VolumeRestrictions) calPreFilterState(ctx context.Context, pod *v1.Pod, pvcs sets.Set[string]) (*preFilterState, error) {
|
||||
conflictingPVCRefCount := 0
|
||||
for pvc := range pvcs {
|
||||
key := framework.GetNamespacedName(pod.Namespace, pvc)
|
||||
if pl.sharedLister.StorageInfos().IsPVCUsedByPods(key) {
|
||||
// There can only be at most one pod using the ReadWriteOncePod PVC.
|
||||
conflictingPVCRefCount += 1
|
||||
}
|
||||
}
|
||||
return &preFilterState{
|
||||
readWriteOncePodPVCs: pvcs,
|
||||
conflictingPVCRefCount: conflictingPVCRefCount,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (pl *VolumeRestrictions) readWriteOncePodPVCsForPod(ctx context.Context, pod *v1.Pod) (sets.Set[string], error) {
|
||||
pvcs := sets.New[string]()
|
||||
for _, volume := range pod.Spec.Volumes {
|
||||
if volume.PersistentVolumeClaim == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(volume.PersistentVolumeClaim.ClaimName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !v1helper.ContainsAccessMode(pvc.Spec.AccessModes, v1.ReadWriteOncePod) {
|
||||
continue
|
||||
}
|
||||
pvcs.Insert(pvc.Name)
|
||||
}
|
||||
return pvcs, nil
|
||||
}
|
||||
|
||||
// Checks if scheduling the pod onto this node would cause any conflicts with
|
||||
// existing volumes.
|
||||
func satisfyVolumeConflicts(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
|
||||
for i := range pod.Spec.Volumes {
|
||||
v := pod.Spec.Volumes[i]
|
||||
if !needsRestrictionsCheck(v) {
|
||||
continue
|
||||
}
|
||||
for _, ev := range nodeInfo.Pods {
|
||||
if isVolumeConflict(&v, ev.Pod) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Checks if scheduling the pod would cause any ReadWriteOncePod PVC access mode conflicts.
|
||||
func satisfyReadWriteOncePod(ctx context.Context, state *preFilterState) *framework.Status {
|
||||
if state == nil {
|
||||
return nil
|
||||
}
|
||||
if state.conflictingPVCRefCount > 0 {
|
||||
return framework.NewStatus(framework.Unschedulable, ErrReasonReadWriteOncePodConflict)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions returns prefilter extensions, pod add and remove.
|
||||
func (pl *VolumeRestrictions) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return pl
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
// It evaluates if a pod can fit due to the volumes it requests, and those that
|
||||
// are already mounted. If there is already a volume mounted on that node, another pod that uses the same volume
|
||||
// can't be scheduled there.
|
||||
// This is GCE, Amazon EBS, iSCSI and Ceph RBD specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods mounting the same volume ID
// - Ceph RBD forbids two pods sharing at least one monitor with matching pool and image, unless both mount the image read-only
// - iSCSI forbids two pods sharing the same IQN, unless both mount the volume read-only
|
||||
// If the pod uses PVCs with the ReadWriteOncePod access mode, it evaluates if
|
||||
// these PVCs are already in-use and if preemption will help.
|
||||
func (pl *VolumeRestrictions) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
if !satisfyVolumeConflicts(pod, nodeInfo) {
|
||||
return framework.NewStatus(framework.Unschedulable, ErrReasonDiskConflict)
|
||||
}
|
||||
state, err := getPreFilterState(cycleState)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
return satisfyReadWriteOncePod(ctx, state)
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *VolumeRestrictions) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
// A note about UpdateNodeTaint/UpdateNodeLabel event:
|
||||
// Ideally, it's supposed to register only Add because any Node update event will never change the result from this plugin.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
|
||||
if pl.enableSchedulingQueueHint {
|
||||
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
|
||||
nodeActionType = framework.Add
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
// Pods may fail to schedule because of volumes conflicting with other pods on same node.
|
||||
// Once running pods are deleted and volumes have been released, the unschedulable pod will be schedulable.
|
||||
// Due to immutable fields `spec.volumes`, pod update events are ignored.
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
|
||||
// A new Node may make a pod schedulable.
|
||||
// We intentionally don't set QueueingHint since all Node/Add events could make Pods schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
|
||||
// Pods may fail to schedule because the PVC it uses has not yet been created.
|
||||
// This PVC is required to exist to check its access modes.
|
||||
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add},
|
||||
QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimAdded},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPersistentVolumeClaimAdded is invoked whenever a PersistentVolumeClaim is added or changed. It checks whether
// that change made a previously unschedulable pod schedulable.
|
||||
func (pl *VolumeRestrictions) isSchedulableAfterPersistentVolumeClaimAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, newPersistentVolumeClaim, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeClaimChange: %w", err)
|
||||
}
|
||||
|
||||
if newPersistentVolumeClaim.Namespace != pod.Namespace {
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
for _, volume := range pod.Spec.Volumes {
|
||||
if volume.PersistentVolumeClaim == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if volume.PersistentVolumeClaim.ClaimName == newPersistentVolumeClaim.Name {
|
||||
logger.V(5).Info("PVC that is referred from the pod was created, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(newPersistentVolumeClaim))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
logger.V(5).Info("PVC irrelevant to the Pod was created, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(newPersistentVolumeClaim))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPodDeleted is invoked whenever a pod is deleted.
// It checks whether the deleted pod conflicted with volumes of the pod being scheduled on the same node.
|
||||
func (pl *VolumeRestrictions) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
deletedPod, _, err := util.As[*v1.Pod](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPodDeleted: %w", err)
|
||||
}
|
||||
|
||||
if deletedPod.Namespace != pod.Namespace {
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
nodeInfo := framework.NewNodeInfo(deletedPod)
|
||||
if !satisfyVolumeConflicts(pod, nodeInfo) {
|
||||
logger.V(5).Info("Pod with the volume that the target pod requires was deleted, which might make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// Return Queue if a deleted pod uses the same PVC since the pod may be unschedulable due to the ReadWriteOncePod access mode of the PVC.
|
||||
//
|
||||
// For now, we don't actually fetch PVC and check the access mode because that operation could be expensive.
|
||||
// Once the observability around QHint is established,
|
||||
// we may want to do that depending on how much the operation would impact the QHint latency negatively.
|
||||
// https://github.com/kubernetes/kubernetes/issues/124566
|
||||
claims := sets.New[string]()
|
||||
for _, volume := range pod.Spec.Volumes {
|
||||
if volume.PersistentVolumeClaim != nil {
|
||||
claims.Insert(volume.PersistentVolumeClaim.ClaimName)
|
||||
}
|
||||
}
|
||||
for _, volume := range deletedPod.Spec.Volumes {
|
||||
if volume.PersistentVolumeClaim != nil && claims.Has(volume.PersistentVolumeClaim.ClaimName) {
|
||||
logger.V(5).Info("Pod with the same PVC that the target pod requires was deleted, which might make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
|
||||
logger.V(5).Info("An irrelevant Pod was deleted, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
informerFactory := handle.SharedInformerFactory()
|
||||
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
|
||||
sharedLister := handle.SnapshotSharedLister()
|
||||
|
||||
return &VolumeRestrictions{
|
||||
pvcLister: pvcLister,
|
||||
sharedLister: sharedLister,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
}, nil
|
||||
}
|
10
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone/OWNERS
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
# See the OWNERS docs at https://go.k8s.io/owners
|
||||
|
||||
approvers:
|
||||
- sig-storage-approvers
|
||||
- cofyc
|
||||
reviewers:
|
||||
- sig-storage-reviewers
|
||||
- cofyc
|
||||
labels:
|
||||
- sig/storage
|
410
vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone/volume_zone.go
generated
vendored
Normal file
@ -0,0 +1,410 @@
|
||||
/*
|
||||
Copyright 2019 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package volumezone
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
storage "k8s.io/api/storage/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
storagelisters "k8s.io/client-go/listers/storage/v1"
|
||||
volumehelpers "k8s.io/cloud-provider/volume/helpers"
|
||||
storagehelpers "k8s.io/component-helpers/storage/volume"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// VolumeZone is a plugin that checks volume zone.
|
||||
type VolumeZone struct {
|
||||
pvLister corelisters.PersistentVolumeLister
|
||||
pvcLister corelisters.PersistentVolumeClaimLister
|
||||
scLister storagelisters.StorageClassLister
|
||||
enableSchedulingQueueHint bool
|
||||
}
|
||||
|
||||
var _ framework.FilterPlugin = &VolumeZone{}
|
||||
var _ framework.PreFilterPlugin = &VolumeZone{}
|
||||
var _ framework.EnqueueExtensions = &VolumeZone{}
|
||||
|
||||
const (
|
||||
// Name is the name of the plugin used in the plugin registry and configurations.
|
||||
Name = names.VolumeZone
|
||||
|
||||
preFilterStateKey framework.StateKey = "PreFilter" + Name
|
||||
|
||||
// ErrReasonConflict is used for NoVolumeZoneConflict predicate error.
|
||||
ErrReasonConflict = "node(s) had no available volume zone"
|
||||
)
|
||||
|
||||
// pvTopology holds the value of a pv's topologyLabel
|
||||
type pvTopology struct {
|
||||
pvName string
|
||||
key string
|
||||
values sets.Set[string]
|
||||
}
|
||||
|
||||
// The state is initialized in the PreFilter phase. Because we save the pointer in
// framework.CycleState, later phases don't need to call the Write method
// to update the value.
|
||||
type stateData struct {
|
||||
// podPVTopologies holds the pv information we need
|
||||
// it's initialized in the PreFilter phase
|
||||
podPVTopologies []pvTopology
|
||||
}
|
||||
|
||||
func (d *stateData) Clone() framework.StateData {
|
||||
return d
|
||||
}
|
||||
|
||||
var topologyLabels = []string{
|
||||
v1.LabelFailureDomainBetaZone,
|
||||
v1.LabelFailureDomainBetaRegion,
|
||||
v1.LabelTopologyZone,
|
||||
v1.LabelTopologyRegion,
|
||||
}
|
||||
|
||||
func translateToGALabel(label string) string {
|
||||
if label == v1.LabelFailureDomainBetaRegion {
|
||||
return v1.LabelTopologyRegion
|
||||
}
|
||||
if label == v1.LabelFailureDomainBetaZone {
|
||||
return v1.LabelTopologyZone
|
||||
}
|
||||
return label
|
||||
}
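
// Editor's sketch (not part of the upstream file): the beta-to-GA label fallback that
// Filter applies when matching a PV's topology key against node labels. The node labels
// passed in are hypothetical.
func exampleTopologyLookup(nodeLabels map[string]string, pvLabelKey string) (string, bool) {
	// Try the PV's own key first (it may be a deprecated beta key).
	if v, ok := nodeLabels[pvLabelKey]; ok {
		return v, true
	}
	// Fall back to the GA equivalent, e.g. failure-domain.beta.kubernetes.io/zone
	// translates to topology.kubernetes.io/zone.
	v, ok := nodeLabels[translateToGALabel(pvLabelKey)]
	return v, ok
}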
|
||||
|
||||
// Name returns name of the plugin. It is used in logs, etc.
|
||||
func (pl *VolumeZone) Name() string {
|
||||
return Name
|
||||
}
|
||||
|
||||
// PreFilter invoked at the prefilter extension point
|
||||
//
|
||||
// # It finds the topology of the PersistentVolumes corresponding to the volumes a pod requests
|
||||
//
|
||||
// Currently, this is only supported with PersistentVolumeClaims,
|
||||
// and only looks for the bound PersistentVolume.
|
||||
func (pl *VolumeZone) PreFilter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
|
||||
logger := klog.FromContext(ctx)
|
||||
podPVTopologies, status := pl.getPVbyPod(logger, pod)
|
||||
if !status.IsSuccess() {
|
||||
return nil, status
|
||||
}
|
||||
if len(podPVTopologies) == 0 {
|
||||
return nil, framework.NewStatus(framework.Skip)
|
||||
}
|
||||
cs.Write(preFilterStateKey, &stateData{podPVTopologies: podPVTopologies})
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// getPVbyPod gets PVTopology from pod
|
||||
func (pl *VolumeZone) getPVbyPod(logger klog.Logger, pod *v1.Pod) ([]pvTopology, *framework.Status) {
|
||||
podPVTopologies := make([]pvTopology, 0)
|
||||
|
||||
pvcNames := pl.getPersistentVolumeClaimNameFromPod(pod)
|
||||
for _, pvcName := range pvcNames {
|
||||
if pvcName == "" {
|
||||
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolumeClaim had no name")
|
||||
}
|
||||
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
|
||||
if s := getErrorAsStatus(err); !s.IsSuccess() {
|
||||
return nil, s
|
||||
}
|
||||
|
||||
pvName := pvc.Spec.VolumeName
|
||||
if pvName == "" {
|
||||
scName := storagehelpers.GetPersistentVolumeClaimClass(pvc)
|
||||
if len(scName) == 0 {
|
||||
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolumeClaim had no pv name and storageClass name")
|
||||
}
|
||||
|
||||
class, err := pl.scLister.Get(scName)
|
||||
if s := getErrorAsStatus(err); !s.IsSuccess() {
|
||||
return nil, s
|
||||
}
|
||||
if class.VolumeBindingMode == nil {
|
||||
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("VolumeBindingMode not set for StorageClass %q", scName))
|
||||
}
|
||||
if *class.VolumeBindingMode == storage.VolumeBindingWaitForFirstConsumer {
|
||||
// Skip unbound volumes
|
||||
continue
|
||||
}
|
||||
|
||||
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolume had no name")
|
||||
}
|
||||
|
||||
pv, err := pl.pvLister.Get(pvName)
|
||||
if s := getErrorAsStatus(err); !s.IsSuccess() {
|
||||
return nil, s
|
||||
}
|
||||
podPVTopologies = append(podPVTopologies, pl.getPVTopologies(logger, pv)...)
|
||||
}
|
||||
return podPVTopologies, nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions returns prefilter extensions, pod add and remove.
|
||||
func (pl *VolumeZone) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Filter invoked at the filter extension point.
|
||||
//
|
||||
// It evaluates if a pod can fit due to the volumes it requests, given
|
||||
// that some volumes may have zone scheduling constraints. The requirement is that any
|
||||
// volume zone-labels must match the equivalent zone-labels on the node. It is OK for
|
||||
// the node to have more zone-label constraints (for example, a hypothetical replicated
|
||||
// volume might allow region-wide access)
|
||||
//
|
||||
// Currently this is only supported with PersistentVolumeClaims, and looks to the labels
|
||||
// only on the bound PersistentVolume.
|
||||
//
|
||||
// Working with volumes declared inline in the pod specification (i.e. not
|
||||
// using a PersistentVolume) is likely to be harder, as it would require
|
||||
// determining the zone of a volume during scheduling, and that is likely to
|
||||
// require calling out to the cloud provider. It seems that we are moving away
|
||||
// from inline volume declarations anyway.
|
||||
func (pl *VolumeZone) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
|
||||
logger := klog.FromContext(ctx)
|
||||
// If a pod doesn't have any volume attached to it, the predicate will always be true.
|
||||
// Thus we make a fast path for it, to avoid unnecessary computations in this case.
|
||||
if len(pod.Spec.Volumes) == 0 {
|
||||
return nil
|
||||
}
|
||||
var podPVTopologies []pvTopology
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
// Fallback to calculate pv list here
|
||||
var status *framework.Status
|
||||
podPVTopologies, status = pl.getPVbyPod(logger, pod)
|
||||
if !status.IsSuccess() {
|
||||
return status
|
||||
}
|
||||
} else {
|
||||
podPVTopologies = state.podPVTopologies
|
||||
}
|
||||
|
||||
node := nodeInfo.Node()
|
||||
hasAnyNodeConstraint := false
|
||||
for _, topologyLabel := range topologyLabels {
|
||||
if _, ok := node.Labels[topologyLabel]; ok {
|
||||
hasAnyNodeConstraint = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !hasAnyNodeConstraint {
|
||||
// The node has no zone constraints, so we're OK to schedule.
|
||||
// This is to handle a single-zone cluster scenario where the node may not have any topology labels.
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, pvTopology := range podPVTopologies {
|
||||
v, ok := node.Labels[pvTopology.key]
|
||||
if !ok {
|
||||
// if we can't match the beta label, try to match pv's beta label with node's ga label
|
||||
v, ok = node.Labels[translateToGALabel(pvTopology.key)]
|
||||
}
|
||||
if !ok || !pvTopology.values.Has(v) {
|
||||
logger.V(10).Info("Won't schedule pod onto node due to volume (mismatch on label key)", "pod", klog.KObj(pod), "node", klog.KObj(node), "PV", klog.KRef("", pvTopology.pvName), "PVLabelKey", pvTopology.key)
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonConflict)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getStateData(cs *framework.CycleState) (*stateData, error) {
|
||||
state, err := cs.Read(preFilterStateKey)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s, ok := state.(*stateData)
|
||||
if !ok {
|
||||
return nil, errors.New("unable to convert state into stateData")
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func getErrorAsStatus(err error) *framework.Status {
|
||||
if err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
|
||||
}
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// EventsToRegister returns the possible events that may make a Pod
|
||||
// failed by this plugin schedulable.
|
||||
func (pl *VolumeZone) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
|
||||
// A new node or updating a node's volume zone labels may make a pod schedulable.
|
||||
// A note about UpdateNodeTaint event:
|
||||
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
|
||||
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
|
||||
// See: https://github.com/kubernetes/kubernetes/issues/109437
|
||||
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
|
||||
if pl.enableSchedulingQueueHint {
|
||||
// preCheck is not used when QHint is enabled.
|
||||
nodeActionType = framework.Add | framework.UpdateNodeLabel
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
// New storageClass with bind mode `VolumeBindingWaitForFirstConsumer` will make a pod schedulable.
|
||||
// Due to immutable field `storageClass.volumeBindingMode`, storageClass update events are ignored.
|
||||
{Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add}, QueueingHintFn: pl.isSchedulableAfterStorageClassAdded},
|
||||
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
|
||||
// A new pvc may make a pod schedulable.
|
||||
// Also, filling in the PVC's VolumeName could make a pod schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimChange},
|
||||
// A new pv or updating a pv's volume zone labels may make a pod schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.PersistentVolume, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// getPersistentVolumeClaimNameFromPod gets pvc names bound to a pod.
|
||||
func (pl *VolumeZone) getPersistentVolumeClaimNameFromPod(pod *v1.Pod) []string {
|
||||
var pvcNames []string
|
||||
for i := range pod.Spec.Volumes {
|
||||
volume := pod.Spec.Volumes[i]
|
||||
if volume.PersistentVolumeClaim == nil {
|
||||
continue
|
||||
}
|
||||
pvcName := volume.PersistentVolumeClaim.ClaimName
|
||||
pvcNames = append(pvcNames, pvcName)
|
||||
}
|
||||
return pvcNames
|
||||
}
|
||||
|
||||
// isSchedulableAfterPersistentVolumeClaimChange is invoked whenever a PersistentVolumeClaim is added or updated.
|
||||
// It checks whether the change of PVC has made a previously unschedulable pod schedulable.
|
||||
func (pl *VolumeZone) isSchedulableAfterPersistentVolumeClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, modifiedPVC, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeClaimChange: %w", err)
|
||||
}
|
||||
if pl.isPVCRequestedFromPod(logger, modifiedPVC, pod) {
|
||||
logger.V(5).Info("PVC that is referred from the pod was created or updated, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(modifiedPVC))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("PVC irrelevant to the Pod was created or updated, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(modifiedPVC))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// isPVCRequestedFromPod verifies if the PVC is requested from a given Pod.
|
||||
func (pl *VolumeZone) isPVCRequestedFromPod(logger klog.Logger, pvc *v1.PersistentVolumeClaim, pod *v1.Pod) bool {
|
||||
if (pvc == nil) || (pod.Namespace != pvc.Namespace) {
|
||||
return false
|
||||
}
|
||||
pvcNames := pl.getPersistentVolumeClaimNameFromPod(pod)
|
||||
for _, pvcName := range pvcNames {
|
||||
if pvc.Name == pvcName {
|
||||
logger.V(5).Info("PVC is referred from the pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc))
|
||||
return true
|
||||
}
|
||||
}
|
||||
logger.V(5).Info("PVC is not referred from the pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc))
|
||||
return false
|
||||
}
|
||||
|
||||
// isSchedulableAfterStorageClassAdded is invoked whenever a StorageClass is added.
|
||||
// It checks whether the addition of StorageClass has made a previously unschedulable pod schedulable.
|
||||
// Only a new StorageClass with WaitForFirstConsumer will cause a pod to become schedulable.
|
||||
func (pl *VolumeZone) isSchedulableAfterStorageClassAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, addedStorageClass, err := util.As[*storage.StorageClass](nil, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterStorageClassAdded: %w", err)
|
||||
}
|
||||
if (addedStorageClass.VolumeBindingMode == nil) || (*addedStorageClass.VolumeBindingMode != storage.VolumeBindingWaitForFirstConsumer) {
|
||||
logger.V(5).Info("StorageClass is created, but its VolumeBindingMode is not waitForFirstConsumer, which doesn't make the pod schedulable", "storageClass", klog.KObj(addedStorageClass), "pod", klog.KObj(pod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("StorageClass with waitForFirstConsumer mode was created and it might make this pod schedulable", "pod", klog.KObj(pod), "StorageClass", klog.KObj(addedStorageClass))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterPersistentVolumeChange is invoked whenever a PersistentVolume is added or updated.
|
||||
// It checks whether the change of PV has made a previously unschedulable pod schedulable.
|
||||
// Changing the PV topology labels could cause the pod to become schedulable.
|
||||
func (pl *VolumeZone) isSchedulableAfterPersistentVolumeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
originalPV, modifiedPV, err := util.As[*v1.PersistentVolume](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeChange: %w", err)
|
||||
}
|
||||
if originalPV == nil {
|
||||
logger.V(5).Info("PV is newly created, which might make the pod schedulable")
|
||||
return framework.Queue, nil
|
||||
}
|
||||
originalPVTopologies := pl.getPVTopologies(logger, originalPV)
|
||||
modifiedPVTopologies := pl.getPVTopologies(logger, modifiedPV)
|
||||
if !reflect.DeepEqual(originalPVTopologies, modifiedPVTopologies) {
|
||||
logger.V(5).Info("PV's topology was updated, which might make the pod schedulable.", "pod", klog.KObj(pod), "PV", klog.KObj(modifiedPV))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
logger.V(5).Info("PV was updated, but the topology is unchanged, which it doesn't make the pod schedulable", "pod", klog.KObj(pod), "PV", klog.KObj(modifiedPV))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// getPVTopologies retrieves the pvTopology entries from a given PV and returns them as a slice.
// This function doesn't check spec.nodeAffinity,
// because it is read-only after creation and thus cannot be updated,
// and nodeAffinity is handled by the node affinity plugin.
|
||||
func (pl *VolumeZone) getPVTopologies(logger klog.Logger, pv *v1.PersistentVolume) []pvTopology {
|
||||
podPVTopologies := make([]pvTopology, 0)
|
||||
for _, key := range topologyLabels {
|
||||
if value, ok := pv.ObjectMeta.Labels[key]; ok {
|
||||
labelZonesSet, err := volumehelpers.LabelZonesToSet(value)
|
||||
if err != nil {
|
||||
logger.V(5).Info("failed to parse PV's topology label, ignoring the label", "label", fmt.Sprintf("%s:%s", key, value), "err", err)
|
||||
continue
|
||||
}
|
||||
podPVTopologies = append(podPVTopologies, pvTopology{
|
||||
pvName: pv.Name,
|
||||
key: key,
|
||||
values: sets.Set[string](labelZonesSet),
|
||||
})
|
||||
}
|
||||
}
|
||||
return podPVTopologies
|
||||
}
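
// Editor's sketch (not part of the upstream file): what a multi-zone PV label looks like
// when it reaches getPVTopologies above. The zone names are hypothetical; the "__"
// separator is the historical multi-value form parsed by volumehelpers.LabelZonesToSet.
func exampleParseZoneLabel() (sets.Set[string], error) {
	labelValue := "hypothetical-zone-a__hypothetical-zone-b"
	zones, err := volumehelpers.LabelZonesToSet(labelValue)
	if err != nil {
		return nil, err
	}
	// The resulting set is what pvTopology.values holds and what Filter checks the
	// node's zone label against.
	return sets.Set[string](zones), nil
}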
|
||||
|
||||
// New initializes a new plugin and returns it.
|
||||
func New(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
|
||||
informerFactory := handle.SharedInformerFactory()
|
||||
pvLister := informerFactory.Core().V1().PersistentVolumes().Lister()
|
||||
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
|
||||
scLister := informerFactory.Storage().V1().StorageClasses().Lister()
|
||||
return &VolumeZone{
|
||||
pvLister: pvLister,
|
||||
pvcLister: pvcLister,
|
||||
scLister: scLister,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
}, nil
|
||||
}
|
738
vendor/k8s.io/kubernetes/pkg/scheduler/framework/preemption/preemption.go
generated
vendored
Normal file
@ -0,0 +1,738 @@
|
||||
/*
|
||||
Copyright 2021 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package preemption
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
policy "k8s.io/api/policy/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
utilerrors "k8s.io/apimachinery/pkg/util/errors"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
policylisters "k8s.io/client-go/listers/policy/v1"
|
||||
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
|
||||
"k8s.io/klog/v2"
|
||||
extenderv1 "k8s.io/kube-scheduler/extender/v1"
|
||||
apipod "k8s.io/kubernetes/pkg/api/v1/pod"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/metrics"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// Candidate represents a nominated node on which the preemptor can be scheduled,
|
||||
// along with the list of victims that should be evicted for the preemptor to fit the node.
|
||||
type Candidate interface {
|
||||
// Victims wraps a list of to-be-preempted Pods and the number of PDB violation.
|
||||
Victims() *extenderv1.Victims
|
||||
// Name returns the target node name where the preemptor gets nominated to run.
|
||||
Name() string
|
||||
}
|
||||
|
||||
type candidate struct {
|
||||
victims *extenderv1.Victims
|
||||
name string
|
||||
}
|
||||
|
||||
// Victims returns s.victims.
|
||||
func (s *candidate) Victims() *extenderv1.Victims {
|
||||
return s.victims
|
||||
}
|
||||
|
||||
// Name returns s.name.
|
||||
func (s *candidate) Name() string {
|
||||
return s.name
|
||||
}
|
||||
|
||||
type candidateList struct {
|
||||
idx int32
|
||||
items []Candidate
|
||||
}
|
||||
|
||||
func newCandidateList(size int32) *candidateList {
|
||||
return &candidateList{idx: -1, items: make([]Candidate, size)}
|
||||
}
|
||||
|
||||
// add adds a new candidate to the internal array atomically.
|
||||
func (cl *candidateList) add(c *candidate) {
|
||||
if idx := atomic.AddInt32(&cl.idx, 1); idx < int32(len(cl.items)) {
|
||||
cl.items[idx] = c
|
||||
}
|
||||
}
|
||||
|
||||
// size returns the number of candidates stored. Note that some add() operations
|
||||
// might still be executing when this is called, so care must be taken to
|
||||
// ensure that all add() operations complete before accessing the elements of
|
||||
// the list.
|
||||
func (cl *candidateList) size() int32 {
|
||||
n := atomic.LoadInt32(&cl.idx) + 1
|
||||
if n >= int32(len(cl.items)) {
|
||||
n = int32(len(cl.items))
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// get returns the internal candidate array. This function is NOT atomic and
|
||||
// assumes that all add() operations have been completed.
|
||||
func (cl *candidateList) get() []Candidate {
|
||||
return cl.items[:cl.size()]
|
||||
}
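
// Editor's sketch (not part of the upstream file): the lock-free append pattern used by
// candidateList above, shown with a plain string slice. A fixed-size backing array plus
// an atomically incremented index lets parallel goroutines record results without a
// mutex, at the cost of silently dropping writes past the preallocated capacity.
type exampleAtomicList struct {
	idx   int32
	items []string
}

func newExampleAtomicList(size int32) *exampleAtomicList {
	// idx starts at -1 so the first AddInt32 returns 0, the first valid slot.
	return &exampleAtomicList{idx: -1, items: make([]string, size)}
}

func (l *exampleAtomicList) add(s string) {
	if i := atomic.AddInt32(&l.idx, 1); i < int32(len(l.items)) {
		l.items[i] = s
	}
}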
|
||||
|
||||
// Interface is expected to be implemented by different preemption plugins as all those member
|
||||
// methods might have different behavior compared with the default preemption.
|
||||
type Interface interface {
|
||||
// GetOffsetAndNumCandidates chooses a random offset and calculates the number of candidates that should be
|
||||
// shortlisted for dry running preemption.
|
||||
GetOffsetAndNumCandidates(nodes int32) (int32, int32)
|
||||
// CandidatesToVictimsMap builds a map from the target node to a list of to-be-preempted Pods and the number of PDB violation.
|
||||
CandidatesToVictimsMap(candidates []Candidate) map[string]*extenderv1.Victims
|
||||
// PodEligibleToPreemptOthers returns one bool and one string. The bool indicates whether this pod should be considered for
|
||||
// preempting other pods or not. The string includes the reason if this pod isn't eligible.
|
||||
PodEligibleToPreemptOthers(ctx context.Context, pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string)
|
||||
// SelectVictimsOnNode finds minimum set of pods on the given node that should be preempted in order to make enough room
|
||||
// for "pod" to be scheduled.
|
||||
// Note that both `state` and `nodeInfo` are deep copied.
|
||||
SelectVictimsOnNode(ctx context.Context, state *framework.CycleState,
|
||||
pod *v1.Pod, nodeInfo *framework.NodeInfo, pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status)
|
||||
// OrderedScoreFuncs returns a list of ordered score functions to select preferable node where victims will be preempted.
|
||||
// The ordered score functions will be processed one by one iff we find more than one node with the highest score.
|
||||
// Default score functions will be processed if nil returned here for backwards-compatibility.
|
||||
OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64
|
||||
}
|
||||
|
||||
type Evaluator struct {
|
||||
PluginName string
|
||||
Handler framework.Handle
|
||||
PodLister corelisters.PodLister
|
||||
PdbLister policylisters.PodDisruptionBudgetLister
|
||||
|
||||
enableAsyncPreemption bool
|
||||
mu sync.RWMutex
|
||||
// preempting is a set that records the pods that are currently triggering preemption asynchronously,
|
||||
// which is used to prevent the pods from entering the scheduling cycle meanwhile.
|
||||
preempting sets.Set[types.UID]
|
||||
|
||||
// PreemptPod is a function that actually makes API calls to preempt a specific Pod.
|
||||
// This is exposed to be replaced during tests.
|
||||
PreemptPod func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error
|
||||
|
||||
Interface
|
||||
}
|
||||
|
||||
func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsyncPreemption bool) *Evaluator {
	podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
	pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()

	ev := &Evaluator{
		PluginName:            names.DefaultPreemption,
		Handler:               fh,
		PodLister:             podLister,
		PdbLister:             pdbLister,
		Interface:             i,
		enableAsyncPreemption: enableAsyncPreemption,
		preempting:            sets.New[types.UID](),
	}

	// PreemptPod actually makes API calls to preempt a specific Pod.
	//
	// We implement it here directly, rather than creating a separate method like ev.preemptPod(...),
	// to prevent misuse of the PreemptPod function.
	ev.PreemptPod = func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error {
		logger := klog.FromContext(ctx)

		// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
		// Otherwise we should delete the victim.
		if waitingPod := ev.Handler.GetWaitingPod(victim.UID); waitingPod != nil {
			waitingPod.Reject(pluginName, "preempted")
			logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(preemptor), "waitingPod", klog.KObj(victim), "node", c.Name())
		} else {
			condition := &v1.PodCondition{
				Type:    v1.DisruptionTarget,
				Status:  v1.ConditionTrue,
				Reason:  v1.PodReasonPreemptionByScheduler,
				Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
			}
			newStatus := victim.Status.DeepCopy()
			updated := apipod.UpdatePodCondition(newStatus, condition)
			if updated {
				if err := util.PatchPodStatus(ctx, ev.Handler.ClientSet(), victim, newStatus); err != nil {
					logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
					return err
				}
			}
			if err := util.DeletePod(ctx, ev.Handler.ClientSet(), victim); err != nil {
				if !apierrors.IsNotFound(err) {
					logger.Error(err, "Tried to preempt pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
					return err
				}
				logger.V(2).Info("Victim Pod is already deleted", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
				return nil
			}
			logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
		}

		ev.Handler.EventRecorder().Eventf(victim, preemptor, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", preemptor.UID, c.Name())

		return nil
	}

	return ev
}

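// Illustrative sketch (not part of the vendored source): because PreemptPod is an exported
// field, tests can swap the API-calling implementation for a stub, e.g.:
//
//	ev := NewEvaluator(names.DefaultPreemption, fh, pl, false)
//	ev.PreemptPod = func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error {
//		recordedVictims = append(recordedVictims, victim.Name) // hypothetical bookkeeping instead of a real eviction
//		return nil
//	}
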
// IsPodRunningPreemption returns true if the pod is currently triggering preemption asynchronously.
func (ev *Evaluator) IsPodRunningPreemption(podUID types.UID) bool {
	ev.mu.RLock()
	defer ev.mu.RUnlock()

	return ev.preempting.Has(podUID)
}

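// Illustrative sketch (not part of the vendored source): this bookkeeping lets callers hold a
// pod back while its asynchronous preemption is still in flight, conceptually:
//
//	if ev.IsPodRunningPreemption(pod.UID) {
//		// keep the pod out of the next scheduling cycle for now (hypothetical caller)
//	}
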
// Preempt returns a PostFilterResult carrying a suggested nominatedNodeName, along with a Status.
// The semantics of the returned <PostFilterResult, Status> vary across scenarios:
//
//   - <nil, Error>. This denotes a transient/rare error that may be self-healed in future cycles.
//
//   - <nil, Unschedulable>. This status is mostly as expected, e.g. the preemptor is waiting for the
//     victims to be fully terminated.
//
//   - In both cases above, a nil PostFilterResult is returned to keep the pod's nominatedNodeName unchanged.
//
//   - <non-nil PostFilterResult, Unschedulable>. It indicates the pod cannot be scheduled even with preemption.
//     In this case, a non-nil PostFilterResult is returned and result.NominatingMode instructs how to deal with
//     the nominatedNodeName.
//
//   - <non-nil PostFilterResult, Success>. It's the regular happy path
//     and the non-empty nominatedNodeName will be applied to the preemptor pod.
func (ev *Evaluator) Preempt(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
	logger := klog.FromContext(ctx)

	// 0) Fetch the latest version of <pod>.
	// It's safe to directly fetch the pod here because the informer cache has already been
	// initialized when creating the Scheduler obj.
	// However, tests may need to manually initialize the shared pod informer.
	podNamespace, podName := pod.Namespace, pod.Name
	pod, err := ev.PodLister.Pods(pod.Namespace).Get(pod.Name)
	if err != nil {
		logger.Error(err, "Could not get the updated preemptor pod object", "pod", klog.KRef(podNamespace, podName))
		return nil, framework.AsStatus(err)
	}

	// 1) Ensure the preemptor is eligible to preempt other pods.
	nominatedNodeStatus := m.Get(pod.Status.NominatedNodeName)
	if ok, msg := ev.PodEligibleToPreemptOthers(ctx, pod, nominatedNodeStatus); !ok {
		logger.V(5).Info("Pod is not eligible for preemption", "pod", klog.KObj(pod), "reason", msg)
		return nil, framework.NewStatus(framework.Unschedulable, msg)
	}

	// 2) Find all preemption candidates.
	allNodes, err := ev.Handler.SnapshotSharedLister().NodeInfos().List()
	if err != nil {
		return nil, framework.AsStatus(err)
	}
	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, state, allNodes, pod, m)
	if err != nil && len(candidates) == 0 {
		return nil, framework.AsStatus(err)
	}

	// Return a FitError only when there are no candidates that fit the pod.
	if len(candidates) == 0 {
		fitError := &framework.FitError{
			Pod:         pod,
			NumAllNodes: len(allNodes),
			Diagnosis: framework.Diagnosis{
				NodeToStatus: nodeToStatusMap,
				// Leave UnschedulablePlugins or PendingPlugins as nil as it won't be used on moving Pods.
			},
		}
		fitError.Diagnosis.NodeToStatus.SetAbsentNodesStatus(framework.NewStatus(framework.UnschedulableAndUnresolvable, "Preemption is not helpful for scheduling"))
		// Specify nominatedNodeName to clear the pod's nominatedNodeName status, if applicable.
		return framework.NewPostFilterResultWithNominatedNode(""), framework.NewStatus(framework.Unschedulable, fitError.Error())
	}

	// 3) Interact with registered Extenders to filter out some candidates if needed.
	candidates, status := ev.callExtenders(logger, pod, candidates)
	if !status.IsSuccess() {
		return nil, status
	}

	// 4) Find the best candidate.
	bestCandidate := ev.SelectCandidate(ctx, candidates)
	if bestCandidate == nil || len(bestCandidate.Name()) == 0 {
		return nil, framework.NewStatus(framework.Unschedulable, "no candidate node for preemption")
	}

	logger.V(2).Info("the target node for the preemption is determined", "node", bestCandidate.Name(), "pod", klog.KObj(pod))

	// 5) Perform preparation work before nominating the selected candidate.
	if ev.enableAsyncPreemption {
		ev.prepareCandidateAsync(bestCandidate, pod, ev.PluginName)
	} else {
		if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
			return nil, status
		}
	}

	return framework.NewPostFilterResultWithNominatedNode(bestCandidate.Name()), framework.NewStatus(framework.Success)
}

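// Illustrative sketch (not part of the vendored source): a PostFilter plugin is expected to
// delegate to Evaluator.Preempt; the plugin type and field below are hypothetical.
//
//	func (pl *hypotheticalPreemptionPlugin) PostFilter(ctx context.Context, state *framework.CycleState,
//		pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
//		result, status := pl.evaluator.Preempt(ctx, state, pod, m)
//		if msg := status.Message(); len(msg) > 0 {
//			return result, framework.NewStatus(status.Code(), "preemption: "+msg)
//		}
//		return result, status
//	}
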
// findCandidates calculates a slice of preemption candidates.
// Each candidate is executable to make the given <pod> schedulable.
func (ev *Evaluator) findCandidates(ctx context.Context, state *framework.CycleState, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
	if len(allNodes) == 0 {
		return nil, nil, errors.New("no nodes available")
	}
	logger := klog.FromContext(ctx)
	// Get a list of nodes with failed predicates (Unschedulable) that may be satisfied by removing pods from the node.
	potentialNodes, err := m.NodesForStatusCode(ev.Handler.SnapshotSharedLister().NodeInfos(), framework.Unschedulable)
	if err != nil {
		return nil, nil, err
	}
	if len(potentialNodes) == 0 {
		logger.V(3).Info("Preemption will not help schedule pod on any node", "pod", klog.KObj(pod))
		// In this case, we should clean up any existing nominated node name of the pod.
		if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), pod); err != nil {
			logger.Error(err, "Could not clear the nominatedNodeName field of pod", "pod", klog.KObj(pod))
			// We do not return as this error is not critical.
		}
		return nil, framework.NewDefaultNodeToStatus(), nil
	}

	pdbs, err := getPodDisruptionBudgets(ev.PdbLister)
	if err != nil {
		return nil, nil, err
	}

	offset, candidatesNum := ev.GetOffsetAndNumCandidates(int32(len(potentialNodes)))
	return ev.DryRunPreemption(ctx, state, pod, potentialNodes, pdbs, offset, candidatesNum)
}

// callExtenders calls the given <extenders> to select the list of feasible candidates.
// We will only check <candidates> with extenders that support preemption.
// Extenders which do not support preemption may later prevent the preemptor from being scheduled on the nominated
// node. In that case, the scheduler will find a different host for the preemptor in subsequent scheduling cycles.
func (ev *Evaluator) callExtenders(logger klog.Logger, pod *v1.Pod, candidates []Candidate) ([]Candidate, *framework.Status) {
	extenders := ev.Handler.Extenders()
	nodeLister := ev.Handler.SnapshotSharedLister().NodeInfos()
	if len(extenders) == 0 {
		return candidates, nil
	}

	// Migrate the candidate slice to victimsMap to adapt to the Extender interface.
	// It's only applicable to candidate slices that have a unique nominated node name.
	victimsMap := ev.CandidatesToVictimsMap(candidates)
	if len(victimsMap) == 0 {
		return candidates, nil
	}
	for _, extender := range extenders {
		if !extender.SupportsPreemption() || !extender.IsInterested(pod) {
			continue
		}
		nodeNameToVictims, err := extender.ProcessPreemption(pod, victimsMap, nodeLister)
		if err != nil {
			if extender.IsIgnorable() {
				logger.Info("Skipped extender as it returned error and has ignorable flag set",
					"extender", extender.Name(), "err", err)
				continue
			}
			return nil, framework.AsStatus(err)
		}
		// Check if the returned victims are valid.
		for nodeName, victims := range nodeNameToVictims {
			if victims == nil || len(victims.Pods) == 0 {
				if extender.IsIgnorable() {
					delete(nodeNameToVictims, nodeName)
					logger.Info("Ignored node for which the extender didn't report victims", "node", klog.KRef("", nodeName), "extender", extender.Name())
					continue
				}
				return nil, framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeName))
			}
		}

		// Replace victimsMap with the new result after preemption, so the
		// rest of the extenders can continue to use it as a parameter.
		victimsMap = nodeNameToVictims

		// If the node list becomes empty, no preemption can happen regardless of other extenders.
		if len(victimsMap) == 0 {
			break
		}
	}

	var newCandidates []Candidate
	for nodeName := range victimsMap {
		newCandidates = append(newCandidates, &candidate{
			victims: victimsMap[nodeName],
			name:    nodeName,
		})
	}
	return newCandidates, nil
}

// SelectCandidate chooses the best-fit candidate from the given <candidates> and returns it.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) SelectCandidate(ctx context.Context, candidates []Candidate) Candidate {
	logger := klog.FromContext(ctx)

	if len(candidates) == 0 {
		return nil
	}
	if len(candidates) == 1 {
		return candidates[0]
	}

	victimsMap := ev.CandidatesToVictimsMap(candidates)
	scoreFuncs := ev.OrderedScoreFuncs(ctx, victimsMap)
	candidateNode := pickOneNodeForPreemption(logger, victimsMap, scoreFuncs)

	// Same as candidatesToVictimsMap, this logic is not applicable for out-of-tree
	// preemption plugins that exercise different candidates on the same nominated node.
	if victims := victimsMap[candidateNode]; victims != nil {
		return &candidate{
			victims: victims,
			name:    candidateNode,
		}
	}

	// We shouldn't reach here.
	logger.Error(errors.New("no candidate selected"), "Should not reach here", "candidates", candidates)
	// To not break the whole flow, return the first candidate.
	return candidates[0]
}

// prepareCandidate does some preparation work before nominating the selected candidate:
// - Evict the victim pods
// - Reject the victim pods if they are in the waitingPod map
// - Clear the low-priority pods' nominatedNodeName status if needed
func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.Pod, pluginName string) *framework.Status {
	fh := ev.Handler
	cs := ev.Handler.ClientSet()

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	logger := klog.FromContext(ctx)
	errCh := parallelize.NewErrorChannel()
	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), func(index int) {
		if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[index], pluginName); err != nil {
			errCh.SendErrorWithCancel(err, cancel)
		}
	}, ev.PluginName)
	if err := errCh.ReceiveError(); err != nil {
		return framework.AsStatus(err)
	}

	metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))

	// Lower-priority pods nominated to run on this node may no longer fit on
	// this node, so we should remove their nomination. Removing their
	// nomination updates these pods and moves them to the active queue. It
	// lets the scheduler find another place for them.
	nominatedPods := getLowerPriorityNominatedPods(logger, fh, pod, c.Name())
	if err := util.ClearNominatedNodeName(ctx, cs, nominatedPods...); err != nil {
		logger.Error(err, "Cannot clear 'NominatedNodeName' field")
		// We do not return as this error is not critical.
	}

	return nil
}

// prepareCandidateAsync triggers a goroutine for some preparation work:
// - Evict the victim pods
// - Reject the victim pods if they are in the waitingPod map
// - Clear the low-priority pods' nominatedNodeName status if needed
// The Pod won't be retried until the goroutine triggered here completes.
//
// See http://kep.k8s.io/4832 for how the async preemption works.
func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName string) {
	metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))

	// Intentionally create a new context rather than using the ctx from the scheduling cycle,
	// because this process could continue even after this scheduling cycle finishes.
	ctx, cancel := context.WithCancel(context.Background())
	errCh := parallelize.NewErrorChannel()
	preemptPod := func(index int) {
		victim := c.Victims().Pods[index]
		if err := ev.PreemptPod(ctx, c, pod, victim, pluginName); err != nil {
			errCh.SendErrorWithCancel(err, cancel)
		}
	}

	ev.mu.Lock()
	ev.preempting.Insert(pod.UID)
	ev.mu.Unlock()

	logger := klog.FromContext(ctx)
	go func() {
		startTime := time.Now()
		result := metrics.GoroutineResultSuccess
		defer metrics.PreemptionGoroutinesDuration.WithLabelValues(result).Observe(metrics.SinceInSeconds(startTime))
		defer metrics.PreemptionGoroutinesExecutionTotal.WithLabelValues(result).Inc()
		defer func() {
			if result == metrics.GoroutineResultError {
				// When an API call isn't successful, the Pod may get stuck in the unschedulable pod pool in the worst case.
				// So, we should move the Pod to the activeQ.
				ev.Handler.Activate(logger, map[string]*v1.Pod{pod.Name: pod})
			}
		}()
		defer cancel()
		logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods))

		// Lower-priority pods nominated to run on this node may no longer fit on
		// this node, so we should remove their nomination. Removing their
		// nomination updates these pods and moves them to the active queue. It
		// lets the scheduler find another place for them.
		nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name())
		if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil {
			logger.Error(err, "Cannot clear 'NominatedNodeName' field from lower priority pods on the same target node", "node", c.Name())
			result = metrics.GoroutineResultError
			// We do not return as this error is not critical.
		}

		if len(c.Victims().Pods) == 0 {
			ev.mu.Lock()
			delete(ev.preempting, pod.UID)
			ev.mu.Unlock()

			return
		}

		// We can evict all victims in parallel, except the last one.
		// We have to remove the pod from the preempting map before the last victim is evicted
		// because, otherwise, the pod removal might be notified to the scheduling queue before
		// we remove this pod from the preempting map,
		// and the pod could end up stuck in the unschedulable pod pool
		// because all the pod removal events would be ignored.
		ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName)
		if err := errCh.ReceiveError(); err != nil {
			logger.Error(err, "Error occurred during async preemption")
			result = metrics.GoroutineResultError
		}

		ev.mu.Lock()
		delete(ev.preempting, pod.UID)
		ev.mu.Unlock()

		if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil {
			logger.Error(err, "Error occurred during async preemption")
			result = metrics.GoroutineResultError
		}

		logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name(), "result", result)
	}()
}

func getPodDisruptionBudgets(pdbLister policylisters.PodDisruptionBudgetLister) ([]*policy.PodDisruptionBudget, error) {
	if pdbLister != nil {
		return pdbLister.List(labels.Everything())
	}
	return nil, nil
}

// pickOneNodeForPreemption chooses one node among the given nodes.
// It assumes pods in each map entry are ordered by decreasing priority.
// If scoreFuncs is not empty, it picks a node based on the scores scoreFuncs returns.
// If scoreFuncs is empty, it picks a node based on the following criteria:
//  1. A node with the minimum number of PDB violations.
//  2. A node with the minimum highest-priority victim is picked.
//  3. Ties are broken by the sum of priorities of all victims.
//  4. If there are still ties, the node with the minimum number of victims is picked.
//  5. If there are still ties, the node with the latest start time of all highest-priority victims is picked.
//  6. If there are still ties, the first such node is picked (sort of randomly).
func pickOneNodeForPreemption(logger klog.Logger, nodesToVictims map[string]*extenderv1.Victims, scoreFuncs []func(node string) int64) string {
	if len(nodesToVictims) == 0 {
		return ""
	}

	allCandidates := make([]string, 0, len(nodesToVictims))
	for node := range nodesToVictims {
		allCandidates = append(allCandidates, node)
	}

	if len(scoreFuncs) == 0 {
		minNumPDBViolatingScoreFunc := func(node string) int64 {
			// The smaller the NumPDBViolations, the higher the score.
			return -nodesToVictims[node].NumPDBViolations
		}
		minHighestPriorityScoreFunc := func(node string) int64 {
			// highestPodPriority is the highest priority among the victims on this node.
			highestPodPriority := corev1helpers.PodPriority(nodesToVictims[node].Pods[0])
			// The smaller the highestPodPriority, the higher the score.
			return -int64(highestPodPriority)
		}
		minSumPrioritiesScoreFunc := func(node string) int64 {
			var sumPriorities int64
			for _, pod := range nodesToVictims[node].Pods {
				// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
				// needed so that a node with a few pods with negative priority is not
				// picked over a node with a smaller number of pods with the same negative
				// priority (and similar scenarios).
				sumPriorities += int64(corev1helpers.PodPriority(pod)) + int64(math.MaxInt32+1)
			}
			// The smaller the sumPriorities, the higher the score.
			return -sumPriorities
		}
		minNumPodsScoreFunc := func(node string) int64 {
			// The smaller the number of pods, the higher the score.
			return -int64(len(nodesToVictims[node].Pods))
		}
		latestStartTimeScoreFunc := func(node string) int64 {
			// Get the earliest start time of all pods on the current node.
			earliestStartTimeOnNode := util.GetEarliestPodStartTime(nodesToVictims[node])
			if earliestStartTimeOnNode == nil {
				logger.Error(errors.New("earliestStartTime is nil for node"), "Should not reach here", "node", node)
				return int64(math.MinInt64)
			}
			// The bigger the earliestStartTimeOnNode, the higher the score.
			return earliestStartTimeOnNode.UnixNano()
		}

		// Each scoreFunc scores the nodes according to specific rules and keeps the name of the node
		// with the highest score. If and only if the scoreFunc has more than one node with the highest
		// score, we will execute the other scoreFuncs in order of precedence.
		scoreFuncs = []func(string) int64{
			// A node with a minimum number of PDB violations is preferable.
			minNumPDBViolatingScoreFunc,
			// A node with a minimum highest-priority victim is preferable.
			minHighestPriorityScoreFunc,
			// A node with the smallest sum of priorities is preferable.
			minSumPrioritiesScoreFunc,
			// A node with the minimum number of pods is preferable.
			minNumPodsScoreFunc,
			// A node with the latest start time of all highest-priority victims is preferable.
			latestStartTimeScoreFunc,
			// If there are still ties, then the first Node in the list is selected.
		}
	}

	for _, f := range scoreFuncs {
		selectedNodes := []string{}
		maxScore := int64(math.MinInt64)
		for _, node := range allCandidates {
			score := f(node)
			if score > maxScore {
				maxScore = score
				selectedNodes = []string{}
			}
			if score == maxScore {
				selectedNodes = append(selectedNodes, node)
			}
		}
		if len(selectedNodes) == 1 {
			return selectedNodes[0]
		}
		allCandidates = selectedNodes
	}

	return allCandidates[0]
}

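// Illustrative example (not part of the vendored source) of how the default tie-breaking
// above plays out. Suppose two candidate nodes remain, with hypothetical victims:
//
//	nodeA: victims with priorities [100, 50], 0 PDB violations
//	nodeB: victims with priorities [100],     0 PDB violations
//
// Criterion 1 (PDB violations) ties at 0, criterion 2 (highest victim priority) ties at 100,
// and criterion 3 prefers nodeB (priority sum 100 < 150), so nodeB is picked and no further
// score functions run.
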
// getLowerPriorityNominatedPods returns pods whose priority is smaller than the
// priority of the given "pod" and are nominated to run on the given node.
// Note: We could possibly check if the nominated lower-priority pods still fit
// and return those that no longer fit, but that would require lots of
// manipulation of NodeInfo and PreFilter state per nominated pod. It may not be
// worth the complexity, especially because we generally expect to have a very
// small number of nominated pods per node.
func getLowerPriorityNominatedPods(logger klog.Logger, pn framework.PodNominator, pod *v1.Pod, nodeName string) []*v1.Pod {
	podInfos := pn.NominatedPodsForNode(nodeName)

	if len(podInfos) == 0 {
		return nil
	}

	var lowerPriorityPods []*v1.Pod
	podPriority := corev1helpers.PodPriority(pod)
	for _, pi := range podInfos {
		if corev1helpers.PodPriority(pi.Pod) < podPriority {
			lowerPriorityPods = append(lowerPriorityPods, pi.Pod)
		}
	}
	return lowerPriorityPods
}

// DryRunPreemption simulates preemption logic on <potentialNodes> in parallel,
// and returns preemption candidates and a map indicating filtered node statuses.
// The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
// candidates, ones that do not violate PDB are preferred over ones that do.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) DryRunPreemption(ctx context.Context, state *framework.CycleState, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
	pdbs []*policy.PodDisruptionBudget, offset int32, candidatesNum int32) ([]Candidate, *framework.NodeToStatus, error) {

	fh := ev.Handler
	nonViolatingCandidates := newCandidateList(candidatesNum)
	violatingCandidates := newCandidateList(candidatesNum)
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	nodeStatuses := framework.NewDefaultNodeToStatus()

	logger := klog.FromContext(ctx)
	logger.V(5).Info("Dry run the preemption", "potentialNodesNumber", len(potentialNodes), "pdbsNumber", len(pdbs), "offset", offset, "candidatesNumber", candidatesNum)

	var statusesLock sync.Mutex
	var errs []error
	checkNode := func(i int) {
		nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Snapshot()
		logger.V(5).Info("Check the potential node for preemption", "node", nodeInfoCopy.Node().Name)

		stateCopy := state.Clone()
		pods, numPDBViolations, status := ev.SelectVictimsOnNode(ctx, stateCopy, pod, nodeInfoCopy, pdbs)
		if status.IsSuccess() && len(pods) != 0 {
			victims := extenderv1.Victims{
				Pods:             pods,
				NumPDBViolations: int64(numPDBViolations),
			}
			c := &candidate{
				victims: &victims,
				name:    nodeInfoCopy.Node().Name,
			}
			if numPDBViolations == 0 {
				nonViolatingCandidates.add(c)
			} else {
				violatingCandidates.add(c)
			}
			nvcSize, vcSize := nonViolatingCandidates.size(), violatingCandidates.size()
			if nvcSize > 0 && nvcSize+vcSize >= candidatesNum {
				cancel()
			}
			return
		}
		if status.IsSuccess() && len(pods) == 0 {
			status = framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeInfoCopy.Node().Name))
		}
		statusesLock.Lock()
		if status.Code() == framework.Error {
			errs = append(errs, status.AsError())
		}
		nodeStatuses.Set(nodeInfoCopy.Node().Name, status)
		statusesLock.Unlock()
	}
	fh.Parallelizer().Until(ctx, len(potentialNodes), checkNode, ev.PluginName)
	return append(nonViolatingCandidates.get(), violatingCandidates.get()...), nodeStatuses, utilerrors.NewAggregate(errs)
}
1671
vendor/k8s.io/kubernetes/pkg/scheduler/framework/runtime/framework.go
generated
vendored
Normal file
File diff suppressed because it is too large
83
vendor/k8s.io/kubernetes/pkg/scheduler/framework/runtime/instrumented_plugins.go
generated
vendored
Normal file
@ -0,0 +1,83 @@
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
	"context"

	v1 "k8s.io/api/core/v1"
	compbasemetrics "k8s.io/component-base/metrics"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

type instrumentedFilterPlugin struct {
	framework.FilterPlugin

	metric compbasemetrics.CounterMetric
}

var _ framework.FilterPlugin = &instrumentedFilterPlugin{}

func (p *instrumentedFilterPlugin) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	p.metric.Inc()
	return p.FilterPlugin.Filter(ctx, state, pod, nodeInfo)
}

type instrumentedPreFilterPlugin struct {
	framework.PreFilterPlugin

	metric compbasemetrics.CounterMetric
}

var _ framework.PreFilterPlugin = &instrumentedPreFilterPlugin{}

func (p *instrumentedPreFilterPlugin) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
	result, status := p.PreFilterPlugin.PreFilter(ctx, state, pod)
	if !status.IsSkip() {
		p.metric.Inc()
	}
	return result, status
}

type instrumentedPreScorePlugin struct {
	framework.PreScorePlugin

	metric compbasemetrics.CounterMetric
}

var _ framework.PreScorePlugin = &instrumentedPreScorePlugin{}

func (p *instrumentedPreScorePlugin) PreScore(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
	status := p.PreScorePlugin.PreScore(ctx, state, pod, nodes)
	if !status.IsSkip() {
		p.metric.Inc()
	}
	return status
}

type instrumentedScorePlugin struct {
	framework.ScorePlugin

	metric compbasemetrics.CounterMetric
}

var _ framework.ScorePlugin = &instrumentedScorePlugin{}

func (p *instrumentedScorePlugin) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
	p.metric.Inc()
	return p.ScorePlugin.Score(ctx, state, pod, nodeName)
}
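
// Illustrative sketch (not part of the vendored source): the framework wraps configured
// plugins in these instrumented types so that each extension-point call bumps a counter,
// conceptually:
//
//	wrapped := &instrumentedFilterPlugin{
//		FilterPlugin: somePlugin,                 // hypothetical framework.FilterPlugin
//		metric:       somePluginExecutionCounter, // hypothetical compbasemetrics.CounterMetric
//	}
//	status := wrapped.Filter(ctx, state, pod, nodeInfo) // increments the counter, then delegates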
101
vendor/k8s.io/kubernetes/pkg/scheduler/framework/runtime/registry.go
generated
vendored
Normal file
@ -0,0 +1,101 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/json"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
	"sigs.k8s.io/yaml"
)

// PluginFactory is a function that builds a plugin.
type PluginFactory = func(ctx context.Context, configuration runtime.Object, f framework.Handle) (framework.Plugin, error)

// PluginFactoryWithFts is a function that builds a plugin with certain feature gates.
type PluginFactoryWithFts func(context.Context, runtime.Object, framework.Handle, plfeature.Features) (framework.Plugin, error)

// FactoryAdapter can be used to inject feature gates for a plugin that needs
// them when the caller expects the older PluginFactory method.
func FactoryAdapter(fts plfeature.Features, withFts PluginFactoryWithFts) PluginFactory {
	return func(ctx context.Context, plArgs runtime.Object, fh framework.Handle) (framework.Plugin, error) {
		return withFts(ctx, plArgs, fh, fts)
	}
}

// DecodeInto decodes configuration whose type is *runtime.Unknown to the interface into.
func DecodeInto(obj runtime.Object, into interface{}) error {
	if obj == nil {
		return nil
	}
	configuration, ok := obj.(*runtime.Unknown)
	if !ok {
		return fmt.Errorf("want args of type runtime.Unknown, got %T", obj)
	}
	if configuration.Raw == nil {
		return nil
	}

	switch configuration.ContentType {
	// If ContentType is empty, it means ContentTypeJSON by default.
	case runtime.ContentTypeJSON, "":
		return json.Unmarshal(configuration.Raw, into)
	case runtime.ContentTypeYAML:
		return yaml.Unmarshal(configuration.Raw, into)
	default:
		return fmt.Errorf("not supported content type %s", configuration.ContentType)
	}
}

// Registry is a collection of all available plugins. The framework uses a
// registry to enable and initialize configured plugins.
// All plugins must be in the registry before initializing the framework.
type Registry map[string]PluginFactory

// Register adds a new plugin to the registry. If a plugin with the same name
// exists, it returns an error.
func (r Registry) Register(name string, factory PluginFactory) error {
	if _, ok := r[name]; ok {
		return fmt.Errorf("a plugin named %v already exists", name)
	}
	r[name] = factory
	return nil
}

// Unregister removes an existing plugin from the registry. If no plugin with
// the provided name exists, it returns an error.
func (r Registry) Unregister(name string) error {
	if _, ok := r[name]; !ok {
		return fmt.Errorf("no plugin named %v exists", name)
	}
	delete(r, name)
	return nil
}

// Merge merges the provided registry into the current one.
func (r Registry) Merge(in Registry) error {
	for name, factory := range in {
		if err := r.Register(name, factory); err != nil {
			return err
		}
	}
	return nil
}
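
// Illustrative sketch (not part of the vendored source): a typical out-of-tree integration
// registers its factories and merges them into the in-tree registry; the plugin name, args
// type, and constructor below are hypothetical.
//
//	custom := Registry{
//		"HypotheticalPlugin": func(ctx context.Context, obj runtime.Object, h framework.Handle) (framework.Plugin, error) {
//			args := hypotheticalPluginArgs{}
//			if err := DecodeInto(obj, &args); err != nil {
//				return nil, err
//			}
//			return newHypotheticalPlugin(args, h) // hypothetical constructor
//		},
//	}
//	if err := inTreeRegistry.Merge(custom); err != nil {
//		return err
//	}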
165
vendor/k8s.io/kubernetes/pkg/scheduler/framework/runtime/waiting_pods_map.go
generated
vendored
Normal file
@ -0,0 +1,165 @@
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
	"fmt"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// waitingPodsMap is a thread-safe map used to maintain pods waiting in the permit phase.
type waitingPodsMap struct {
	pods map[types.UID]*waitingPod
	mu   sync.RWMutex
}

// NewWaitingPodsMap returns a new waitingPodsMap.
func NewWaitingPodsMap() *waitingPodsMap {
	return &waitingPodsMap{
		pods: make(map[types.UID]*waitingPod),
	}
}

// add a new WaitingPod to the map.
func (m *waitingPodsMap) add(wp *waitingPod) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.pods[wp.GetPod().UID] = wp
}

// remove a WaitingPod from the map.
func (m *waitingPodsMap) remove(uid types.UID) {
	m.mu.Lock()
	defer m.mu.Unlock()
	delete(m.pods, uid)
}

// get a WaitingPod from the map.
func (m *waitingPodsMap) get(uid types.UID) *waitingPod {
	m.mu.RLock()
	defer m.mu.RUnlock()
	return m.pods[uid]
}

// iterate acquires a read lock and iterates over the WaitingPods map.
func (m *waitingPodsMap) iterate(callback func(framework.WaitingPod)) {
	m.mu.RLock()
	defer m.mu.RUnlock()
	for _, v := range m.pods {
		callback(v)
	}
}

// waitingPod represents a pod waiting in the permit phase.
type waitingPod struct {
	pod            *v1.Pod
	pendingPlugins map[string]*time.Timer
	s              chan *framework.Status
	mu             sync.RWMutex
}

var _ framework.WaitingPod = &waitingPod{}

// newWaitingPod returns a new waitingPod instance.
func newWaitingPod(pod *v1.Pod, pluginsMaxWaitTime map[string]time.Duration) *waitingPod {
	wp := &waitingPod{
		pod: pod,
		// Allow() and Reject() calls are non-blocking. This property is guaranteed
		// by using a non-blocking send to this channel. This channel has a buffer of size 1
		// to ensure that a non-blocking send will not be ignored - a possible situation when
		// receiving from this channel happens after the non-blocking send.
		s: make(chan *framework.Status, 1),
	}

	wp.pendingPlugins = make(map[string]*time.Timer, len(pluginsMaxWaitTime))
	// The time.AfterFunc calls wp.Reject which iterates through the pendingPlugins map. Acquire the
	// lock here so that time.AfterFunc can only execute after newWaitingPod finishes.
	wp.mu.Lock()
	defer wp.mu.Unlock()
	for k, v := range pluginsMaxWaitTime {
		plugin, waitTime := k, v
		wp.pendingPlugins[plugin] = time.AfterFunc(waitTime, func() {
			msg := fmt.Sprintf("rejected due to timeout after waiting %v at plugin %v",
				waitTime, plugin)
			wp.Reject(plugin, msg)
		})
	}

	return wp
}

// GetPod returns a reference to the waiting pod.
func (w *waitingPod) GetPod() *v1.Pod {
	return w.pod
}

// GetPendingPlugins returns the list of names of pending permit plugins.
func (w *waitingPod) GetPendingPlugins() []string {
	w.mu.RLock()
	defer w.mu.RUnlock()
	plugins := make([]string, 0, len(w.pendingPlugins))
	for p := range w.pendingPlugins {
		plugins = append(plugins, p)
	}

	return plugins
}

// Allow declares that the waiting pod is allowed to be scheduled by the plugin named pluginName.
// If this is the last remaining plugin to allow, then a success signal is delivered
// to unblock the pod.
func (w *waitingPod) Allow(pluginName string) {
	w.mu.Lock()
	defer w.mu.Unlock()
	if timer, exist := w.pendingPlugins[pluginName]; exist {
		timer.Stop()
		delete(w.pendingPlugins, pluginName)
	}

	// Only signal success status after all plugins have allowed
	if len(w.pendingPlugins) != 0 {
		return
	}

	// The select clause works as a non-blocking send.
	// If there is no receiver, it's a no-op (default case).
	select {
	case w.s <- framework.NewStatus(framework.Success, ""):
	default:
	}
}

// Reject declares the waiting pod unschedulable.
func (w *waitingPod) Reject(pluginName, msg string) {
	w.mu.RLock()
	defer w.mu.RUnlock()
	for _, timer := range w.pendingPlugins {
		timer.Stop()
	}

	// The select clause works as a non-blocking send.
	// If there is no receiver, it's a no-op (default case).
	select {
	case w.s <- framework.NewStatus(framework.Unschedulable, msg).WithPlugin(pluginName):
	default:
	}
}
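
// Illustrative sketch (not part of the vendored source): a Permit plugin typically reaches a
// waiting pod through the framework handle and releases it once its external condition is met;
// the plugin name and condition below are hypothetical.
//
//	if wp := handle.GetWaitingPod(podUID); wp != nil {
//		if conditionSatisfied {
//			wp.Allow("HypotheticalPermitPlugin")
//		} else {
//			wp.Reject("HypotheticalPermitPlugin", "external condition not satisfied")
//		}
//	}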
1302
vendor/k8s.io/kubernetes/pkg/scheduler/framework/types.go
generated
vendored
Normal file
File diff suppressed because it is too large