mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-06-14 18:53:35 +00:00
rebase: update replaced k8s.io modules to v0.33.0
Signed-off-by: Niels de Vos <ndevos@ibm.com>
This commit is contained in:
committed by
mergify[bot]
parent
dd77e72800
commit
107407b44b
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/doc.go
generated
vendored
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/doc.go
generated
vendored
@ -17,4 +17,4 @@ limitations under the License.
|
||||
// +k8s:deepcopy-gen=package
|
||||
// +groupName=kubescheduler.config.k8s.io
|
||||
|
||||
package config // import "k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
package config
|
||||
|
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/types_pluginargs.go
generated
vendored
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/types_pluginargs.go
generated
vendored
@ -163,7 +163,7 @@ type VolumeBindingArgs struct {
|
||||
// 1) 0 for 0 utilization
|
||||
// 2) 10 for 100 utilization
|
||||
// All points must be sorted in increasing order by utilization.
|
||||
// +featureGate=VolumeCapacityPriority
|
||||
// +featureGate=StorageCapacityScoring
|
||||
// +optional
|
||||
Shape []UtilizationShapePoint
|
||||
}
|
||||
|
6
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/v1/defaults.go
generated
vendored
6
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/v1/defaults.go
generated
vendored
@ -192,15 +192,15 @@ func SetDefaults_VolumeBindingArgs(obj *configv1.VolumeBindingArgs) {
|
||||
if obj.BindTimeoutSeconds == nil {
|
||||
obj.BindTimeoutSeconds = ptr.To[int64](600)
|
||||
}
|
||||
if len(obj.Shape) == 0 && feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority) {
|
||||
if len(obj.Shape) == 0 && feature.DefaultFeatureGate.Enabled(features.StorageCapacityScoring) {
|
||||
obj.Shape = []configv1.UtilizationShapePoint{
|
||||
{
|
||||
Utilization: 0,
|
||||
Score: 0,
|
||||
Score: int32(config.MaxCustomPriorityScore),
|
||||
},
|
||||
{
|
||||
Utilization: 100,
|
||||
Score: int32(config.MaxCustomPriorityScore),
|
||||
Score: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/v1/doc.go
generated
vendored
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/apis/config/v1/doc.go
generated
vendored
@ -21,4 +21,4 @@ limitations under the License.
|
||||
// +k8s:defaulter-gen-input=k8s.io/kube-scheduler/config/v1
|
||||
// +groupName=kubescheduler.config.k8s.io
|
||||
|
||||
package v1 // import "k8s.io/kubernetes/pkg/scheduler/apis/config/v1"
|
||||
package v1
|
||||
|
@ -261,13 +261,13 @@ func ValidateNodeAffinityArgs(path *field.Path, args *config.NodeAffinityArgs) e
|
||||
|
||||
// VolumeBindingArgsValidationOptions contains the different settings for validation.
|
||||
type VolumeBindingArgsValidationOptions struct {
|
||||
AllowVolumeCapacityPriority bool
|
||||
AllowStorageCapacityScoring bool
|
||||
}
|
||||
|
||||
// ValidateVolumeBindingArgs validates that VolumeBindingArgs are set correctly.
|
||||
func ValidateVolumeBindingArgs(path *field.Path, args *config.VolumeBindingArgs) error {
|
||||
return ValidateVolumeBindingArgsWithOptions(path, args, VolumeBindingArgsValidationOptions{
|
||||
AllowVolumeCapacityPriority: utilfeature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
|
||||
AllowStorageCapacityScoring: utilfeature.DefaultFeatureGate.Enabled(features.StorageCapacityScoring),
|
||||
})
|
||||
}
|
||||
|
||||
@ -279,13 +279,13 @@ func ValidateVolumeBindingArgsWithOptions(path *field.Path, args *config.VolumeB
|
||||
allErrs = append(allErrs, field.Invalid(path.Child("bindTimeoutSeconds"), args.BindTimeoutSeconds, "invalid BindTimeoutSeconds, should not be a negative value"))
|
||||
}
|
||||
|
||||
if opts.AllowVolumeCapacityPriority {
|
||||
if opts.AllowStorageCapacityScoring {
|
||||
allErrs = append(allErrs, validateFunctionShape(args.Shape, path.Child("shape"))...)
|
||||
} else if args.Shape != nil {
|
||||
// When the feature is off, return an error if the config is not nil.
|
||||
// This prevents unexpected configuration from taking effect when the
|
||||
// feature turns on in the future.
|
||||
allErrs = append(allErrs, field.Invalid(path.Child("shape"), args.Shape, "unexpected field `shape`, remove it or turn on the feature gate VolumeCapacityPriority"))
|
||||
allErrs = append(allErrs, field.Invalid(path.Child("shape"), args.Shape, "unexpected field `shape`, remove it or turn on the feature gate StorageCapacityScoring"))
|
||||
}
|
||||
return allErrs.ToAggregate()
|
||||
}
|
||||
|
8
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/cache/cache.go
generated
vendored
8
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/cache/cache.go
generated
vendored
@ -757,4 +757,12 @@ func (cache *cacheImpl) updateMetrics() {
|
||||
metrics.CacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
|
||||
metrics.CacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
|
||||
metrics.CacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
|
||||
|
||||
// we intentionally keep them with the deprecation and will remove at v1.34.
|
||||
//nolint:staticcheck
|
||||
metrics.SchedulerCacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
|
||||
//nolint:staticcheck
|
||||
metrics.SchedulerCacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
|
||||
//nolint:staticcheck
|
||||
metrics.SchedulerCacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
|
||||
}
|
||||
|
107
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/queue/active_queue.go
generated
vendored
107
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/queue/active_queue.go
generated
vendored
@ -20,6 +20,7 @@ import (
|
||||
"container/list"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
@ -61,14 +62,63 @@ type activeQueuer interface {
|
||||
// underLock() method should be used to protect these methods.
|
||||
type unlockedActiveQueuer interface {
|
||||
unlockedActiveQueueReader
|
||||
AddOrUpdate(pInfo *framework.QueuedPodInfo)
|
||||
// add adds a new pod to the activeQ.
|
||||
// The event should show which event triggered this addition and is used for the metric recording.
|
||||
// This method should be called in activeQueue.underLock().
|
||||
add(pInfo *framework.QueuedPodInfo, event string)
|
||||
}
|
||||
|
||||
// unlockedActiveQueueReader defines activeQ read-only methods that are not protected by the lock itself.
|
||||
// underLock() or underRLock() method should be used to protect these methods.
|
||||
type unlockedActiveQueueReader interface {
|
||||
Get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
|
||||
Has(pInfo *framework.QueuedPodInfo) bool
|
||||
// get returns the pod matching pInfo inside the activeQ.
|
||||
// Returns false if the pInfo doesn't exist in the queue.
|
||||
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
|
||||
get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
|
||||
// has returns if pInfo exists in the queue.
|
||||
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
|
||||
has(pInfo *framework.QueuedPodInfo) bool
|
||||
}
|
||||
|
||||
// unlockedActiveQueue defines activeQ methods that are not protected by the lock itself.
|
||||
// activeQueue.underLock() or activeQueue.underRLock() method should be used to protect these methods.
|
||||
type unlockedActiveQueue struct {
|
||||
queue *heap.Heap[*framework.QueuedPodInfo]
|
||||
}
|
||||
|
||||
func newUnlockedActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo]) *unlockedActiveQueue {
|
||||
return &unlockedActiveQueue{
|
||||
queue: queue,
|
||||
}
|
||||
}
|
||||
|
||||
// add adds a new pod to the activeQ.
|
||||
// The event should show which event triggered this addition and is used for the metric recording.
|
||||
// This method should be called in activeQueue.underLock().
|
||||
func (uaq *unlockedActiveQueue) add(pInfo *framework.QueuedPodInfo, event string) {
|
||||
uaq.queue.AddOrUpdate(pInfo)
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
|
||||
}
|
||||
|
||||
// get returns the pod matching pInfo inside the activeQ.
|
||||
// Returns false if the pInfo doesn't exist in the queue.
|
||||
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
|
||||
func (uaq *unlockedActiveQueue) get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool) {
|
||||
return uaq.queue.Get(pInfo)
|
||||
}
|
||||
|
||||
// has returns if pInfo exists in the queue.
|
||||
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
|
||||
func (uaq *unlockedActiveQueue) has(pInfo *framework.QueuedPodInfo) bool {
|
||||
return uaq.queue.Has(pInfo)
|
||||
}
|
||||
|
||||
// backoffQPopper defines method that is used to pop from the backoffQ when the activeQ is empty.
|
||||
type backoffQPopper interface {
|
||||
// popBackoff pops the pInfo from the podBackoffQ.
|
||||
popBackoff() (*framework.QueuedPodInfo, error)
|
||||
// len returns length of the podBackoffQ queue.
|
||||
lenBackoff() int
|
||||
}
|
||||
|
||||
// activeQueue implements activeQueuer. All of the fields have to be protected using the lock.
|
||||
@ -77,15 +127,21 @@ type activeQueue struct {
|
||||
// It protects activeQ, inFlightPods, inFlightEvents, schedulingCycle and closed fields.
|
||||
// Caution: DO NOT take "SchedulingQueue.lock" after taking "lock".
|
||||
// You should always take "SchedulingQueue.lock" first, otherwise the queue could end up in deadlock.
|
||||
// "lock" should not be taken after taking "nLock".
|
||||
// Correct locking order is: SchedulingQueue.lock > lock > nominator.nLock.
|
||||
// "lock" should not be taken after taking "backoffQueue.lock" or "nominator.nLock".
|
||||
// Correct locking order is: SchedulingQueue.lock > lock > backoffQueue.lock > nominator.nLock.
|
||||
lock sync.RWMutex
|
||||
|
||||
// activeQ is heap structure that scheduler actively looks at to find pods to
|
||||
// schedule. Head of heap is the highest priority pod.
|
||||
queue *heap.Heap[*framework.QueuedPodInfo]
|
||||
|
||||
// unlockedQueue is a wrapper of queue providing methods that are not locked themselves
|
||||
// and can be used in the underLock() or underRLock().
|
||||
unlockedQueue *unlockedActiveQueue
|
||||
|
||||
// cond is a condition that is notified when the pod is added to activeQ.
|
||||
// When SchedulerPopFromBackoffQ feature is enabled,
|
||||
// condition is also notified when the pod is added to backoffQ.
|
||||
// It is used with lock.
|
||||
cond sync.Cond
|
||||
|
||||
@ -125,15 +181,21 @@ type activeQueue struct {
|
||||
isSchedulingQueueHintEnabled bool
|
||||
|
||||
metricsRecorder metrics.MetricAsyncRecorder
|
||||
|
||||
// backoffQPopper is used to pop from backoffQ when activeQ is empty.
|
||||
// It is non-nil only when SchedulerPopFromBackoffQ feature is enabled.
|
||||
backoffQPopper backoffQPopper
|
||||
}
|
||||
|
||||
func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueueHintEnabled bool, metricRecorder metrics.MetricAsyncRecorder) *activeQueue {
|
||||
func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueueHintEnabled bool, metricRecorder metrics.MetricAsyncRecorder, backoffQPopper backoffQPopper) *activeQueue {
|
||||
aq := &activeQueue{
|
||||
queue: queue,
|
||||
inFlightPods: make(map[types.UID]*list.Element),
|
||||
inFlightEvents: list.New(),
|
||||
isSchedulingQueueHintEnabled: isSchedulingQueueHintEnabled,
|
||||
metricsRecorder: metricRecorder,
|
||||
unlockedQueue: newUnlockedActiveQueue(queue),
|
||||
backoffQPopper: backoffQPopper,
|
||||
}
|
||||
aq.cond.L = &aq.lock
|
||||
|
||||
@ -146,7 +208,7 @@ func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueu
|
||||
func (aq *activeQueue) underLock(fn func(unlockedActiveQ unlockedActiveQueuer)) {
|
||||
aq.lock.Lock()
|
||||
defer aq.lock.Unlock()
|
||||
fn(aq.queue)
|
||||
fn(aq.unlockedQueue)
|
||||
}
|
||||
|
||||
// underLock runs the fn function under the lock.RLock.
|
||||
@ -155,7 +217,7 @@ func (aq *activeQueue) underLock(fn func(unlockedActiveQ unlockedActiveQueuer))
|
||||
func (aq *activeQueue) underRLock(fn func(unlockedActiveQ unlockedActiveQueueReader)) {
|
||||
aq.lock.RLock()
|
||||
defer aq.lock.RUnlock()
|
||||
fn(aq.queue)
|
||||
fn(aq.unlockedQueue)
|
||||
}
|
||||
|
||||
// update updates the pod in activeQ if oldPodInfo is already in the queue.
|
||||
@ -191,7 +253,13 @@ func (aq *activeQueue) pop(logger klog.Logger) (*framework.QueuedPodInfo, error)
|
||||
}
|
||||
|
||||
func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo, error) {
|
||||
var pInfo *framework.QueuedPodInfo
|
||||
for aq.queue.Len() == 0 {
|
||||
// backoffQPopper is non-nil only if SchedulerPopFromBackoffQ feature is enabled.
|
||||
// In case of non-empty backoffQ, try popping from there.
|
||||
if aq.backoffQPopper != nil && aq.backoffQPopper.lenBackoff() != 0 {
|
||||
break
|
||||
}
|
||||
// When the queue is empty, invocation of Pop() is blocked until new item is enqueued.
|
||||
// When Close() is called, the p.closed is set and the condition is broadcast,
|
||||
// which causes this loop to continue and return from the Pop().
|
||||
@ -203,9 +271,18 @@ func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo
|
||||
}
|
||||
pInfo, err := aq.queue.Pop()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
if aq.backoffQPopper == nil {
|
||||
return nil, err
|
||||
}
|
||||
// Try to pop from backoffQ when activeQ is empty.
|
||||
pInfo, err = aq.backoffQPopper.popBackoff()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", framework.PopFromBackoffQ).Inc()
|
||||
}
|
||||
pInfo.Attempts++
|
||||
pInfo.BackoffExpiration = time.Time{}
|
||||
// In flight, no concurrent events yet.
|
||||
if aq.isSchedulingQueueHintEnabled {
|
||||
// If the pod is already in the map, we shouldn't overwrite the inFlightPods otherwise it'd lead to a memory leak.
|
||||
@ -354,6 +431,12 @@ func (aq *activeQueue) done(pod types.UID) {
|
||||
aq.lock.Lock()
|
||||
defer aq.lock.Unlock()
|
||||
|
||||
aq.unlockedDone(pod)
|
||||
}
|
||||
|
||||
// unlockedDone is used by the activeQueue internally and doesn't take the lock itself.
|
||||
// It assumes the lock is already taken outside before the method is called.
|
||||
func (aq *activeQueue) unlockedDone(pod types.UID) {
|
||||
inFlightPod, ok := aq.inFlightPods[pod]
|
||||
if !ok {
|
||||
// This Pod is already done()ed.
|
||||
@ -398,15 +481,15 @@ func (aq *activeQueue) done(pod types.UID) {
|
||||
|
||||
// close closes the activeQueue.
|
||||
func (aq *activeQueue) close() {
|
||||
aq.lock.Lock()
|
||||
defer aq.lock.Unlock()
|
||||
// We should call done() for all in-flight pods to clean up the inFlightEvents metrics.
|
||||
// It's safe even if the binding cycle running asynchronously calls done() afterwards
|
||||
// done() will just be a no-op.
|
||||
for pod := range aq.inFlightPods {
|
||||
aq.done(pod)
|
||||
aq.unlockedDone(pod)
|
||||
}
|
||||
aq.lock.Lock()
|
||||
aq.closed = true
|
||||
aq.lock.Unlock()
|
||||
}
|
||||
|
||||
// broadcast notifies the pop() operation that new pod(s) was added to the activeQueue.
|
||||
|
405
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/queue/backoff_queue.go
generated
vendored
Normal file
405
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/queue/backoff_queue.go
generated
vendored
Normal file
@ -0,0 +1,405 @@
|
||||
/*
|
||||
Copyright 2025 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package queue
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/backend/heap"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/metrics"
|
||||
"k8s.io/utils/clock"
|
||||
)
|
||||
|
||||
// backoffQOrderingWindowDuration is a duration of an ordering window in the podBackoffQ.
|
||||
// In each window, represented as a whole second, pods are ordered by priority.
|
||||
// It is the same as interval of flushing the pods from the podBackoffQ to the activeQ, to flush the whole windows there.
|
||||
// This works only if PopFromBackoffQ feature is enabled.
|
||||
// See the KEP-5142 (http://kep.k8s.io/5142) for rationale.
|
||||
const backoffQOrderingWindowDuration = time.Second
|
||||
|
||||
// backoffQueuer is a wrapper for backoffQ related operations.
|
||||
// Its methods that relies on the queues, take the lock inside.
|
||||
type backoffQueuer interface {
|
||||
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
|
||||
// If this returns true, the pod should not be re-tried.
|
||||
// If the pod backoff time is in the actual ordering window, it should still be backing off.
|
||||
isPodBackingoff(podInfo *framework.QueuedPodInfo) bool
|
||||
// popAllBackoffCompleted pops all pods from podBackoffQ and podErrorBackoffQ that completed backoff.
|
||||
popAllBackoffCompleted(logger klog.Logger) []*framework.QueuedPodInfo
|
||||
|
||||
// podInitialBackoffDuration returns initial backoff duration that pod can get.
|
||||
podInitialBackoffDuration() time.Duration
|
||||
// podMaxBackoffDuration returns maximum backoff duration that pod can get.
|
||||
podMaxBackoffDuration() time.Duration
|
||||
// waitUntilAlignedWithOrderingWindow waits until the time reaches a multiple of backoffQOrderingWindowDuration.
|
||||
// It then runs the f function at the backoffQOrderingWindowDuration interval using a ticker.
|
||||
// It's important to align the flushing time, because podBackoffQ's ordering is based on the windows
|
||||
// and whole windows have to be flushed at one time without a visible latency.
|
||||
waitUntilAlignedWithOrderingWindow(f func(), stopCh <-chan struct{})
|
||||
|
||||
// add adds the pInfo to backoffQueue.
|
||||
// The event should show which event triggered this addition and is used for the metric recording.
|
||||
// It also ensures that pInfo is not in both queues.
|
||||
add(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string)
|
||||
// update updates the pod in backoffQueue if oldPodInfo is already in the queue.
|
||||
// It returns new pod info if updated, nil otherwise.
|
||||
update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo
|
||||
// delete deletes the pInfo from backoffQueue.
|
||||
// It returns true if the pod was deleted.
|
||||
delete(pInfo *framework.QueuedPodInfo) bool
|
||||
// get returns the pInfo matching given pInfoLookup, if exists.
|
||||
get(pInfoLookup *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
|
||||
// has inform if pInfo exists in the queue.
|
||||
has(pInfo *framework.QueuedPodInfo) bool
|
||||
// list returns all pods that are in the queue.
|
||||
list() []*v1.Pod
|
||||
// len returns length of the queue.
|
||||
len() int
|
||||
}
|
||||
|
||||
// backoffQueue implements backoffQueuer and wraps two queues inside,
|
||||
// providing seamless access as if it were one queue.
|
||||
type backoffQueue struct {
|
||||
// lock synchronizes all operations related to backoffQ.
|
||||
// It protects both podBackoffQ and podErrorBackoffQ.
|
||||
// Caution: DO NOT take "SchedulingQueue.lock" or "activeQueue.lock" after taking "lock".
|
||||
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" first, otherwise the queue could end up in deadlock.
|
||||
// "lock" should not be taken after taking "nominator.nLock".
|
||||
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock > lock > nominator.nLock.
|
||||
lock sync.RWMutex
|
||||
|
||||
clock clock.WithTicker
|
||||
|
||||
// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
|
||||
// are popped from this heap before the scheduler looks at activeQ
|
||||
podBackoffQ *heap.Heap[*framework.QueuedPodInfo]
|
||||
// podErrorBackoffQ is a heap ordered by error backoff expiry. Pods which have completed backoff
|
||||
// are popped from this heap before the scheduler looks at activeQ
|
||||
podErrorBackoffQ *heap.Heap[*framework.QueuedPodInfo]
|
||||
|
||||
podInitialBackoff time.Duration
|
||||
podMaxBackoff time.Duration
|
||||
// activeQLessFn is used as an eventual less function if two backoff times are equal,
|
||||
// when the SchedulerPopFromBackoffQ feature is enabled.
|
||||
activeQLessFn framework.LessFunc
|
||||
|
||||
// isPopFromBackoffQEnabled indicates whether the feature gate SchedulerPopFromBackoffQ is enabled.
|
||||
isPopFromBackoffQEnabled bool
|
||||
}
|
||||
|
||||
func newBackoffQueue(clock clock.WithTicker, podInitialBackoffDuration time.Duration, podMaxBackoffDuration time.Duration, activeQLessFn framework.LessFunc, popFromBackoffQEnabled bool) *backoffQueue {
|
||||
bq := &backoffQueue{
|
||||
clock: clock,
|
||||
podInitialBackoff: podInitialBackoffDuration,
|
||||
podMaxBackoff: podMaxBackoffDuration,
|
||||
isPopFromBackoffQEnabled: popFromBackoffQEnabled,
|
||||
activeQLessFn: activeQLessFn,
|
||||
}
|
||||
podBackoffQLessFn := bq.lessBackoffCompleted
|
||||
if popFromBackoffQEnabled {
|
||||
podBackoffQLessFn = bq.lessBackoffCompletedWithPriority
|
||||
}
|
||||
bq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, podBackoffQLessFn, metrics.NewBackoffPodsRecorder())
|
||||
bq.podErrorBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, bq.lessBackoffCompleted, metrics.NewBackoffPodsRecorder())
|
||||
|
||||
return bq
|
||||
}
|
||||
|
||||
// podInitialBackoffDuration returns initial backoff duration that pod can get.
|
||||
func (bq *backoffQueue) podInitialBackoffDuration() time.Duration {
|
||||
return bq.podInitialBackoff
|
||||
}
|
||||
|
||||
// podMaxBackoffDuration returns maximum backoff duration that pod can get.
|
||||
func (bq *backoffQueue) podMaxBackoffDuration() time.Duration {
|
||||
return bq.podMaxBackoff
|
||||
}
|
||||
|
||||
// alignToWindow truncates the provided time to the podBackoffQ ordering window.
|
||||
// It returns the lowest possible timestamp in the window.
|
||||
func (bq *backoffQueue) alignToWindow(t time.Time) time.Time {
|
||||
if !bq.isPopFromBackoffQEnabled {
|
||||
return t
|
||||
}
|
||||
return t.Truncate(backoffQOrderingWindowDuration)
|
||||
}
|
||||
|
||||
// waitUntilAlignedWithOrderingWindow waits until the time reaches a multiple of backoffQOrderingWindowDuration.
|
||||
// It then runs the f function at the backoffQOrderingWindowDuration interval using a ticker.
|
||||
// It's important to align the flushing time, because podBackoffQ's ordering is based on the windows
|
||||
// and whole windows have to be flushed at one time without a visible latency.
|
||||
func (bq *backoffQueue) waitUntilAlignedWithOrderingWindow(f func(), stopCh <-chan struct{}) {
|
||||
now := bq.clock.Now()
|
||||
// Wait until the time reaches the multiple of backoffQOrderingWindowDuration.
|
||||
durationToNextWindow := bq.alignToWindow(now.Add(backoffQOrderingWindowDuration)).Sub(now)
|
||||
timer := bq.clock.NewTimer(durationToNextWindow)
|
||||
select {
|
||||
case <-stopCh:
|
||||
timer.Stop()
|
||||
return
|
||||
case <-timer.C():
|
||||
}
|
||||
|
||||
// Run a ticker to make sure the invocations of f function
|
||||
// are aligned with the backoffQ's ordering window.
|
||||
ticker := bq.clock.NewTicker(backoffQOrderingWindowDuration)
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
f()
|
||||
|
||||
// NOTE: b/c there is no priority selection in golang
|
||||
// it is possible for this to race, meaning we could
|
||||
// trigger ticker.C and stopCh, and ticker.C select falls through.
|
||||
// In order to mitigate we re-check stopCh at the beginning
|
||||
// of every loop to prevent extra executions of f().
|
||||
select {
|
||||
case <-stopCh:
|
||||
ticker.Stop()
|
||||
return
|
||||
case <-ticker.C():
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// lessBackoffCompletedWithPriority is a less function of podBackoffQ if PopFromBackoffQ feature is enabled.
|
||||
// It orders the pods in the same BackoffOrderingWindow the same as the activeQ will do to improve popping order from backoffQ when activeQ is empty.
|
||||
func (bq *backoffQueue) lessBackoffCompletedWithPriority(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
|
||||
bo1 := bq.getBackoffTime(pInfo1)
|
||||
bo2 := bq.getBackoffTime(pInfo2)
|
||||
if !bo1.Equal(bo2) {
|
||||
return bo1.Before(bo2)
|
||||
}
|
||||
// If the backoff time is the same, sort the pod in the same manner as activeQ does.
|
||||
return bq.activeQLessFn(pInfo1, pInfo2)
|
||||
}
|
||||
|
||||
// lessBackoffCompleted is a less function of podErrorBackoffQ.
|
||||
func (bq *backoffQueue) lessBackoffCompleted(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
|
||||
bo1 := bq.getBackoffTime(pInfo1)
|
||||
bo2 := bq.getBackoffTime(pInfo2)
|
||||
return bo1.Before(bo2)
|
||||
}
|
||||
|
||||
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
|
||||
// If this returns true, the pod should not be re-tried.
|
||||
// If the pod backoff time is in the actual ordering window, it should still be backing off.
|
||||
func (bq *backoffQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
|
||||
boTime := bq.getBackoffTime(podInfo)
|
||||
// Don't use After, because in case of windows equality we want to return true.
|
||||
return !boTime.Before(bq.alignToWindow(bq.clock.Now()))
|
||||
}
|
||||
|
||||
// getBackoffTime returns the time that podInfo completes backoff.
|
||||
// It caches the result in podInfo.BackoffExpiration and returns this value in subsequent calls.
|
||||
// The cache will be cleared when this pod is poped from the scheduling queue again (i.e., at activeQ's pop),
|
||||
// because of the fact that the backoff time is calculated based on podInfo.Attempts,
|
||||
// which doesn't get changed until the pod's scheduling is retried.
|
||||
func (bq *backoffQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
|
||||
if podInfo.Attempts == 0 {
|
||||
// Don't store backoff expiration if the duration is 0
|
||||
// to correctly handle isPodBackingoff, if pod should skip backoff, when it wasn't tried at all.
|
||||
return time.Time{}
|
||||
}
|
||||
if podInfo.BackoffExpiration.IsZero() {
|
||||
duration := bq.calculateBackoffDuration(podInfo)
|
||||
podInfo.BackoffExpiration = bq.alignToWindow(podInfo.Timestamp.Add(duration))
|
||||
}
|
||||
return podInfo.BackoffExpiration
|
||||
}
|
||||
|
||||
// calculateBackoffDuration is a helper function for calculating the backoffDuration
|
||||
// based on the number of attempts the pod has made.
|
||||
func (bq *backoffQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
|
||||
if podInfo.Attempts == 0 {
|
||||
// When the Pod hasn't experienced any scheduling attempts,
|
||||
// they aren't obliged to get a backoff penalty at all.
|
||||
return 0
|
||||
}
|
||||
|
||||
duration := bq.podInitialBackoff
|
||||
for i := 1; i < podInfo.Attempts; i++ {
|
||||
// Use subtraction instead of addition or multiplication to avoid overflow.
|
||||
if duration > bq.podMaxBackoff-duration {
|
||||
return bq.podMaxBackoff
|
||||
}
|
||||
duration += duration
|
||||
}
|
||||
return duration
|
||||
}
|
||||
|
||||
func (bq *backoffQueue) popAllBackoffCompletedWithQueue(logger klog.Logger, queue *heap.Heap[*framework.QueuedPodInfo]) []*framework.QueuedPodInfo {
|
||||
var poppedPods []*framework.QueuedPodInfo
|
||||
for {
|
||||
pInfo, ok := queue.Peek()
|
||||
if !ok || pInfo == nil {
|
||||
break
|
||||
}
|
||||
pod := pInfo.Pod
|
||||
if bq.isPodBackingoff(pInfo) {
|
||||
break
|
||||
}
|
||||
_, err := queue.Pop()
|
||||
if err != nil {
|
||||
logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
|
||||
break
|
||||
}
|
||||
poppedPods = append(poppedPods, pInfo)
|
||||
}
|
||||
return poppedPods
|
||||
}
|
||||
|
||||
// popAllBackoffCompleted pops all pods from podBackoffQ and podErrorBackoffQ that completed backoff.
|
||||
func (bq *backoffQueue) popAllBackoffCompleted(logger klog.Logger) []*framework.QueuedPodInfo {
|
||||
bq.lock.Lock()
|
||||
defer bq.lock.Unlock()
|
||||
|
||||
// Ensure both queues are called
|
||||
return append(bq.popAllBackoffCompletedWithQueue(logger, bq.podBackoffQ), bq.popAllBackoffCompletedWithQueue(logger, bq.podErrorBackoffQ)...)
|
||||
}
|
||||
|
||||
// add adds the pInfo to backoffQueue.
|
||||
// The event should show which event triggered this addition and is used for the metric recording.
|
||||
// It also ensures that pInfo is not in both queues.
|
||||
func (bq *backoffQueue) add(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) {
|
||||
bq.lock.Lock()
|
||||
defer bq.lock.Unlock()
|
||||
|
||||
// If pod has empty both unschedulable plugins and pending plugins,
|
||||
// it means that it failed because of error and should be moved to podErrorBackoffQ.
|
||||
if pInfo.UnschedulablePlugins.Len() == 0 && pInfo.PendingPlugins.Len() == 0 {
|
||||
bq.podErrorBackoffQ.AddOrUpdate(pInfo)
|
||||
// Ensure the pod is not in the podBackoffQ and report the error if it happens.
|
||||
err := bq.podBackoffQ.Delete(pInfo)
|
||||
if err == nil {
|
||||
logger.Error(nil, "BackoffQueue add() was called with a pod that was already in the podBackoffQ", "pod", klog.KObj(pInfo.Pod))
|
||||
return
|
||||
}
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
|
||||
return
|
||||
}
|
||||
bq.podBackoffQ.AddOrUpdate(pInfo)
|
||||
// Ensure the pod is not in the podErrorBackoffQ and report the error if it happens.
|
||||
err := bq.podErrorBackoffQ.Delete(pInfo)
|
||||
if err == nil {
|
||||
logger.Error(nil, "BackoffQueue add() was called with a pod that was already in the podErrorBackoffQ", "pod", klog.KObj(pInfo.Pod))
|
||||
return
|
||||
}
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
|
||||
}
|
||||
|
||||
// update updates the pod in backoffQueue if oldPodInfo is already in the queue.
|
||||
// It returns new pod info if updated, nil otherwise.
|
||||
func (bq *backoffQueue) update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo {
|
||||
bq.lock.Lock()
|
||||
defer bq.lock.Unlock()
|
||||
|
||||
// If the pod is in the backoff queue, update it there.
|
||||
if pInfo, exists := bq.podBackoffQ.Get(oldPodInfo); exists {
|
||||
_ = pInfo.Update(newPod)
|
||||
bq.podBackoffQ.AddOrUpdate(pInfo)
|
||||
return pInfo
|
||||
}
|
||||
// If the pod is in the error backoff queue, update it there.
|
||||
if pInfo, exists := bq.podErrorBackoffQ.Get(oldPodInfo); exists {
|
||||
_ = pInfo.Update(newPod)
|
||||
bq.podErrorBackoffQ.AddOrUpdate(pInfo)
|
||||
return pInfo
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// delete deletes the pInfo from backoffQueue.
|
||||
// It returns true if the pod was deleted.
|
||||
func (bq *backoffQueue) delete(pInfo *framework.QueuedPodInfo) bool {
|
||||
bq.lock.Lock()
|
||||
defer bq.lock.Unlock()
|
||||
|
||||
if bq.podBackoffQ.Delete(pInfo) == nil {
|
||||
return true
|
||||
}
|
||||
return bq.podErrorBackoffQ.Delete(pInfo) == nil
|
||||
}
|
||||
|
||||
// popBackoff pops the pInfo from the podBackoffQ.
|
||||
// It returns error if the queue is empty.
|
||||
// This doesn't pop the pods from the podErrorBackoffQ.
|
||||
func (bq *backoffQueue) popBackoff() (*framework.QueuedPodInfo, error) {
|
||||
bq.lock.Lock()
|
||||
defer bq.lock.Unlock()
|
||||
|
||||
return bq.podBackoffQ.Pop()
|
||||
}
|
||||
|
||||
// get returns the pInfo matching given pInfoLookup, if exists.
|
||||
func (bq *backoffQueue) get(pInfoLookup *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool) {
|
||||
bq.lock.RLock()
|
||||
defer bq.lock.RUnlock()
|
||||
|
||||
pInfo, exists := bq.podBackoffQ.Get(pInfoLookup)
|
||||
if exists {
|
||||
return pInfo, true
|
||||
}
|
||||
return bq.podErrorBackoffQ.Get(pInfoLookup)
|
||||
}
|
||||
|
||||
// has inform if pInfo exists in the queue.
|
||||
func (bq *backoffQueue) has(pInfo *framework.QueuedPodInfo) bool {
|
||||
bq.lock.RLock()
|
||||
defer bq.lock.RUnlock()
|
||||
|
||||
return bq.podBackoffQ.Has(pInfo) || bq.podErrorBackoffQ.Has(pInfo)
|
||||
}
|
||||
|
||||
// list returns all pods that are in the queue.
|
||||
func (bq *backoffQueue) list() []*v1.Pod {
|
||||
bq.lock.RLock()
|
||||
defer bq.lock.RUnlock()
|
||||
|
||||
var result []*v1.Pod
|
||||
for _, pInfo := range bq.podBackoffQ.List() {
|
||||
result = append(result, pInfo.Pod)
|
||||
}
|
||||
for _, pInfo := range bq.podErrorBackoffQ.List() {
|
||||
result = append(result, pInfo.Pod)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// len returns length of the queue.
|
||||
func (bq *backoffQueue) len() int {
|
||||
bq.lock.RLock()
|
||||
defer bq.lock.RUnlock()
|
||||
|
||||
return bq.podBackoffQ.Len() + bq.podErrorBackoffQ.Len()
|
||||
}
|
||||
|
||||
// lenBackoff returns length of the podBackoffQ.
|
||||
func (bq *backoffQueue) lenBackoff() int {
|
||||
bq.lock.RLock()
|
||||
defer bq.lock.RUnlock()
|
||||
|
||||
return bq.podBackoffQ.Len()
|
||||
}
|
6
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/queue/nominator.go
generated
vendored
6
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/queue/nominator.go
generated
vendored
@ -35,10 +35,10 @@ import (
|
||||
type nominator struct {
|
||||
// nLock synchronizes all operations related to nominator.
|
||||
// It should not be used anywhere else.
|
||||
// Caution: DO NOT take ("SchedulingQueue.lock" or "activeQueue.lock") after taking "nLock".
|
||||
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" first,
|
||||
// Caution: DO NOT take ("SchedulingQueue.lock" or "activeQueue.lock" or "backoffQueue.lock") after taking "nLock".
|
||||
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" and "backoffQueue.lock" first,
|
||||
// otherwise the nominator could end up in deadlock.
|
||||
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock > nLock.
|
||||
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock = backoffQueue.lock > nLock.
|
||||
nLock sync.RWMutex
|
||||
|
||||
// podLister is used to verify if the given pod is alive.
|
||||
|
276
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/queue/scheduling_queue.go
generated
vendored
276
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/backend/queue/scheduling_queue.go
generated
vendored
@ -132,6 +132,9 @@ type SchedulingQueue interface {
|
||||
PendingPods() ([]*v1.Pod, string)
|
||||
InFlightPods() []*v1.Pod
|
||||
PodsInActiveQ() []*v1.Pod
|
||||
// PodsInBackoffQ returns all the Pods in the backoffQ.
|
||||
PodsInBackoffQ() []*v1.Pod
|
||||
UnschedulablePods() []*v1.Pod
|
||||
}
|
||||
|
||||
// NewSchedulingQueue initializes a priority queue as a new scheduling queue.
|
||||
@ -155,24 +158,18 @@ type PriorityQueue struct {
|
||||
*nominator
|
||||
|
||||
stop chan struct{}
|
||||
clock clock.Clock
|
||||
clock clock.WithTicker
|
||||
|
||||
// lock takes precedence and should be taken first,
|
||||
// before any other locks in the queue (activeQueue.lock or nominator.nLock).
|
||||
// Correct locking order is: lock > activeQueue.lock > nominator.nLock.
|
||||
// before any other locks in the queue (activeQueue.lock or backoffQueue.lock or nominator.nLock).
|
||||
// Correct locking order is: lock > activeQueue.lock > backoffQueue.lock > nominator.nLock.
|
||||
lock sync.RWMutex
|
||||
|
||||
// pod initial backoff duration.
|
||||
podInitialBackoffDuration time.Duration
|
||||
// pod maximum backoff duration.
|
||||
podMaxBackoffDuration time.Duration
|
||||
// the maximum time a pod can stay in the unschedulablePods.
|
||||
podMaxInUnschedulablePodsDuration time.Duration
|
||||
|
||||
activeQ activeQueuer
|
||||
// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
|
||||
// are popped from this heap before the scheduler looks at activeQ
|
||||
podBackoffQ *heap.Heap[*framework.QueuedPodInfo]
|
||||
activeQ activeQueuer
|
||||
backoffQ backoffQueuer
|
||||
// unschedulablePods holds pods that have been tried and determined unschedulable.
|
||||
unschedulablePods *UnschedulablePods
|
||||
// moveRequestCycle caches the sequence number of scheduling cycle when we
|
||||
@ -195,6 +192,8 @@ type PriorityQueue struct {
|
||||
|
||||
// isSchedulingQueueHintEnabled indicates whether the feature gate for the scheduling queue is enabled.
|
||||
isSchedulingQueueHintEnabled bool
|
||||
// isPopFromBackoffQEnabled indicates whether the feature gate SchedulerPopFromBackoffQ is enabled.
|
||||
isPopFromBackoffQEnabled bool
|
||||
}
|
||||
|
||||
// QueueingHintFunction is the wrapper of QueueingHintFn that has PluginName.
|
||||
@ -213,7 +212,7 @@ type clusterEvent struct {
|
||||
}
|
||||
|
||||
type priorityQueueOptions struct {
|
||||
clock clock.Clock
|
||||
clock clock.WithTicker
|
||||
podInitialBackoffDuration time.Duration
|
||||
podMaxBackoffDuration time.Duration
|
||||
podMaxInUnschedulablePodsDuration time.Duration
|
||||
@ -228,7 +227,7 @@ type priorityQueueOptions struct {
|
||||
type Option func(*priorityQueueOptions)
|
||||
|
||||
// WithClock sets clock for PriorityQueue, the default clock is clock.RealClock.
|
||||
func WithClock(clock clock.Clock) Option {
|
||||
func WithClock(clock clock.WithTicker) Option {
|
||||
return func(o *priorityQueueOptions) {
|
||||
o.clock = clock
|
||||
}
|
||||
@ -331,14 +330,14 @@ func NewPriorityQueue(
|
||||
}
|
||||
|
||||
isSchedulingQueueHintEnabled := utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints)
|
||||
isPopFromBackoffQEnabled := utilfeature.DefaultFeatureGate.Enabled(features.SchedulerPopFromBackoffQ)
|
||||
|
||||
backoffQ := newBackoffQueue(options.clock, options.podInitialBackoffDuration, options.podMaxBackoffDuration, lessFn, isPopFromBackoffQEnabled)
|
||||
pq := &PriorityQueue{
|
||||
clock: options.clock,
|
||||
stop: make(chan struct{}),
|
||||
podInitialBackoffDuration: options.podInitialBackoffDuration,
|
||||
podMaxBackoffDuration: options.podMaxBackoffDuration,
|
||||
podMaxInUnschedulablePodsDuration: options.podMaxInUnschedulablePodsDuration,
|
||||
activeQ: newActiveQueue(heap.NewWithRecorder(podInfoKeyFunc, heap.LessFunc[*framework.QueuedPodInfo](lessFn), metrics.NewActivePodsRecorder()), isSchedulingQueueHintEnabled, options.metricsRecorder),
|
||||
backoffQ: backoffQ,
|
||||
unschedulablePods: newUnschedulablePods(metrics.NewUnschedulablePodsRecorder(), metrics.NewGatedPodsRecorder()),
|
||||
preEnqueuePluginMap: options.preEnqueuePluginMap,
|
||||
queueingHintMap: options.queueingHintMap,
|
||||
@ -346,19 +345,24 @@ func NewPriorityQueue(
|
||||
pluginMetricsSamplePercent: options.pluginMetricsSamplePercent,
|
||||
moveRequestCycle: -1,
|
||||
isSchedulingQueueHintEnabled: isSchedulingQueueHintEnabled,
|
||||
isPopFromBackoffQEnabled: isPopFromBackoffQEnabled,
|
||||
}
|
||||
pq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, pq.podsCompareBackoffCompleted, metrics.NewBackoffPodsRecorder())
|
||||
var backoffQPopper backoffQPopper
|
||||
if isPopFromBackoffQEnabled {
|
||||
backoffQPopper = backoffQ
|
||||
}
|
||||
pq.activeQ = newActiveQueue(heap.NewWithRecorder(podInfoKeyFunc, heap.LessFunc[*framework.QueuedPodInfo](lessFn), metrics.NewActivePodsRecorder()), isSchedulingQueueHintEnabled, options.metricsRecorder, backoffQPopper)
|
||||
pq.nsLister = informerFactory.Core().V1().Namespaces().Lister()
|
||||
pq.nominator = newPodNominator(options.podLister)
|
||||
|
||||
return pq
|
||||
}
|
||||
|
||||
// Run starts the goroutine to pump from podBackoffQ to activeQ
|
||||
// Run starts the goroutine to pump from backoffQ to activeQ
|
||||
func (p *PriorityQueue) Run(logger klog.Logger) {
|
||||
go wait.Until(func() {
|
||||
go p.backoffQ.waitUntilAlignedWithOrderingWindow(func() {
|
||||
p.flushBackoffQCompleted(logger)
|
||||
}, 1.0*time.Second, p.stop)
|
||||
}, p.stop)
|
||||
go wait.Until(func() {
|
||||
p.flushUnschedulablePodsLeftover(logger)
|
||||
}, 30*time.Second, p.stop)
|
||||
@ -553,25 +557,33 @@ func (p *PriorityQueue) runPreEnqueuePlugin(ctx context.Context, pl framework.Pr
|
||||
return s
|
||||
}
|
||||
|
||||
// moveToActiveQ tries to add pod to active queue and remove it from unschedulable and backoff queues.
|
||||
// It returns 2 parameters:
|
||||
// 1. a boolean flag to indicate whether the pod is added successfully.
|
||||
// 2. an error for the caller to act on.
|
||||
// moveToActiveQ tries to add the pod to the active queue.
|
||||
// If the pod doesn't pass PreEnqueue plugins, it gets added to unschedulablePods instead.
|
||||
// It returns a boolean flag to indicate whether the pod is added successfully.
|
||||
func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) bool {
|
||||
gatedBefore := pInfo.Gated
|
||||
pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
|
||||
// If SchedulerPopFromBackoffQ feature gate is enabled,
|
||||
// PreEnqueue plugins were called when the pod was added to the backoffQ.
|
||||
// Don't need to repeat it here when the pod is directly moved from the backoffQ.
|
||||
if !p.isPopFromBackoffQEnabled || event != framework.BackoffComplete {
|
||||
pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
|
||||
}
|
||||
|
||||
added := false
|
||||
p.activeQ.underLock(func(unlockedActiveQ unlockedActiveQueuer) {
|
||||
if pInfo.Gated {
|
||||
// Add the Pod to unschedulablePods if it's not passing PreEnqueuePlugins.
|
||||
if unlockedActiveQ.Has(pInfo) {
|
||||
if unlockedActiveQ.has(pInfo) {
|
||||
return
|
||||
}
|
||||
if p.podBackoffQ.Has(pInfo) {
|
||||
if p.backoffQ.has(pInfo) {
|
||||
return
|
||||
}
|
||||
p.unschedulablePods.addOrUpdate(pInfo)
|
||||
if p.unschedulablePods.get(pInfo.Pod) != nil {
|
||||
return
|
||||
}
|
||||
p.unschedulablePods.addOrUpdate(pInfo, event)
|
||||
logger.V(5).Info("Pod moved to an internal scheduling queue, because the pod is gated", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", unschedulablePods)
|
||||
return
|
||||
}
|
||||
if pInfo.InitialAttemptTimestamp == nil {
|
||||
@ -579,13 +591,12 @@ func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.Queue
|
||||
pInfo.InitialAttemptTimestamp = &now
|
||||
}
|
||||
|
||||
unlockedActiveQ.AddOrUpdate(pInfo)
|
||||
unlockedActiveQ.add(pInfo, event)
|
||||
added = true
|
||||
|
||||
p.unschedulablePods.delete(pInfo.Pod, gatedBefore)
|
||||
_ = p.podBackoffQ.Delete(pInfo) // Don't need to react when pInfo is not found.
|
||||
p.backoffQ.delete(pInfo)
|
||||
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", activeQ)
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
|
||||
if event == framework.EventUnscheduledPodAdd.Label() || event == framework.EventUnscheduledPodUpdate.Label() {
|
||||
p.AddNominatedPod(logger, pInfo.PodInfo, nil)
|
||||
}
|
||||
@ -593,6 +604,28 @@ func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.Queue
|
||||
return added
|
||||
}
|
||||
|
||||
// moveToBackoffQ tries to add the pod to the backoff queue.
|
||||
// If SchedulerPopFromBackoffQ feature gate is enabled and the pod doesn't pass PreEnqueue plugins, it gets added to unschedulablePods instead.
|
||||
// It returns a boolean flag to indicate whether the pod is added successfully.
|
||||
func (p *PriorityQueue) moveToBackoffQ(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) bool {
|
||||
// If SchedulerPopFromBackoffQ feature gate is enabled,
|
||||
// PreEnqueue plugins are called on inserting pods to the backoffQ,
|
||||
// not to call them again on popping out.
|
||||
if p.isPopFromBackoffQEnabled {
|
||||
pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
|
||||
if pInfo.Gated {
|
||||
if p.unschedulablePods.get(pInfo.Pod) == nil {
|
||||
p.unschedulablePods.addOrUpdate(pInfo, event)
|
||||
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", unschedulablePods)
|
||||
}
|
||||
return false
|
||||
}
|
||||
}
|
||||
p.backoffQ.add(logger, pInfo, event)
|
||||
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", backoffQ)
|
||||
return true
|
||||
}
|
||||
|
||||
// Add adds a pod to the active queue. It should be called only when a new pod
|
||||
// is added so there is no chance the pod is already in active/unschedulable/backoff queues
|
||||
func (p *PriorityQueue) Add(logger klog.Logger, pod *v1.Pod) {
|
||||
@ -641,10 +674,16 @@ func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
|
||||
// If the pod doesn't belong to unschedulablePods or backoffQ, don't activate it.
|
||||
// The pod can be already in activeQ.
|
||||
var exists bool
|
||||
pInfo, exists = p.podBackoffQ.Get(newQueuedPodInfoForLookup(pod))
|
||||
pInfo, exists = p.backoffQ.get(newQueuedPodInfoForLookup(pod))
|
||||
if !exists {
|
||||
return false
|
||||
}
|
||||
// Delete pod from the backoffQ now to make sure it won't be popped from the backoffQ
|
||||
// just before moving it to the activeQ
|
||||
if deleted := p.backoffQ.delete(pInfo); !deleted {
|
||||
// Pod was popped from the backoffQ in the meantime. Don't activate it.
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if pInfo == nil {
|
||||
@ -656,13 +695,6 @@ func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
|
||||
return p.moveToActiveQ(logger, pInfo, framework.ForceActivate)
|
||||
}
|
||||
|
||||
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
|
||||
// If this returns true, the pod should not be re-tried.
|
||||
func (p *PriorityQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
|
||||
boTime := p.getBackoffTime(podInfo)
|
||||
return boTime.After(p.clock.Now())
|
||||
}
|
||||
|
||||
// SchedulingCycle returns current scheduling cycle.
|
||||
func (p *PriorityQueue) SchedulingCycle() int64 {
|
||||
return p.activeQ.schedulingCycle()
|
||||
@ -712,7 +744,7 @@ func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger
|
||||
// addUnschedulableIfNotPresentWithoutQueueingHint inserts a pod that cannot be scheduled into
|
||||
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
|
||||
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
|
||||
// request, then the pod is put in `podBackoffQ`.
|
||||
// request, then the pod is put in `backoffQ`.
|
||||
// TODO: This function is called only when p.isSchedulingQueueHintEnabled is false,
|
||||
// and this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
|
||||
func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
|
||||
@ -736,13 +768,14 @@ func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger,
|
||||
// - No unschedulable plugins are associated with this Pod,
|
||||
// meaning something unusual (a temporal failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue.
|
||||
// In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
|
||||
p.podBackoffQ.AddOrUpdate(pInfo)
|
||||
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", backoffQ)
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", framework.ScheduleAttemptFailure).Inc()
|
||||
if added := p.moveToBackoffQ(logger, pInfo, framework.ScheduleAttemptFailure); added {
|
||||
if p.isPopFromBackoffQEnabled {
|
||||
p.activeQ.broadcast()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
p.unschedulablePods.addOrUpdate(pInfo)
|
||||
p.unschedulablePods.addOrUpdate(pInfo, framework.ScheduleAttemptFailure)
|
||||
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", unschedulablePods)
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", framework.ScheduleAttemptFailure).Inc()
|
||||
}
|
||||
|
||||
return nil
|
||||
@ -751,7 +784,7 @@ func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger,
|
||||
// AddUnschedulableIfNotPresent inserts a pod that cannot be scheduled into
|
||||
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
|
||||
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
|
||||
// request, then the pod is put in `podBackoffQ`.
|
||||
// request, then the pod is put in `backoffQ`.
|
||||
func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
|
||||
p.lock.Lock()
|
||||
defer p.lock.Unlock()
|
||||
@ -767,7 +800,7 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *
|
||||
if p.activeQ.has(pInfo) {
|
||||
return fmt.Errorf("Pod %v is already present in the active queue", klog.KObj(pod))
|
||||
}
|
||||
if p.podBackoffQ.Has(pInfo) {
|
||||
if p.backoffQ.has(pInfo) {
|
||||
return fmt.Errorf("Pod %v is already present in the backoff queue", klog.KObj(pod))
|
||||
}
|
||||
|
||||
@ -792,7 +825,7 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *
|
||||
// In this case, we try to requeue this Pod to activeQ/backoffQ.
|
||||
queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, framework.ScheduleAttemptFailure)
|
||||
logger.V(3).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", queue, "schedulingCycle", podSchedulingCycle, "hint", schedulingHint, "unschedulable plugins", rejectorPlugins)
|
||||
if queue == activeQ {
|
||||
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
|
||||
// When the Pod is moved to activeQ, need to let p.cond know so that the Pod will be pop()ed out.
|
||||
p.activeQ.broadcast()
|
||||
}
|
||||
@ -805,25 +838,12 @@ func (p *PriorityQueue) flushBackoffQCompleted(logger klog.Logger) {
|
||||
p.lock.Lock()
|
||||
defer p.lock.Unlock()
|
||||
activated := false
|
||||
for {
|
||||
pInfo, ok := p.podBackoffQ.Peek()
|
||||
if !ok || pInfo == nil {
|
||||
break
|
||||
}
|
||||
pod := pInfo.Pod
|
||||
if p.isPodBackingoff(pInfo) {
|
||||
break
|
||||
}
|
||||
_, err := p.podBackoffQ.Pop()
|
||||
if err != nil {
|
||||
logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
|
||||
break
|
||||
}
|
||||
podsCompletedBackoff := p.backoffQ.popAllBackoffCompleted(logger)
|
||||
for _, pInfo := range podsCompletedBackoff {
|
||||
if added := p.moveToActiveQ(logger, pInfo, framework.BackoffComplete); added {
|
||||
activated = true
|
||||
}
|
||||
}
|
||||
|
||||
if activated {
|
||||
p.activeQ.broadcast()
|
||||
}
|
||||
@ -928,10 +948,8 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
|
||||
}
|
||||
|
||||
// If the pod is in the backoff queue, update it there.
|
||||
if pInfo, exists := p.podBackoffQ.Get(oldPodInfo); exists {
|
||||
_ = pInfo.Update(newPod)
|
||||
if pInfo := p.backoffQ.update(newPod, oldPodInfo); pInfo != nil {
|
||||
p.UpdateNominatedPod(logger, oldPod, pInfo.PodInfo)
|
||||
p.podBackoffQ.AddOrUpdate(pInfo)
|
||||
return
|
||||
}
|
||||
}
|
||||
@ -953,7 +971,7 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
|
||||
logger.V(5).Info("Pod moved to an internal scheduling queue because the Pod is updated", "pod", klog.KObj(newPod), "event", evt.Label(), "queue", queue)
|
||||
p.unschedulablePods.delete(pInfo.Pod, gated)
|
||||
}
|
||||
if queue == activeQ {
|
||||
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
|
||||
p.activeQ.broadcast()
|
||||
break
|
||||
}
|
||||
@ -961,21 +979,26 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
|
||||
return
|
||||
}
|
||||
if isPodUpdated(oldPod, newPod) {
|
||||
if p.isPodBackingoff(pInfo) {
|
||||
p.podBackoffQ.AddOrUpdate(pInfo)
|
||||
p.unschedulablePods.delete(pInfo.Pod, gated)
|
||||
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", framework.EventUnscheduledPodUpdate.Label(), "queue", backoffQ)
|
||||
// Pod might have completed its backoff time while being in unschedulablePods,
|
||||
// so we should check isPodBackingoff before moving the pod to backoffQ.
|
||||
if p.backoffQ.isPodBackingoff(pInfo) {
|
||||
if added := p.moveToBackoffQ(logger, pInfo, framework.EventUnscheduledPodUpdate.Label()); added {
|
||||
p.unschedulablePods.delete(pInfo.Pod, gated)
|
||||
if p.isPopFromBackoffQEnabled {
|
||||
p.activeQ.broadcast()
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if added := p.moveToActiveQ(logger, pInfo, framework.BackoffComplete); added {
|
||||
if added := p.moveToActiveQ(logger, pInfo, framework.EventUnscheduledPodUpdate.Label()); added {
|
||||
p.activeQ.broadcast()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Pod update didn't make it schedulable, keep it in the unschedulable queue.
|
||||
p.unschedulablePods.addOrUpdate(pInfo)
|
||||
p.unschedulablePods.addOrUpdate(pInfo, framework.EventUnscheduledPodUpdate.Label())
|
||||
return
|
||||
}
|
||||
// If pod is not in any of the queues, we put it in the active queue.
|
||||
@ -992,12 +1015,14 @@ func (p *PriorityQueue) Delete(pod *v1.Pod) {
|
||||
defer p.lock.Unlock()
|
||||
p.DeleteNominatedPodIfExists(pod)
|
||||
pInfo := newQueuedPodInfoForLookup(pod)
|
||||
if err := p.activeQ.delete(pInfo); err != nil {
|
||||
// The item was probably not found in the activeQ.
|
||||
p.podBackoffQ.Delete(pInfo)
|
||||
if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
|
||||
p.unschedulablePods.delete(pod, pInfo.Gated)
|
||||
}
|
||||
if err := p.activeQ.delete(pInfo); err == nil {
|
||||
return
|
||||
}
|
||||
if deleted := p.backoffQ.delete(pInfo); deleted {
|
||||
return
|
||||
}
|
||||
if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
|
||||
p.unschedulablePods.delete(pod, pInfo.Gated)
|
||||
}
|
||||
}
|
||||
|
||||
@ -1065,28 +1090,24 @@ func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(logger klog.Logger, event
|
||||
// NOTE: this function assumes lock has been acquired in caller
|
||||
func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, strategy queueingStrategy, event string) string {
|
||||
if strategy == queueSkip {
|
||||
p.unschedulablePods.addOrUpdate(pInfo)
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
|
||||
p.unschedulablePods.addOrUpdate(pInfo, event)
|
||||
return unschedulablePods
|
||||
}
|
||||
|
||||
if strategy == queueAfterBackoff && p.isPodBackingoff(pInfo) {
|
||||
p.podBackoffQ.AddOrUpdate(pInfo)
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
|
||||
return backoffQ
|
||||
// Pod might have completed its backoff time while being in unschedulablePods,
|
||||
// so we should check isPodBackingoff before moving the pod to backoffQ.
|
||||
if strategy == queueAfterBackoff && p.backoffQ.isPodBackingoff(pInfo) {
|
||||
if added := p.moveToBackoffQ(logger, pInfo, event); added {
|
||||
return backoffQ
|
||||
}
|
||||
return unschedulablePods
|
||||
}
|
||||
|
||||
// Reach here if schedulingHint is QueueImmediately, or schedulingHint is Queue but the pod is not backing off.
|
||||
if added := p.moveToActiveQ(logger, pInfo, event); added {
|
||||
return activeQ
|
||||
}
|
||||
if pInfo.Gated {
|
||||
// In case the pod is gated, the Pod is pushed back to unschedulable Pods pool in moveToActiveQ.
|
||||
return unschedulablePods
|
||||
}
|
||||
|
||||
p.unschedulablePods.addOrUpdate(pInfo)
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", framework.ScheduleAttemptFailure).Inc()
|
||||
// Pod is gated. We don't have to push it back to the unschedulable queue, because moveToActiveQ should already have done that.
|
||||
return unschedulablePods
|
||||
}
|
||||
|
||||
@ -1128,7 +1149,7 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn
|
||||
p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated)
|
||||
queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, event.Label())
|
||||
logger.V(4).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event.Label(), "queue", queue, "hint", schedulingHint)
|
||||
if queue == activeQ {
|
||||
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
|
||||
activated = true
|
||||
}
|
||||
}
|
||||
@ -1180,6 +1201,20 @@ func (p *PriorityQueue) PodsInActiveQ() []*v1.Pod {
|
||||
return p.activeQ.list()
|
||||
}
|
||||
|
||||
// PodsInBackoffQ returns all the Pods in the backoffQ.
|
||||
func (p *PriorityQueue) PodsInBackoffQ() []*v1.Pod {
|
||||
return p.backoffQ.list()
|
||||
}
|
||||
|
||||
// UnschedulablePods returns all the pods in unschedulable state.
|
||||
func (p *PriorityQueue) UnschedulablePods() []*v1.Pod {
|
||||
var result []*v1.Pod
|
||||
for _, pInfo := range p.unschedulablePods.podInfoMap {
|
||||
result = append(result, pInfo.Pod)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
var pendingPodsSummary = "activeQ:%v; backoffQ:%v; unschedulablePods:%v"
|
||||
|
||||
// GetPod searches for a pod in the activeQ, backoffQ, and unschedulablePods.
|
||||
@ -1197,7 +1232,7 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
|
||||
},
|
||||
},
|
||||
}
|
||||
if pInfo, ok = p.podBackoffQ.Get(pInfoLookup); ok {
|
||||
if pInfo, ok = p.backoffQ.get(pInfoLookup); ok {
|
||||
return pInfo, true
|
||||
}
|
||||
if pInfo = p.unschedulablePods.get(pInfoLookup.Pod); pInfo != nil {
|
||||
@ -1205,7 +1240,7 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
|
||||
}
|
||||
|
||||
p.activeQ.underRLock(func(unlockedActiveQ unlockedActiveQueueReader) {
|
||||
pInfo, ok = unlockedActiveQ.Get(pInfoLookup)
|
||||
pInfo, ok = unlockedActiveQ.get(pInfoLookup)
|
||||
})
|
||||
return
|
||||
}
|
||||
@ -1216,15 +1251,15 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
|
||||
func (p *PriorityQueue) PendingPods() ([]*v1.Pod, string) {
|
||||
p.lock.RLock()
|
||||
defer p.lock.RUnlock()
|
||||
result := p.activeQ.list()
|
||||
result := p.PodsInActiveQ()
|
||||
activeQLen := len(result)
|
||||
for _, pInfo := range p.podBackoffQ.List() {
|
||||
result = append(result, pInfo.Pod)
|
||||
}
|
||||
backoffQPods := p.PodsInBackoffQ()
|
||||
backoffQLen := len(backoffQPods)
|
||||
result = append(result, backoffQPods...)
|
||||
for _, pInfo := range p.unschedulablePods.podInfoMap {
|
||||
result = append(result, pInfo.Pod)
|
||||
}
|
||||
return result, fmt.Sprintf(pendingPodsSummary, activeQLen, p.podBackoffQ.Len(), len(p.unschedulablePods.podInfoMap))
|
||||
return result, fmt.Sprintf(pendingPodsSummary, activeQLen, backoffQLen, len(p.unschedulablePods.podInfoMap))
|
||||
}
|
||||
|
||||
// Note: this function assumes the caller locks both p.lock.RLock and p.activeQ.getLock().RLock.
|
||||
@ -1232,7 +1267,7 @@ func (p *PriorityQueue) nominatedPodToInfo(np podRef, unlockedActiveQ unlockedAc
|
||||
pod := np.toPod()
|
||||
pInfoLookup := newQueuedPodInfoForLookup(pod)
|
||||
|
||||
queuedPodInfo, exists := unlockedActiveQ.Get(pInfoLookup)
|
||||
queuedPodInfo, exists := unlockedActiveQ.get(pInfoLookup)
|
||||
if exists {
|
||||
return queuedPodInfo.PodInfo
|
||||
}
|
||||
@ -1242,7 +1277,7 @@ func (p *PriorityQueue) nominatedPodToInfo(np podRef, unlockedActiveQ unlockedAc
|
||||
return queuedPodInfo.PodInfo
|
||||
}
|
||||
|
||||
queuedPodInfo, exists = p.podBackoffQ.Get(pInfoLookup)
|
||||
queuedPodInfo, exists = p.backoffQ.get(pInfoLookup)
|
||||
if exists {
|
||||
return queuedPodInfo.PodInfo
|
||||
}
|
||||
@ -1276,12 +1311,6 @@ func (p *PriorityQueue) NominatedPodsForNode(nodeName string) []*framework.PodIn
|
||||
return pods
|
||||
}
|
||||
|
||||
func (p *PriorityQueue) podsCompareBackoffCompleted(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
|
||||
bo1 := p.getBackoffTime(pInfo1)
|
||||
bo2 := p.getBackoffTime(pInfo2)
|
||||
return bo1.Before(bo2)
|
||||
}
|
||||
|
||||
// newQueuedPodInfo builds a QueuedPodInfo object.
|
||||
func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
|
||||
now := p.clock.Now()
|
||||
@ -1296,33 +1325,6 @@ func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framew
|
||||
}
|
||||
}
|
||||
|
||||
// getBackoffTime returns the time that podInfo completes backoff
|
||||
func (p *PriorityQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
|
||||
duration := p.calculateBackoffDuration(podInfo)
|
||||
backoffTime := podInfo.Timestamp.Add(duration)
|
||||
return backoffTime
|
||||
}
|
||||
|
||||
// calculateBackoffDuration is a helper function for calculating the backoffDuration
|
||||
// based on the number of attempts the pod has made.
|
||||
func (p *PriorityQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
|
||||
if podInfo.Attempts == 0 {
|
||||
// When the Pod hasn't experienced any scheduling attempts,
|
||||
// it isn't obliged to get a backoff penalty at all.
|
||||
return 0
|
||||
}
|
||||
|
||||
duration := p.podInitialBackoffDuration
|
||||
for i := 1; i < podInfo.Attempts; i++ {
|
||||
// Use subtraction instead of addition or multiplication to avoid overflow.
|
||||
if duration > p.podMaxBackoffDuration-duration {
|
||||
return p.podMaxBackoffDuration
|
||||
}
|
||||
duration += duration
|
||||
}
|
||||
return duration
|
||||
}
|
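The doubling-with-cap behaviour of calculateBackoffDuration can be sketched in isolation; the snippet below is an illustrative standalone program, and the 1s initial / 10s maximum backoff values are assumptions, not the scheduler's configured defaults.

package main

import (
	"fmt"
	"time"
)

// calculateBackoff mirrors the logic above: no penalty before the first
// attempt, then the initial duration doubled once per additional attempt,
// capped at max (the subtraction guards against overflow).
func calculateBackoff(attempts int, initial, max time.Duration) time.Duration {
	if attempts == 0 {
		return 0
	}
	d := initial
	for i := 1; i < attempts; i++ {
		if d > max-d {
			return max
		}
		d += d
	}
	return d
}

func main() {
	// With initial=1s and max=10s this prints 0s, 1s, 2s, 4s, 8s, 10s.
	for attempts := 0; attempts <= 5; attempts++ {
		fmt.Println(attempts, calculateBackoff(attempts, time.Second, 10*time.Second))
	}
}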
||||
|
||||
// UnschedulablePods holds pods that cannot be scheduled. This data structure
|
||||
// is used to implement unschedulablePods.
|
||||
type UnschedulablePods struct {
|
||||
@ -1335,7 +1337,8 @@ type UnschedulablePods struct {
|
||||
}
|
||||
|
||||
// addOrUpdate adds a pod to the unschedulable podInfoMap.
|
||||
func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
|
||||
// The event parameter records which event triggered the addition and is used for metric recording.
|
||||
func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo, event string) {
|
||||
podID := u.keyFunc(pInfo.Pod)
|
||||
if _, exists := u.podInfoMap[podID]; !exists {
|
||||
if pInfo.Gated && u.gatedRecorder != nil {
|
||||
@ -1343,6 +1346,7 @@ func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
|
||||
} else if !pInfo.Gated && u.unschedulableRecorder != nil {
|
||||
u.unschedulableRecorder.Inc()
|
||||
}
|
||||
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
|
||||
}
|
||||
u.podInfoMap[podID] = pInfo
|
||||
}
|
||||
|
27
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/eventhandlers.go
generated
vendored
27
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/eventhandlers.go
generated
vendored
@ -33,6 +33,7 @@ import (
|
||||
"k8s.io/client-go/tools/cache"
|
||||
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
|
||||
corev1nodeaffinity "k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
|
||||
resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
"k8s.io/kubernetes/pkg/scheduler/backend/queue"
|
||||
@ -160,6 +161,16 @@ func (sched *Scheduler) updatePodInSchedulingQueue(oldObj, newObj interface{}) {
|
||||
|
||||
logger.V(4).Info("Update event for unscheduled pod", "pod", klog.KObj(newPod))
|
||||
sched.SchedulingQueue.Update(logger, oldPod, newPod)
|
||||
if hasNominatedNodeNameChanged(oldPod, newPod) {
|
||||
// The pod's nominated node changed, so we treat it as if the pod was deleted from the old nominated node,
|
||||
// because the scheduler treats such a pod as if it was already assigned when scheduling lower or equal priority pods.
|
||||
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodDelete, oldPod, nil, getLEPriorityPreCheck(corev1helpers.PodPriority(oldPod)))
|
||||
}
|
||||
}
|
||||
|
||||
// hasNominatedNodeNameChanged returns true when nominated node name has existed but changed.
|
||||
func hasNominatedNodeNameChanged(oldPod, newPod *v1.Pod) bool {
|
||||
return len(oldPod.Status.NominatedNodeName) > 0 && oldPod.Status.NominatedNodeName != newPod.Status.NominatedNodeName
|
||||
}
|
||||
|
||||
func (sched *Scheduler) deletePodFromSchedulingQueue(obj interface{}) {
|
||||
@ -195,8 +206,21 @@ func (sched *Scheduler) deletePodFromSchedulingQueue(obj interface{}) {
|
||||
// If a waiting pod is rejected, it indicates it was previously assumed and we're
|
||||
// removing it from the scheduler cache. In this case, signal an AssignedPodDelete
|
||||
// event to immediately retry some unscheduled Pods.
|
||||
// Similarly when a pod that had nominated node is deleted, it can unblock scheduling of other pods,
|
||||
// because the lower or equal priority pods treat such a pod as if it was assigned.
|
||||
if fwk.RejectWaitingPod(pod.UID) {
|
||||
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodDelete, pod, nil, nil)
|
||||
} else if pod.Status.NominatedNodeName != "" {
|
||||
// Note that a nominated pod can fall into `RejectWaitingPod` case as well,
|
||||
// but in that case the `MoveAllToActiveOrBackoffQueue` already covered lower priority pods.
|
||||
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodDelete, pod, nil, getLEPriorityPreCheck(corev1helpers.PodPriority(pod)))
|
||||
}
|
||||
}
|
||||
|
||||
// getLEPriorityPreCheck is a PreEnqueueCheck function that selects only lower or equal priority pods.
|
||||
func getLEPriorityPreCheck(priority int32) queue.PreEnqueueCheck {
|
||||
return func(pod *v1.Pod) bool {
|
||||
return corev1helpers.PodPriority(pod) <= priority
|
||||
}
|
||||
}
|
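A hedged usage sketch of the pre-check built above, with the priority values below being made-up examples: only pods whose priority is less than or equal to the reference priority pass and are considered for requeueing.

package main

import "fmt"

// lePriorityPreCheck mirrors getLEPriorityPreCheck, simplified to take the
// pod priority directly instead of a *v1.Pod.
func lePriorityPreCheck(priority int32) func(int32) bool {
	return func(p int32) bool { return p <= priority }
}

func main() {
	check := lePriorityPreCheck(1000) // priority of the deleted/updated nominated pod
	for _, p := range []int32{500, 1000, 2000} {
		// 500 and 1000 pass the pre-check; 2000 does not.
		fmt.Println(p, check(p))
	}
}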
||||
|
||||
@ -343,6 +367,7 @@ func addAllEventHandlers(
|
||||
informerFactory informers.SharedInformerFactory,
|
||||
dynInformerFactory dynamicinformer.DynamicSharedInformerFactory,
|
||||
resourceClaimCache *assumecache.AssumeCache,
|
||||
resourceSliceTracker *resourceslicetracker.Tracker,
|
||||
gvkMap map[framework.EventResource]framework.ActionType,
|
||||
) error {
|
||||
var (
|
||||
@ -532,7 +557,7 @@ func addAllEventHandlers(
|
||||
}
|
||||
case framework.ResourceSlice:
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
|
||||
if handlerRegistration, err = informerFactory.Resource().V1beta1().ResourceSlices().Informer().AddEventHandler(
|
||||
if handlerRegistration, err = resourceSliceTracker.AddEventHandler(
|
||||
buildEvtResHandler(at, framework.ResourceSlice),
|
||||
); err != nil {
|
||||
return err
|
||||
|
4
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/events.go
generated
vendored
4
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/events.go
generated
vendored
@ -31,6 +31,8 @@ const (
|
||||
ScheduleAttemptFailure = "ScheduleAttemptFailure"
|
||||
// BackoffComplete is the event when a pod finishes backoff.
|
||||
BackoffComplete = "BackoffComplete"
|
||||
// PopFromBackoffQ is the event when a pod is popped from backoffQ when activeQ is empty.
|
||||
PopFromBackoffQ = "PopFromBackoffQ"
|
||||
// ForceActivate is the event when a pod is moved from unschedulablePods/backoffQ
|
||||
// to activeQ. Usually it's triggered by plugin implementations.
|
||||
ForceActivate = "ForceActivate"
|
||||
@ -130,7 +132,7 @@ func extractPodTolerationChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
|
||||
// Due to API validation, the user can add, but cannot modify or remove tolerations.
|
||||
// So, it's enough to just check the length of tolerations to notice the update.
|
||||
// And, any updates in tolerations could make Pod schedulable.
|
||||
return UpdatePodTolerations
|
||||
return UpdatePodToleration
|
||||
}
|
||||
|
||||
return none
|
||||
|
10
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/interface.go
generated
vendored
10
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/interface.go
generated
vendored
@ -26,8 +26,8 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
"github.com/google/go-cmp/cmp" //nolint:depguard
|
||||
"github.com/google/go-cmp/cmp/cmpopts" //nolint:depguard
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
@ -227,8 +227,8 @@ const (
|
||||
// Pending means that the scheduling process is finished successfully,
|
||||
// but the plugin wants to stop the scheduling cycle/binding cycle here.
|
||||
//
|
||||
// For example, the DRA plugin sometimes needs to wait for the external device driver
|
||||
// to provision the resource for the Pod.
|
||||
// For example, if your plugin has to notify an external component of the scheduling result,
|
||||
// and wait for it to complete something **before** binding.
|
||||
// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
|
||||
// because in this case, the scheduler decides where the Pod can go successfully,
|
||||
// but we need to wait for the external component to do something based on that scheduling result.
|
||||
@ -609,7 +609,7 @@ type ScorePlugin interface {
|
||||
// Score is called on each filtered node. It must return success and an integer
|
||||
// indicating the rank of the node. All scoring plugins must return success or
|
||||
// the pod will be rejected.
|
||||
Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)
|
||||
Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeInfo *NodeInfo) (int64, *Status)
|
||||
|
||||
// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if it does not.
|
||||
ScoreExtensions() ScoreExtensions
|
||||
|
9
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/listers.go
generated
vendored
9
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/listers.go
generated
vendored
@ -50,8 +50,13 @@ type SharedLister interface {
|
||||
|
||||
// ResourceSliceLister can be used to obtain ResourceSlices.
|
||||
type ResourceSliceLister interface {
|
||||
// List returns a list of all ResourceSlices.
|
||||
List() ([]*resourceapi.ResourceSlice, error)
|
||||
// ListWithDeviceTaintRules returns a list of all ResourceSlices with DeviceTaintRules applied
|
||||
// if the DRADeviceTaints feature is enabled, otherwise without them.
|
||||
//
|
||||
// k8s.io/dynamic-resource-allocation/resourceslice/tracker provides an implementation
|
||||
// of the necessary logic. That tracker can be instantiated as a replacement for
|
||||
// a normal ResourceSlice informer and provides a ListPatchedResourceSlices method.
|
||||
ListWithDeviceTaintRules() ([]*resourceapi.ResourceSlice, error)
|
||||
}
|
||||
|
||||
// DeviceClassLister can be used to obtain DeviceClasses.
|
||||
|
27
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/parallelize/parallelism.go
generated
vendored
27
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/parallelize/parallelism.go
generated
vendored
@ -51,15 +51,28 @@ func chunkSizeFor(n, parallelism int) int {
|
||||
return s
|
||||
}
|
||||
|
||||
// numWorkersForChunkSize returns the number of workers (goroutines)
|
||||
// that will be created in workqueue.ParallelizeUntil
|
||||
// for given parallelism, pieces and chunkSize values.
|
||||
func numWorkersForChunkSize(parallelism, pieces, chunkSize int) int {
|
||||
chunks := (pieces + chunkSize - 1) / chunkSize
|
||||
if chunks < parallelism {
|
||||
return chunks
|
||||
}
|
||||
return parallelism
|
||||
}
|
||||
|
||||
// Until is a wrapper around workqueue.ParallelizeUntil to use in scheduling algorithms.
|
||||
// A given operation will be a label that is recorded in the goroutine metric.
|
||||
func (p Parallelizer) Until(ctx context.Context, pieces int, doWorkPiece workqueue.DoWorkPieceFunc, operation string) {
|
||||
goroutinesMetric := metrics.Goroutines.WithLabelValues(operation)
|
||||
withMetrics := func(piece int) {
|
||||
goroutinesMetric.Inc()
|
||||
doWorkPiece(piece)
|
||||
goroutinesMetric.Dec()
|
||||
}
|
||||
chunkSize := chunkSizeFor(pieces, p.parallelism)
|
||||
workers := numWorkersForChunkSize(p.parallelism, pieces, chunkSize)
|
||||
|
||||
workqueue.ParallelizeUntil(ctx, p.parallelism, pieces, withMetrics, workqueue.WithChunkSize(chunkSizeFor(pieces, p.parallelism)))
|
||||
goroutinesMetric := metrics.Goroutines.WithLabelValues(operation)
|
||||
// Calling a single Add with the workers' count is more efficient than calling Inc or Dec for each work piece.
|
||||
// This approach improves performance of some plugins (affinity, topology spreading) as well as preemption.
|
||||
goroutinesMetric.Add(float64(workers))
|
||||
defer goroutinesMetric.Add(float64(-workers))
|
||||
|
||||
workqueue.ParallelizeUntil(ctx, p.parallelism, pieces, doWorkPiece, workqueue.WithChunkSize(chunkSize))
|
||||
}
|
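A minimal standalone sketch of the relationship the new metric accounting relies on: the goroutine count is known up front as min(parallelism, ceil(pieces/chunkSize)), so the gauge can be bumped once with Add rather than per work piece. The parallelism of 16 and chunk size of 10 below are arbitrary example values.

package main

import "fmt"

// numWorkers mirrors numWorkersForChunkSize above: never more goroutines than
// there are chunks of work, and never more than the configured parallelism.
func numWorkers(parallelism, pieces, chunkSize int) int {
	chunks := (pieces + chunkSize - 1) / chunkSize // ceiling division
	if chunks < parallelism {
		return chunks
	}
	return parallelism
}

func main() {
	// With parallelism 16 and chunk size 10:
	// 3 pieces -> 1 worker, 100 pieces -> 10 workers, 5000 pieces -> 16 workers.
	for _, pieces := range []int{3, 100, 5000} {
		fmt.Println(pieces, numWorkers(16, pieces, 10))
	}
}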
||||
|
@ -136,10 +136,14 @@ func (pl *DefaultPreemption) calculateNumCandidates(numNodes int32) int32 {
|
||||
return n
|
||||
}
|
||||
|
||||
// getOffsetRand is a dedicated random source for GetOffsetAndNumCandidates calls.
|
||||
// It defaults to rand.Int31n, but is a package variable so it can be overridden to make unit tests deterministic.
|
||||
var getOffsetRand = rand.Int31n
|
||||
|
||||
// GetOffsetAndNumCandidates chooses a random offset and calculates the number
|
||||
// of candidates that should be shortlisted for dry running preemption.
|
||||
func (pl *DefaultPreemption) GetOffsetAndNumCandidates(numNodes int32) (int32, int32) {
|
||||
return rand.Int31n(numNodes), pl.calculateNumCandidates(numNodes)
|
||||
return getOffsetRand(numNodes), pl.calculateNumCandidates(numNodes)
|
||||
}
|
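The pattern introduced above, a package-level function variable acting as a test seam, can be sketched generically; the names below are hypothetical and are not the scheduler's actual test code.

package main

import (
	"fmt"
	"math/rand"
)

// offsetRand is the seam: production code calls it, a test can replace it.
var offsetRand = rand.Int31n

func pickOffset(numNodes int32) int32 { return offsetRand(numNodes) }

func main() {
	// Simulate what a unit test would do: install a deterministic stub,
	// exercise the code, then restore the real source.
	orig := offsetRand
	offsetRand = func(int32) int32 { return 3 }
	fmt.Println(pickOffset(10)) // always prints 3 while the stub is installed
	offsetRand = orig
}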
||||
|
||||
// This function is not applicable for out-of-tree preemption plugins that exercise
|
||||
|
@ -27,6 +27,7 @@ import (
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/client-go/informers"
|
||||
resourcelisters "k8s.io/client-go/listers/resource/v1beta1"
|
||||
resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker"
|
||||
"k8s.io/dynamic-resource-allocation/structured"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
@ -44,8 +45,9 @@ type DefaultDRAManager struct {
|
||||
deviceClassLister *deviceClassLister
|
||||
}
|
||||
|
||||
func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, informerFactory informers.SharedInformerFactory) *DefaultDRAManager {
|
||||
func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, resourceSliceTracker *resourceslicetracker.Tracker, informerFactory informers.SharedInformerFactory) *DefaultDRAManager {
|
||||
logger := klog.FromContext(ctx)
|
||||
|
||||
manager := &DefaultDRAManager{
|
||||
resourceClaimTracker: &claimTracker{
|
||||
cache: claimsCache,
|
||||
@ -53,7 +55,7 @@ func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, in
|
||||
allocatedDevices: newAllocatedDevices(logger),
|
||||
logger: logger,
|
||||
},
|
||||
resourceSliceLister: &resourceSliceLister{sliceLister: informerFactory.Resource().V1beta1().ResourceSlices().Lister()},
|
||||
resourceSliceLister: &resourceSliceLister{tracker: resourceSliceTracker},
|
||||
deviceClassLister: &deviceClassLister{classLister: informerFactory.Resource().V1beta1().DeviceClasses().Lister()},
|
||||
}
|
||||
|
||||
@ -79,11 +81,11 @@ func (s *DefaultDRAManager) DeviceClasses() framework.DeviceClassLister {
|
||||
var _ framework.ResourceSliceLister = &resourceSliceLister{}
|
||||
|
||||
type resourceSliceLister struct {
|
||||
sliceLister resourcelisters.ResourceSliceLister
|
||||
tracker *resourceslicetracker.Tracker
|
||||
}
|
||||
|
||||
func (l *resourceSliceLister) List() ([]*resourceapi.ResourceSlice, error) {
|
||||
return l.sliceLister.List(labels.Everything())
|
||||
func (l *resourceSliceLister) ListWithDeviceTaintRules() ([]*resourceapi.ResourceSlice, error) {
|
||||
return l.tracker.ListPatchedResourceSlices()
|
||||
}
|
||||
|
||||
var _ framework.DeviceClassLister = &deviceClassLister{}
|
||||
|
@ -21,9 +21,10 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp" //nolint:depguard
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
resourceapi "k8s.io/api/resource/v1beta1"
|
||||
@ -101,9 +102,12 @@ type informationForClaim struct {
|
||||
|
||||
// DynamicResources is a plugin that ensures that ResourceClaims are allocated.
|
||||
type DynamicResources struct {
|
||||
enabled bool
|
||||
enableAdminAccess bool
|
||||
enableSchedulingQueueHint bool
|
||||
enabled bool
|
||||
enableAdminAccess bool
|
||||
enablePrioritizedList bool
|
||||
enableSchedulingQueueHint bool
|
||||
enablePartitionableDevices bool
|
||||
enableDeviceTaints bool
|
||||
|
||||
fh framework.Handle
|
||||
clientset kubernetes.Interface
|
||||
@ -119,9 +123,12 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
|
||||
}
|
||||
|
||||
pl := &DynamicResources{
|
||||
enabled: true,
|
||||
enableAdminAccess: fts.EnableDRAAdminAccess,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
enabled: true,
|
||||
enableAdminAccess: fts.EnableDRAAdminAccess,
|
||||
enableDeviceTaints: fts.EnableDRADeviceTaints,
|
||||
enablePrioritizedList: fts.EnableDRAPrioritizedList,
|
||||
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
|
||||
enablePartitionableDevices: fts.EnablePartitionableDevices,
|
||||
|
||||
fh: fh,
|
||||
clientset: fh.ClientSet(),
|
||||
@ -176,7 +183,7 @@ func (pl *DynamicResources) EventsToRegister(_ context.Context) ([]framework.Clu
|
||||
// A pod might be waiting for a class to get created or modified.
|
||||
{Event: framework.ClusterEvent{Resource: framework.DeviceClass, ActionType: framework.Add | framework.Update}},
|
||||
// Adding or updating a ResourceSlice might make a pod schedulable because new resources became available.
|
||||
{Event: framework.ClusterEvent{Resource: framework.ResourceSlice, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterResourceSliceChange},
|
||||
{Event: framework.ClusterEvent{Resource: framework.ResourceSlice, ActionType: framework.Add | framework.Update}},
|
||||
}
|
||||
|
||||
return events, nil
|
||||
@ -288,38 +295,6 @@ func (pl *DynamicResources) isSchedulableAfterPodChange(logger klog.Logger, pod
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// isSchedulableAfterResourceSliceChange is invoked for add and update slice events reported by
|
||||
// an informer. Such changes can make an unschedulable pod schedulable when the pod requests a device
|
||||
// and the change adds a suitable device.
|
||||
//
|
||||
// For the sake of faster execution and avoiding code duplication, isSchedulableAfterResourceSliceChange
|
||||
// only checks whether the pod uses claims. All of the more detailed checks are done in the scheduling
|
||||
// attempt.
|
||||
//
|
||||
// The delete claim event will not invoke it, so newObj will never be nil.
|
||||
func (pl *DynamicResources) isSchedulableAfterResourceSliceChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, modifiedSlice, err := schedutil.As[*resourceapi.ResourceSlice](oldObj, newObj)
|
||||
if err != nil {
|
||||
// Shouldn't happen.
|
||||
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterResourceSliceChange: %w", err)
|
||||
}
|
||||
|
||||
if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
|
||||
// This is not an unexpected error: we know that
|
||||
// foreachPodResourceClaim only returns errors for "not
|
||||
// schedulable".
|
||||
logger.V(6).Info("pod is not schedulable after resource slice change", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(modifiedSlice), "reason", err.Error())
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
// We could check what got changed in the slice, but right now that's likely to be
|
||||
// about the spec (there's no status yet...).
|
||||
// We could check whether all claims use classic DRA, but that doesn't seem worth it.
|
||||
// Let's assume that changing the slice may make the pod schedulable.
|
||||
logger.V(5).Info("ResourceSlice change might make pod schedulable", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(modifiedSlice))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
// podResourceClaims returns the ResourceClaims for all pod.Spec.PodResourceClaims.
|
||||
func (pl *DynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourceapi.ResourceClaim, error) {
|
||||
claims := make([]*resourceapi.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
|
||||
@ -437,20 +412,22 @@ func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
|
||||
// initial set of potential nodes before we ask the
|
||||
// driver(s) for information about the specific pod.
|
||||
for _, request := range claim.Spec.Devices.Requests {
|
||||
if request.DeviceClassName == "" {
|
||||
return nil, statusError(logger, fmt.Errorf("request %s: unsupported request type", request.Name))
|
||||
}
|
||||
|
||||
_, err := pl.draManager.DeviceClasses().Get(request.DeviceClassName)
|
||||
if err != nil {
|
||||
// If the class cannot be retrieved, allocation cannot proceed.
|
||||
if apierrors.IsNotFound(err) {
|
||||
// Here we mark the pod as "unschedulable", so it'll sleep in
|
||||
// the unschedulable queue until a DeviceClass event occurs.
|
||||
return nil, statusUnschedulable(logger, fmt.Sprintf("request %s: device class %s does not exist", request.Name, request.DeviceClassName))
|
||||
// The requirements differ depending on whether the request has a list of
|
||||
// alternative subrequests defined in the firstAvailable field.
|
||||
if len(request.FirstAvailable) == 0 {
|
||||
if status := pl.validateDeviceClass(logger, request.DeviceClassName, request.Name); status != nil {
|
||||
return nil, status
|
||||
}
|
||||
} else {
|
||||
if !pl.enablePrioritizedList {
|
||||
return nil, statusUnschedulable(logger, fmt.Sprintf("resource claim %s, request %s: has subrequests, but the DRAPrioritizedList feature is disabled", klog.KObj(claim), request.Name))
|
||||
}
|
||||
for _, subRequest := range request.FirstAvailable {
|
||||
qualRequestName := strings.Join([]string{request.Name, subRequest.Name}, "/")
|
||||
if status := pl.validateDeviceClass(logger, subRequest.DeviceClassName, qualRequestName); status != nil {
|
||||
return nil, status
|
||||
}
|
||||
}
|
||||
// Other error, retry with backoff.
|
||||
return nil, statusError(logger, fmt.Errorf("request %s: look up device class: %w", request.Name, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -475,11 +452,17 @@ func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
|
||||
if err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
slices, err := pl.draManager.ResourceSlices().List()
|
||||
slices, err := pl.draManager.ResourceSlices().ListWithDeviceTaintRules()
|
||||
if err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
allocator, err := structured.NewAllocator(ctx, pl.enableAdminAccess, allocateClaims, allAllocatedDevices, pl.draManager.DeviceClasses(), slices, pl.celCache)
|
||||
features := structured.Features{
|
||||
AdminAccess: pl.enableAdminAccess,
|
||||
PrioritizedList: pl.enablePrioritizedList,
|
||||
PartitionableDevices: pl.enablePartitionableDevices,
|
||||
DeviceTaints: pl.enableDeviceTaints,
|
||||
}
|
||||
allocator, err := structured.NewAllocator(ctx, features, allocateClaims, allAllocatedDevices, pl.draManager.DeviceClasses(), slices, pl.celCache)
|
||||
if err != nil {
|
||||
return nil, statusError(logger, err)
|
||||
}
|
||||
@ -491,6 +474,23 @@ func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (pl *DynamicResources) validateDeviceClass(logger klog.Logger, deviceClassName, requestName string) *framework.Status {
|
||||
if deviceClassName == "" {
|
||||
return statusError(logger, fmt.Errorf("request %s: unsupported request type", requestName))
|
||||
}
|
||||
|
||||
_, err := pl.draManager.DeviceClasses().Get(deviceClassName)
|
||||
if err != nil {
|
||||
// If the class cannot be retrieved, allocation cannot proceed.
|
||||
if apierrors.IsNotFound(err) {
|
||||
// Here we mark the pod as "unschedulable", so it'll sleep in
|
||||
// the unschedulable queue until a DeviceClass event occurs.
|
||||
return statusUnschedulable(logger, fmt.Sprintf("request %s: device class %s does not exist", requestName, deviceClassName))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// PreFilterExtensions returns prefilter extensions, pod add and remove.
|
||||
func (pl *DynamicResources) PreFilterExtensions() framework.PreFilterExtensions {
|
||||
return nil
|
||||
@ -608,6 +608,11 @@ func (pl *DynamicResources) PostFilter(ctx context.Context, cs *framework.CycleS
|
||||
if !pl.enabled {
|
||||
return nil, framework.NewStatus(framework.Unschedulable, "plugin disabled")
|
||||
}
|
||||
// If a Pod doesn't have any resource claims attached to it, there is no need for further processing.
|
||||
// Thus we provide a fast path for this case to avoid unnecessary computations.
|
||||
if len(pod.Spec.ResourceClaims) == 0 {
|
||||
return nil, framework.NewStatus(framework.Unschedulable)
|
||||
}
|
||||
logger := klog.FromContext(ctx)
|
||||
state, err := getStateData(cs)
|
||||
if err != nil {
|
||||
|
7
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature/feature.go
generated
vendored
7
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature/feature.go
generated
vendored
@ -20,9 +20,12 @@ package feature
|
||||
// This struct allows us to break the dependency of the plugins on
|
||||
// the internal k8s features pkg.
|
||||
type Features struct {
|
||||
EnableDRAPrioritizedList bool
|
||||
EnableDRAAdminAccess bool
|
||||
EnableDRADeviceTaints bool
|
||||
EnableDynamicResourceAllocation bool
|
||||
EnableVolumeCapacityPriority bool
|
||||
EnableVolumeAttributesClass bool
|
||||
EnableCSIMigrationPortworx bool
|
||||
EnableNodeInclusionPolicyInPodTopologySpread bool
|
||||
EnableMatchLabelKeysInPodTopologySpread bool
|
||||
EnableInPlacePodVerticalScaling bool
|
||||
@ -30,4 +33,6 @@ type Features struct {
|
||||
EnableSchedulingQueueHint bool
|
||||
EnableAsyncPreemption bool
|
||||
EnablePodLevelResources bool
|
||||
EnablePartitionableDevices bool
|
||||
EnableStorageCapacityScoring bool
|
||||
}
|
||||
|
@ -18,7 +18,6 @@ package imagelocality
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
@ -51,12 +50,7 @@ func (pl *ImageLocality) Name() string {
|
||||
}
|
||||
|
||||
// Score invoked at the score extension point.
|
||||
func (pl *ImageLocality) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
func (pl *ImageLocality) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
nodeInfos, err := pl.handle.SnapshotSharedLister().NodeInfos().List()
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(err)
|
||||
|
@ -89,15 +89,21 @@ type topologyPair struct {
|
||||
}
|
||||
type topologyToMatchedTermCount map[topologyPair]int64
|
||||
|
||||
func (m topologyToMatchedTermCount) append(toAppend topologyToMatchedTermCount) {
|
||||
for pair := range toAppend {
|
||||
m[pair] += toAppend[pair]
|
||||
func (m topologyToMatchedTermCount) merge(toMerge topologyToMatchedTermCount) {
|
||||
for pair, count := range toMerge {
|
||||
m[pair] += count
|
||||
}
|
||||
}
|
||||
|
||||
func (m topologyToMatchedTermCount) mergeWithList(toMerge topologyToMatchedTermCountList) {
|
||||
for _, tmtc := range toMerge {
|
||||
m[tmtc.topologyPair] += tmtc.count
|
||||
}
|
||||
}
|
||||
|
||||
func (m topologyToMatchedTermCount) clone() topologyToMatchedTermCount {
|
||||
copy := make(topologyToMatchedTermCount, len(m))
|
||||
copy.append(m)
|
||||
copy.merge(m)
|
||||
return copy
|
||||
}
|
||||
|
||||
@ -134,6 +140,48 @@ func (m topologyToMatchedTermCount) updateWithAntiAffinityTerms(terms []framewor
|
||||
}
|
||||
}
|
||||
|
||||
// topologyToMatchedTermCountList is a slice equivalent of topologyToMatchedTermCount map.
|
||||
// The use of slice improves the performance of PreFilter,
|
||||
// especially due to faster iteration when merging than with topologyToMatchedTermCount.
|
||||
type topologyToMatchedTermCountList []topologyPairCount
|
||||
|
||||
type topologyPairCount struct {
|
||||
topologyPair topologyPair
|
||||
count int64
|
||||
}
|
||||
|
||||
func (m *topologyToMatchedTermCountList) append(node *v1.Node, tk string, value int64) {
|
||||
if tv, ok := node.Labels[tk]; ok {
|
||||
pair := topologyPair{key: tk, value: tv}
|
||||
*m = append(*m, topologyPairCount{
|
||||
topologyPair: pair,
|
||||
count: value,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// appends the specified value to the topologyToMatchedTermCountList
|
||||
// for each affinity term if "targetPod" matches ALL terms.
|
||||
func (m *topologyToMatchedTermCountList) appendWithAffinityTerms(
|
||||
terms []framework.AffinityTerm, pod *v1.Pod, node *v1.Node, value int64) {
|
||||
if podMatchesAllAffinityTerms(terms, pod) {
|
||||
for _, t := range terms {
|
||||
m.append(node, t.TopologyKey, value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// appends the specified value to the topologyToMatchedTermCountList
|
||||
// for each anti-affinity term that matches the target pod.
|
||||
func (m *topologyToMatchedTermCountList) appendWithAntiAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, value int64) {
|
||||
// Check anti-affinity terms.
|
||||
for _, t := range terms {
|
||||
if t.Matches(pod, nsLabels) {
|
||||
m.append(node, t.TopologyKey, value)
|
||||
}
|
||||
}
|
||||
}
|
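A small self-contained sketch, with local stand-ins for the vendored types, of the append-then-merge flow that motivates the slice form: each node's goroutine only appends to its own slice, and the single-threaded merge at the end folds the slices into one map.

package main

import "fmt"

type topologyPair struct{ key, value string }

type pairCount struct {
	pair  topologyPair
	count int64
}

// mergeLists folds per-node slices into a single map, mirroring mergeWithList.
func mergeLists(lists ...[]pairCount) map[topologyPair]int64 {
	result := make(map[topologyPair]int64)
	for _, list := range lists {
		for _, pc := range list {
			result[pc.pair] += pc.count
		}
	}
	return result
}

func main() {
	nodeA := []pairCount{{topologyPair{"zone", "a"}, 1}}
	nodeB := []pairCount{{topologyPair{"zone", "a"}, 1}, {topologyPair{"zone", "b"}, 1}}
	// Prints map[{zone a}:2 {zone b}:1]
	fmt.Println(mergeLists(nodeA, nodeB))
}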
||||
|
||||
// returns true IFF the given pod matches all the given terms.
|
||||
func podMatchesAllAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod) bool {
|
||||
if len(terms) == 0 {
|
||||
@ -153,25 +201,26 @@ func podMatchesAllAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod) boo
|
||||
// 1. Whether it has PodAntiAffinity
|
||||
// 2. Whether any AntiAffinityTerm matches the incoming pod
|
||||
func (pl *InterPodAffinity) getExistingAntiAffinityCounts(ctx context.Context, pod *v1.Pod, nsLabels labels.Set, nodes []*framework.NodeInfo) topologyToMatchedTermCount {
|
||||
topoMaps := make([]topologyToMatchedTermCount, len(nodes))
|
||||
antiAffinityCountsList := make([]topologyToMatchedTermCountList, len(nodes))
|
||||
index := int32(-1)
|
||||
processNode := func(i int) {
|
||||
nodeInfo := nodes[i]
|
||||
node := nodeInfo.Node()
|
||||
|
||||
topoMap := make(topologyToMatchedTermCount)
|
||||
antiAffinityCounts := make(topologyToMatchedTermCountList, 0)
|
||||
for _, existingPod := range nodeInfo.PodsWithRequiredAntiAffinity {
|
||||
topoMap.updateWithAntiAffinityTerms(existingPod.RequiredAntiAffinityTerms, pod, nsLabels, node, 1)
|
||||
antiAffinityCounts.appendWithAntiAffinityTerms(existingPod.RequiredAntiAffinityTerms, pod, nsLabels, node, 1)
|
||||
}
|
||||
if len(topoMap) != 0 {
|
||||
topoMaps[atomic.AddInt32(&index, 1)] = topoMap
|
||||
if len(antiAffinityCounts) != 0 {
|
||||
antiAffinityCountsList[atomic.AddInt32(&index, 1)] = antiAffinityCounts
|
||||
}
|
||||
}
|
||||
pl.parallelizer.Until(ctx, len(nodes), processNode, pl.Name())
|
||||
|
||||
result := make(topologyToMatchedTermCount)
|
||||
// Traditional for loop is slightly faster in this case than its "for range" equivalent.
|
||||
for i := 0; i <= int(index); i++ {
|
||||
result.append(topoMaps[i])
|
||||
result.mergeWithList(antiAffinityCountsList[i])
|
||||
}
|
||||
|
||||
return result
|
||||
@ -188,20 +237,20 @@ func (pl *InterPodAffinity) getIncomingAffinityAntiAffinityCounts(ctx context.Co
|
||||
return affinityCounts, antiAffinityCounts
|
||||
}
|
||||
|
||||
affinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
|
||||
antiAffinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
|
||||
affinityCountsList := make([]topologyToMatchedTermCountList, len(allNodes))
|
||||
antiAffinityCountsList := make([]topologyToMatchedTermCountList, len(allNodes))
|
||||
index := int32(-1)
|
||||
processNode := func(i int) {
|
||||
nodeInfo := allNodes[i]
|
||||
node := nodeInfo.Node()
|
||||
|
||||
affinity := make(topologyToMatchedTermCount)
|
||||
antiAffinity := make(topologyToMatchedTermCount)
|
||||
affinity := make(topologyToMatchedTermCountList, 0)
|
||||
antiAffinity := make(topologyToMatchedTermCountList, 0)
|
||||
for _, existingPod := range nodeInfo.Pods {
|
||||
affinity.updateWithAffinityTerms(podInfo.RequiredAffinityTerms, existingPod.Pod, node, 1)
|
||||
affinity.appendWithAffinityTerms(podInfo.RequiredAffinityTerms, existingPod.Pod, node, 1)
|
||||
// The incoming pod's terms have the namespaceSelector merged into the namespaces, and so
|
||||
// here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
|
||||
antiAffinity.updateWithAntiAffinityTerms(podInfo.RequiredAntiAffinityTerms, existingPod.Pod, nil, node, 1)
|
||||
antiAffinity.appendWithAntiAffinityTerms(podInfo.RequiredAntiAffinityTerms, existingPod.Pod, nil, node, 1)
|
||||
}
|
||||
|
||||
if len(affinity) > 0 || len(antiAffinity) > 0 {
|
||||
@ -213,8 +262,8 @@ func (pl *InterPodAffinity) getIncomingAffinityAntiAffinityCounts(ctx context.Co
|
||||
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
|
||||
|
||||
for i := 0; i <= int(index); i++ {
|
||||
affinityCounts.append(affinityCountsList[i])
|
||||
antiAffinityCounts.append(antiAffinityCountsList[i])
|
||||
affinityCounts.mergeWithList(affinityCountsList[i])
|
||||
antiAffinityCounts.mergeWithList(antiAffinityCountsList[i])
|
||||
}
|
||||
|
||||
return affinityCounts, antiAffinityCounts
|
||||
|
66
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go
generated
vendored
66
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/plugin.go
generated
vendored
@ -211,7 +211,7 @@ func (pl *InterPodAffinity) isSchedulableAfterPodChange(logger klog.Logger, pod
|
||||
}
|
||||
|
||||
func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
_, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
|
||||
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, err
|
||||
}
|
||||
@ -221,11 +221,35 @@ func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
// When queuing this Pod:
|
||||
// - 1. A new node is added with the pod affinity topologyKey, the pod may become schedulable.
|
||||
// - 2. The original node does not have the pod affinity topologyKey but the modified node does, the pod may become schedulable.
|
||||
// - 3. Both the original and modified nodes have the pod affinity topologyKey and they differ, the pod may become schedulable.
|
||||
for _, term := range terms {
|
||||
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
|
||||
logger.V(5).Info("a node with matched pod affinity topologyKey was added/updated and it may make pod schedulable",
|
||||
if originalNode == nil {
|
||||
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
|
||||
// Case 1: A new node is added with the pod affinity topologyKey.
|
||||
logger.V(5).Info("A node with a matched pod affinity topologyKey was added and it may make the pod schedulable",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
continue
|
||||
}
|
||||
originalTopologyValue, originalHasKey := originalNode.Labels[term.TopologyKey]
|
||||
modifiedTopologyValue, modifiedHasKey := modifiedNode.Labels[term.TopologyKey]
|
||||
|
||||
if !originalHasKey && modifiedHasKey {
|
||||
// Case 2: Original node does not have the pod affinity topologyKey, but the modified node does.
|
||||
logger.V(5).Info("A node got updated to have the topology key of pod affinity, which may make the pod schedulable",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, err
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
if originalHasKey && modifiedHasKey && (originalTopologyValue != modifiedTopologyValue) {
|
||||
// Case 3: Both nodes have the pod affinity topologyKey, but the values differ.
|
||||
logger.V(5).Info("A node is moved to a different domain of pod affinity, which may make the pod schedulable",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
|
||||
@ -234,11 +258,39 @@ func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod
|
||||
return framework.Queue, err
|
||||
}
|
||||
|
||||
// When queuing this Pod:
|
||||
// - 1. A new node is added, the pod may become schedulable.
|
||||
// - 2. The original node have the pod anti-affinity topologyKey but the modified node does not, the pod may become schedulable.
|
||||
// - 3. Both the original and modified nodes have the pod anti-affinity topologyKey and they differ, the pod may become schedulable.
|
||||
for _, term := range antiTerms {
|
||||
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
|
||||
logger.V(5).Info("a node with matched pod anti-affinity topologyKey was added/updated and it may make pod schedulable",
|
||||
if originalNode == nil {
|
||||
// Case 1: A new node is added.
|
||||
// We always requeue the Pod with anti-affinity because:
|
||||
// - the node without the topology key is always allowed to have a Pod with anti-affinity.
|
||||
// - the addition of a node with the topology key makes Pods schedulable only when the topology it joins doesn't have any Pods that the Pod hates.
|
||||
// But it's out of scope for this QHint to check which Pods are in the topology this Node is in.
|
||||
logger.V(5).Info("A node was added and it may make the pod schedulable",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, err
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
originalTopologyValue, originalHasKey := originalNode.Labels[term.TopologyKey]
|
||||
modifiedTopologyValue, modifiedHasKey := modifiedNode.Labels[term.TopologyKey]
|
||||
|
||||
if originalHasKey && !modifiedHasKey {
|
||||
// Case 2: The original node has the pod anti-affinity topologyKey but the modified node does not.
|
||||
// Note that we don't need to check the opposite case (!originalHasKey && modifiedHasKey)
|
||||
// because the node without the topology label can always accept pods with pod anti-affinity.
|
||||
logger.V(5).Info("A node got updated to not have the topology key of pod anti-affinity, which may make the pod schedulable",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
if originalHasKey && modifiedHasKey && (originalTopologyValue != modifiedTopologyValue) {
|
||||
// Case 3: Both nodes have the pod anti-affinity topologyKey, but the values differ.
|
||||
logger.V(5).Info("A node is moved to a different domain of pod anti-affinity, which may make the pod schedulable",
|
||||
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
logger.V(5).Info("a node is added/updated but doesn't have any topologyKey which matches pod affinity/anti-affinity",
|
||||
|
10
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/scoring.go
generated
vendored
10
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity/scoring.go
generated
vendored
@ -130,10 +130,6 @@ func (pl *InterPodAffinity) PreScore(
|
||||
pod *v1.Pod,
|
||||
nodes []*framework.NodeInfo,
|
||||
) *framework.Status {
|
||||
if len(nodes) == 0 {
|
||||
// No nodes to score.
|
||||
return framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
if pl.sharedLister == nil {
|
||||
return framework.NewStatus(framework.Error, "empty shared lister in InterPodAffinity PreScore")
|
||||
@ -240,11 +236,7 @@ func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error)
|
||||
// The "score" returned in this function is the sum of weights got from cycleState which have its topologyKey matching with the node's labels.
|
||||
// it is normalized later.
|
||||
// Note: the returned "score" is positive for pod-affinity, and negative for pod-antiaffinity.
|
||||
func (pl *InterPodAffinity) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("failed to get node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
func (pl *InterPodAffinity) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
node := nodeInfo.Node()
|
||||
|
||||
s, err := getPreScoreState(cycleState)
|
||||
|
@ -238,9 +238,6 @@ func (s *preScoreState) Clone() framework.StateData {
|
||||
|
||||
// PreScore builds and writes cycle state used by Score and NormalizeScore.
|
||||
func (pl *NodeAffinity) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
|
||||
if len(nodes) == 0 {
|
||||
return nil
|
||||
}
|
||||
preferredNodeAffinity, err := getPodPreferredNodeAffinity(pod)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
@ -259,12 +256,7 @@ func (pl *NodeAffinity) PreScore(ctx context.Context, cycleState *framework.Cycl
|
||||
// Score returns the sum of the weights of the terms that match the Node.
|
||||
// Terms came from the Pod .spec.affinity.nodeAffinity and from the plugin's
|
||||
// default affinity.
|
||||
func (pl *NodeAffinity) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
func (pl *NodeAffinity) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
node := nodeInfo.Node()
|
||||
|
||||
var count int64
|
||||
|
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports/node_ports.go
generated
vendored
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports/node_ports.go
generated
vendored
@ -143,7 +143,7 @@ func (pl *NodePorts) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Po
|
||||
}
|
||||
|
||||
// If the deleted pod is unscheduled, it doesn't make the target pod schedulable.
|
||||
if deletedPod.Spec.NodeName == "" {
|
||||
if deletedPod.Spec.NodeName == "" && deletedPod.Status.NominatedNodeName == "" {
|
||||
logger.V(4).Info("the deleted pod is unscheduled and it doesn't make the target pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
@ -63,8 +63,15 @@ func (s *balancedAllocationPreScoreState) Clone() framework.StateData {
|
||||
|
||||
// PreScore calculates incoming pod's resource requests and writes them to the cycle state used.
|
||||
func (ba *BalancedAllocation) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
|
||||
podRequests := ba.calculatePodResourceRequestList(pod, ba.resources)
|
||||
if ba.isBestEffortPod(podRequests) {
|
||||
// Skip BalancedAllocation scoring for best-effort pods to
|
||||
// prevent a large number of pods from being scheduled to the same node.
|
||||
// See https://github.com/kubernetes/kubernetes/issues/129138 for details.
|
||||
return framework.NewStatus(framework.Skip)
|
||||
}
|
||||
state := &balancedAllocationPreScoreState{
|
||||
podRequests: ba.calculatePodResourceRequestList(pod, ba.resources),
|
||||
podRequests: podRequests,
|
||||
}
|
||||
cycleState.Write(balancedAllocationPreScoreStateKey, state)
|
||||
return nil
|
||||
@ -89,15 +96,13 @@ func (ba *BalancedAllocation) Name() string {
|
||||
}
|
||||
|
||||
// Score invoked at the score extension point.
|
||||
func (ba *BalancedAllocation) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := ba.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
func (ba *BalancedAllocation) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
s, err := getBalancedAllocationPreScoreState(state)
|
||||
if err != nil {
|
||||
s = &balancedAllocationPreScoreState{podRequests: ba.calculatePodResourceRequestList(pod, ba.resources)}
|
||||
if ba.isBestEffortPod(s.podRequests) {
|
||||
return 0, nil
|
||||
}
|
||||
}
|
||||
|
||||
// ba.score favors nodes with balanced resource usage rate.
|
||||
@ -127,10 +132,12 @@ func NewBalancedAllocation(_ context.Context, baArgs runtime.Object, h framework
|
||||
return &BalancedAllocation{
|
||||
handle: h,
|
||||
resourceAllocationScorer: resourceAllocationScorer{
|
||||
Name: BalancedAllocationName,
|
||||
scorer: balancedResourceScorer,
|
||||
useRequested: true,
|
||||
resources: args.Resources,
|
||||
Name: BalancedAllocationName,
|
||||
enableInPlacePodVerticalScaling: fts.EnableInPlacePodVerticalScaling,
|
||||
enablePodLevelResources: fts.EnablePodLevelResources,
|
||||
scorer: balancedResourceScorer,
|
||||
useRequested: true,
|
||||
resources: args.Resources,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
@ -157,7 +164,6 @@ func balancedResourceScorer(requested, allocable []int64) int64 {
|
||||
// Otherwise, set the std to zero is enough.
|
||||
if len(resourceToFractions) == 2 {
|
||||
std = math.Abs((resourceToFractions[0] - resourceToFractions[1]) / 2)
|
||||
|
||||
} else if len(resourceToFractions) > 2 {
|
||||
mean := totalFraction / float64(len(resourceToFractions))
|
||||
var sum float64
|
||||
|
11
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/fit.go
generated
vendored
11
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources/fit.go
generated
vendored
@ -21,7 +21,7 @@ import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp" //nolint:depguard
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
@ -294,7 +294,7 @@ func (f *Fit) isSchedulableAfterPodEvent(logger klog.Logger, pod *v1.Pod, oldObj
|
||||
}
|
||||
|
||||
if modifiedPod == nil {
|
||||
if originalPod.Spec.NodeName == "" {
|
||||
if originalPod.Spec.NodeName == "" && originalPod.Status.NominatedNodeName == "" {
|
||||
logger.V(5).Info("the deleted pod was unscheduled and it wouldn't make the unscheduled pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
@ -579,12 +579,7 @@ func fitsRequest(podRequest *preFilterState, nodeInfo *framework.NodeInfo, ignor
|
||||
}
|
||||
|
||||
// Score invoked at the Score extension point.
|
||||
func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := f.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
s, err := getPreScoreState(state)
|
||||
if err != nil {
|
||||
s = &preScoreState{
|
||||
|
@ -21,11 +21,9 @@ import (
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
resourcehelper "k8s.io/component-helpers/resource"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
|
||||
@ -36,7 +34,9 @@ type scorer func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer
|
||||
|
||||
// resourceAllocationScorer contains information to calculate resource allocation score.
|
||||
type resourceAllocationScorer struct {
|
||||
Name string
|
||||
Name string
|
||||
enableInPlacePodVerticalScaling bool
|
||||
enablePodLevelResources bool
|
||||
// used to decide whether to use Requested or NonZeroRequested for
|
||||
// cpu and memory.
|
||||
useRequested bool
|
||||
@ -118,9 +118,9 @@ func (r *resourceAllocationScorer) calculateResourceAllocatableRequest(logger kl
|
||||
func (r *resourceAllocationScorer) calculatePodResourceRequest(pod *v1.Pod, resourceName v1.ResourceName) int64 {
|
||||
|
||||
opts := resourcehelper.PodResourcesOptions{
|
||||
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
|
||||
UseStatusResources: r.enableInPlacePodVerticalScaling,
|
||||
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
|
||||
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
|
||||
SkipPodLevelResources: !r.enablePodLevelResources,
|
||||
}
|
||||
|
||||
if !r.useRequested {
|
||||
@ -146,3 +146,12 @@ func (r *resourceAllocationScorer) calculatePodResourceRequestList(pod *v1.Pod,
|
||||
}
|
||||
return podRequests
|
||||
}
|
||||
|
||||
func (r *resourceAllocationScorer) isBestEffortPod(podRequests []int64) bool {
|
||||
for _, request := range podRequests {
|
||||
if request != 0 {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
@ -17,7 +17,7 @@ limitations under the License.
|
||||
package noderesources
|
||||
|
||||
import (
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
"github.com/google/go-cmp/cmp/cmpopts" //nolint:depguard
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
|
@ -68,7 +68,7 @@ func (pl *NodeUnschedulable) EventsToRegister(_ context.Context) ([]framework.Cl
|
||||
// the scheduling queue uses Pod/Update Queueing Hint
|
||||
// to determine whether a Pod's update makes the Pod schedulable or not.
|
||||
// https://github.com/kubernetes/kubernetes/pull/122234
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodToleration}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
108
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go
generated
vendored
108
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go
generated
vendored
@ -63,7 +63,8 @@ type CSILimits struct {
|
||||
scLister storagelisters.StorageClassLister
|
||||
vaLister storagelisters.VolumeAttachmentLister
|
||||
|
||||
randomVolumeIDPrefix string
|
||||
enableCSIMigrationPortworx bool
|
||||
randomVolumeIDPrefix string
|
||||
|
||||
translator InTreeToCSITranslator
|
||||
}
|
||||
@ -87,9 +88,10 @@ func (pl *CSILimits) EventsToRegister(_ context.Context) ([]framework.ClusterEve
|
||||
// We don't register any `QueueingHintFn` intentionally
|
||||
// because any new CSINode could make pods that were rejected by CSI volumes schedulable.
|
||||
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add}},
|
||||
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSINodeUpdated},
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
|
||||
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}, QueueingHintFn: pl.isSchedulableAfterPVCAdded},
|
||||
{Event: framework.ClusterEvent{Resource: framework.VolumeAttachment, ActionType: framework.Delete}},
|
||||
{Event: framework.ClusterEvent{Resource: framework.VolumeAttachment, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterVolumeAttachmentDeleted},
|
||||
}, nil
|
||||
}
|
||||
|
||||
@ -103,7 +105,7 @@ func (pl *CSILimits) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Po
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
if deletedPod.Spec.NodeName == "" {
|
||||
if deletedPod.Spec.NodeName == "" && deletedPod.Status.NominatedNodeName == "" {
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
@ -149,6 +151,85 @@ func (pl *CSILimits) isSchedulableAfterPVCAdded(logger klog.Logger, pod *v1.Pod,
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
func (pl *CSILimits) isSchedulableAfterVolumeAttachmentDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
deletedVolumeAttachment, _, err := util.As[*storagev1.VolumeAttachment](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterVolumeAttachmentDeleted: %w", err)
|
||||
}
|
||||
|
||||
for _, vol := range pod.Spec.Volumes {
|
||||
// Check if the pod volume uses a PVC
|
||||
// If it does, return Queue
|
||||
if vol.PersistentVolumeClaim != nil {
|
||||
logger.V(5).Info("Pod volume uses PersistentVolumeClaim, which might make this pod schedulable due to VolumeAttachment deletion", "pod", klog.KObj(pod), "volumeAttachment", klog.KObj(deletedVolumeAttachment), "volume", vol.Name)
|
||||
return framework.Queue, nil
|
||||
}
|
||||
|
||||
if !pl.translator.IsInlineMigratable(&vol) {
|
||||
continue
|
||||
}
|
||||
|
||||
translatedPV, err := pl.translator.TranslateInTreeInlineVolumeToCSI(logger, &vol, pod.Namespace)
|
||||
if err != nil || translatedPV == nil {
|
||||
return framework.Queue, fmt.Errorf("converting volume(%s) from inline to csi: %w", vol.Name, err)
|
||||
}
|
||||
|
||||
if translatedPV.Spec.CSI != nil && deletedVolumeAttachment.Spec.Attacher == translatedPV.Spec.CSI.Driver {
|
||||
// deleted VolumeAttachment Attacher matches the translated PV CSI driver
|
||||
logger.V(5).Info("Pod volume is an Inline Migratable volume that matches the CSI driver, which might make this pod schedulable due to VolumeAttachment deletion",
|
||||
"pod", klog.KObj(pod), "volumeAttachment", klog.KObj(deletedVolumeAttachment),
|
||||
"volume", vol.Name, "csiDriver", translatedPV.Spec.CSI.Driver,
|
||||
)
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
|
||||
logger.V(5).Info("the VolumeAttachment deletion wouldn't make this pod schedulable because the pod has no volume related to a deleted VolumeAttachment",
|
||||
"pod", klog.KObj(pod), "volumeAttachment", klog.KObj(deletedVolumeAttachment))
|
||||
return framework.QueueSkip, nil
|
||||
}
|
||||
|
||||
func (pl *CSILimits) isSchedulableAfterCSINodeUpdated(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
|
||||
oldCSINode, newCSINode, err := util.As[*storagev1.CSINode](oldObj, newObj)
|
||||
if err != nil {
|
||||
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterCSINodeUpdated: %w", err)
|
||||
}
|
||||
|
||||
oldLimits := make(map[string]int32)
|
||||
for _, d := range oldCSINode.Spec.Drivers {
|
||||
var count int32
|
||||
if d.Allocatable != nil && d.Allocatable.Count != nil {
|
||||
count = *d.Allocatable.Count
|
||||
}
|
||||
oldLimits[d.Name] = count
|
||||
}
|
||||
|
||||
// Compare new driver limits vs. old. If limit increased, queue pod.
|
||||
for _, d := range newCSINode.Spec.Drivers {
|
||||
var oldLimit int32
|
||||
if val, exists := oldLimits[d.Name]; exists {
|
||||
oldLimit = val
|
||||
}
|
||||
newLimit := int32(0)
|
||||
if d.Allocatable != nil && d.Allocatable.Count != nil {
|
||||
newLimit = *d.Allocatable.Count
|
||||
}
|
||||
|
||||
if newLimit > oldLimit {
|
||||
logger.V(5).Info("CSINode driver limit increased, might make this pod schedulable",
|
||||
"pod", klog.KObj(pod),
|
||||
"driver", d.Name,
|
||||
"oldLimit", oldLimit,
|
||||
"newLimit", newLimit,
|
||||
)
|
||||
return framework.Queue, nil
|
||||
}
|
||||
}
|
||||
|
||||
// If no driver limit was increased, skip queueing.
|
||||
return framework.QueueSkip, nil
|
||||
}
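The new isSchedulableAfterCSINodeUpdated hint requeues a pod only when some CSI driver's allocatable volume count grew between the old and new CSINode objects. A hedged sketch of that comparison using plain maps in place of storagev1.CSINode (names and values are illustrative):

package main

import "fmt"

// limitIncreased reports whether any driver's allocatable count in newLimits
// is higher than it was in oldLimits; a driver missing from oldLimits counts as 0.
func limitIncreased(oldLimits, newLimits map[string]int32) bool {
	for driver, newLimit := range newLimits {
		if newLimit > oldLimits[driver] { // missing key yields the zero value
			return true
		}
	}
	return false
}

func main() {
	oldLimits := map[string]int32{"ebs.csi.example.com": 25}
	newLimits := map[string]int32{"ebs.csi.example.com": 39}
	// Prints true: the limit grew, so a pod rejected on volume limits is requeued.
	fmt.Println(limitIncreased(oldLimits, newLimits))
}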
// PreFilter invoked at the prefilter extension point
|
||||
//
|
||||
// If the pod doesn't have those types of volumes, we'll skip the Filter phase
|
||||
@ -339,7 +420,7 @@ func (pl *CSILimits) checkAttachableInlineVolume(logger klog.Logger, vol *v1.Vol
|
||||
if err != nil {
|
||||
return fmt.Errorf("looking up provisioner name for volume %s: %w", vol.Name, err)
|
||||
}
|
||||
if !isCSIMigrationOn(csiNode, inTreeProvisionerName) {
|
||||
if !isCSIMigrationOn(csiNode, inTreeProvisionerName, pl.enableCSIMigrationPortworx) {
|
||||
csiNodeName := ""
|
||||
if csiNode != nil {
|
||||
csiNodeName = csiNode.Name
|
||||
@ -400,7 +481,7 @@ func (pl *CSILimits) getCSIDriverInfo(logger klog.Logger, csiNode *storagev1.CSI
|
||||
return "", ""
|
||||
}
|
||||
|
||||
if !isCSIMigrationOn(csiNode, pluginName) {
|
||||
if !isCSIMigrationOn(csiNode, pluginName, pl.enableCSIMigrationPortworx) {
|
||||
logger.V(5).Info("CSI Migration of plugin is not enabled", "plugin", pluginName)
|
||||
return "", ""
|
||||
}
|
||||
@ -448,7 +529,7 @@ func (pl *CSILimits) getCSIDriverInfoFromSC(logger klog.Logger, csiNode *storage
|
||||
|
||||
provisioner := storageClass.Provisioner
|
||||
if pl.translator.IsMigratableIntreePluginByName(provisioner) {
|
||||
if !isCSIMigrationOn(csiNode, provisioner) {
|
||||
if !isCSIMigrationOn(csiNode, provisioner, pl.enableCSIMigrationPortworx) {
|
||||
logger.V(5).Info("CSI Migration of provisioner is not enabled", "provisioner", provisioner)
|
||||
return "", ""
|
||||
}
|
||||
@ -475,13 +556,14 @@ func NewCSI(_ context.Context, _ runtime.Object, handle framework.Handle, fts fe
|
||||
csiTranslator := csitrans.New()
|
||||
|
||||
return &CSILimits{
|
||||
csiNodeLister: csiNodesLister,
|
||||
pvLister: pvLister,
|
||||
pvcLister: pvcLister,
|
||||
scLister: scLister,
|
||||
vaLister: vaLister,
|
||||
randomVolumeIDPrefix: rand.String(32),
|
||||
translator: csiTranslator,
|
||||
csiNodeLister: csiNodesLister,
|
||||
pvLister: pvLister,
|
||||
pvcLister: pvcLister,
|
||||
scLister: scLister,
|
||||
vaLister: vaLister,
|
||||
enableCSIMigrationPortworx: fts.EnableCSIMigrationPortworx,
|
||||
randomVolumeIDPrefix: rand.String(32),
|
||||
translator: csiTranslator,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
@ -22,14 +22,12 @@ import (
|
||||
v1 "k8s.io/api/core/v1"
|
||||
storagev1 "k8s.io/api/storage/v1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
csilibplugins "k8s.io/csi-translation-lib/plugins"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
)
|
||||
|
||||
// isCSIMigrationOn returns a boolean value indicating whether
|
||||
// the CSI migration has been enabled for a particular storage plugin.
|
||||
func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string) bool {
|
||||
func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string, enableCSIMigrationPortworx bool) bool {
|
||||
if csiNode == nil || len(pluginName) == 0 {
|
||||
return false
|
||||
}
|
||||
@ -40,7 +38,7 @@ func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string) bool {
|
||||
case csilibplugins.AWSEBSInTreePluginName:
|
||||
return true
|
||||
case csilibplugins.PortworxVolumePluginName:
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx) {
|
||||
if !enableCSIMigrationPortworx {
|
||||
return false
|
||||
}
|
||||
case csilibplugins.GCEPDInTreePluginName:
|
||||
|
@ -27,11 +27,6 @@ import (
|
||||
"k8s.io/utils/ptr"
|
||||
)
|
||||
|
||||
type topologyPair struct {
|
||||
key string
|
||||
value string
|
||||
}
|
||||
|
||||
// topologySpreadConstraint is an internal version for v1.TopologySpreadConstraint
|
||||
// and where the selector is parsed.
|
||||
// Fields are exported for comparison during testing.
|
||||
|
136
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread/filtering.go
generated
vendored
@ -19,6 +19,7 @@ package podtopologyspread
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"maps"
|
||||
"math"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
@ -31,7 +32,7 @@ import (
|
||||
const preFilterStateKey = "PreFilter" + Name
|
||||
|
||||
// preFilterState computed at PreFilter and used at Filter.
|
||||
// It combines TpKeyToCriticalPaths and TpPairToMatchNum to represent:
|
||||
// It combines CriticalPaths and TpValueToMatchNum to represent:
|
||||
// (1) critical paths where the least pods are matched on each spread constraint.
|
||||
// (2) number of pods matched on each spread constraint.
|
||||
// A nil preFilterState denotes it's not set at all (in PreFilter phase);
|
||||
@ -39,29 +40,23 @@ const preFilterStateKey = "PreFilter" + Name
|
||||
// Fields are exported for comparison during testing.
|
||||
type preFilterState struct {
|
||||
Constraints []topologySpreadConstraint
|
||||
// We record 2 critical paths instead of all critical paths here.
|
||||
// criticalPaths[0].MatchNum always holds the minimum matching number.
|
||||
// criticalPaths[1].MatchNum is always greater or equal to criticalPaths[0].MatchNum, but
|
||||
// CriticalPaths is a slice indexed by constraint index.
|
||||
// Per each entry, we record 2 critical paths instead of all critical paths.
|
||||
// CriticalPaths[i][0].MatchNum always holds the minimum matching number.
|
||||
// CriticalPaths[i][1].MatchNum is always greater or equal to CriticalPaths[i][0].MatchNum, but
|
||||
// it's not guaranteed to be the 2nd minimum match number.
|
||||
TpKeyToCriticalPaths map[string]*criticalPaths
|
||||
// TpKeyToDomainsNum is keyed with topologyKey, and valued with the number of domains.
|
||||
TpKeyToDomainsNum map[string]int
|
||||
// TpPairToMatchNum is keyed with topologyPair, and valued with the number of matching pods.
|
||||
TpPairToMatchNum map[topologyPair]int
|
||||
CriticalPaths []*criticalPaths
|
||||
// TpValueToMatchNum is a slice indexed by constraint index.
|
||||
// Each entry is keyed with topology value, and valued with the number of matching pods.
|
||||
TpValueToMatchNum []map[string]int
|
||||
}
|
||||
|
||||
// minMatchNum returns the global minimum for the calculation of skew while taking MinDomains into account.
|
||||
func (s *preFilterState) minMatchNum(tpKey string, minDomains int32) (int, error) {
|
||||
paths, ok := s.TpKeyToCriticalPaths[tpKey]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("failed to retrieve path by topology key")
|
||||
}
|
||||
func (s *preFilterState) minMatchNum(constraintID int, minDomains int32) (int, error) {
|
||||
paths := s.CriticalPaths[constraintID]
|
||||
|
||||
minMatchNum := paths[0].MatchNum
|
||||
domainsNum, ok := s.TpKeyToDomainsNum[tpKey]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("failed to retrieve the number of domains by topology key")
|
||||
}
|
||||
domainsNum := len(s.TpValueToMatchNum[constraintID])
|
||||
|
||||
if domainsNum < int(minDomains) {
|
||||
// When the number of eligible domains with matching topology keys is less than `minDomains`,
|
||||
@ -79,17 +74,15 @@ func (s *preFilterState) Clone() framework.StateData {
|
||||
}
|
||||
copy := preFilterState{
|
||||
// Constraints are shared because they don't change.
|
||||
Constraints: s.Constraints,
|
||||
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(s.TpKeyToCriticalPaths)),
|
||||
// The number of domains does not change as a result of AddPod/RemovePod methods on PreFilter Extensions
|
||||
TpKeyToDomainsNum: s.TpKeyToDomainsNum,
|
||||
TpPairToMatchNum: make(map[topologyPair]int, len(s.TpPairToMatchNum)),
|
||||
Constraints: s.Constraints,
|
||||
CriticalPaths: make([]*criticalPaths, len(s.CriticalPaths)),
|
||||
TpValueToMatchNum: make([]map[string]int, len(s.TpValueToMatchNum)),
|
||||
}
|
||||
for tpKey, paths := range s.TpKeyToCriticalPaths {
|
||||
copy.TpKeyToCriticalPaths[tpKey] = &criticalPaths{paths[0], paths[1]}
|
||||
for i, paths := range s.CriticalPaths {
|
||||
copy.CriticalPaths[i] = &criticalPaths{paths[0], paths[1]}
|
||||
}
|
||||
for tpPair, matchNum := range s.TpPairToMatchNum {
|
||||
copy.TpPairToMatchNum[tpPair] = matchNum
|
||||
for i, tpMap := range s.TpValueToMatchNum {
|
||||
copy.TpValueToMatchNum[i] = maps.Clone(tpMap)
|
||||
}
|
||||
return ©
|
||||
}
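The reworked preFilterState replaces the topology-key keyed maps with slices indexed by constraint position, so CriticalPaths[i] and TpValueToMatchNum[i] always describe Constraints[i]. A small sketch of that indexing with simplified stand-in types (hypothetical names, not the plugin's internals):

package main

import "fmt"

// constraintState is a trimmed-down stand-in for the per-constraint slices:
// matchNum[i] maps a topology value to its matching pod count for constraint i.
type constraintState struct {
	topologyKeys []string         // Constraints[i].TopologyKey
	matchNum     []map[string]int // TpValueToMatchNum[i]
}

// minMatch returns the smallest match count seen for constraint i,
// which is what CriticalPaths[i][0].MatchNum tracks incrementally.
func (s *constraintState) minMatch(i int) int {
	min := -1
	for _, n := range s.matchNum[i] {
		if min == -1 || n < min {
			min = n
		}
	}
	return min
}

func main() {
	s := &constraintState{
		topologyKeys: []string{"kubernetes.io/hostname", "topology.kubernetes.io/zone"},
		matchNum: []map[string]int{
			{"node-a": 2, "node-b": 0}, // constraint 0, per node
			{"zone-1": 3, "zone-2": 1}, // constraint 1, per zone
		},
	}
	fmt.Println(s.minMatch(0), s.minMatch(1)) // 0 1
}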
@ -200,7 +193,7 @@ func (pl *PodTopologySpread) updateWithPod(s *preFilterState, updatedPod, preemp
|
||||
}
|
||||
|
||||
podLabelSet := labels.Set(updatedPod.Labels)
|
||||
for _, constraint := range s.Constraints {
|
||||
for i, constraint := range s.Constraints {
|
||||
if !constraint.Selector.Matches(podLabelSet) {
|
||||
continue
|
||||
}
|
||||
@ -210,10 +203,9 @@ func (pl *PodTopologySpread) updateWithPod(s *preFilterState, updatedPod, preemp
|
||||
continue
|
||||
}
|
||||
|
||||
k, v := constraint.TopologyKey, node.Labels[constraint.TopologyKey]
|
||||
pair := topologyPair{key: k, value: v}
|
||||
s.TpPairToMatchNum[pair] += delta
|
||||
s.TpKeyToCriticalPaths[k].update(v, s.TpPairToMatchNum[pair])
|
||||
v := node.Labels[constraint.TopologyKey]
|
||||
s.TpValueToMatchNum[i][v] += delta
|
||||
s.CriticalPaths[i].update(v, s.TpValueToMatchNum[i][v])
|
||||
}
|
||||
}
|
||||
|
||||
@ -232,6 +224,12 @@ func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error
|
||||
return s, nil
|
||||
}
|
||||
|
||||
type topologyCount struct {
|
||||
topologyValue string
|
||||
constraintID int
|
||||
count int
|
||||
}
|
||||
|
||||
// calPreFilterState computes preFilterState describing how pods are spread on topologies.
|
||||
func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod) (*preFilterState, error) {
|
||||
constraints, err := pl.getConstraints(pod)
|
||||
@ -248,15 +246,18 @@ func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod)
|
||||
}
|
||||
|
||||
s := preFilterState{
|
||||
Constraints: constraints,
|
||||
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(constraints)),
|
||||
TpPairToMatchNum: make(map[topologyPair]int, sizeHeuristic(len(allNodes), constraints)),
|
||||
Constraints: constraints,
|
||||
CriticalPaths: make([]*criticalPaths, len(constraints)),
|
||||
TpValueToMatchNum: make([]map[string]int, len(constraints)),
|
||||
}
|
||||
for i := 0; i < len(constraints); i++ {
|
||||
s.TpValueToMatchNum[i] = make(map[string]int, sizeHeuristic(len(allNodes), constraints[i]))
|
||||
}
|
||||
|
||||
tpCountsByNode := make([]map[topologyPair]int, len(allNodes))
|
||||
tpCountsByNode := make([][]topologyCount, len(allNodes))
|
||||
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
|
||||
processNode := func(i int) {
|
||||
nodeInfo := allNodes[i]
|
||||
processNode := func(n int) {
|
||||
nodeInfo := allNodes[n]
|
||||
node := nodeInfo.Node()
|
||||
|
||||
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
|
||||
@ -272,38 +273,39 @@ func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod)
|
||||
return
|
||||
}
|
||||
|
||||
tpCounts := make(map[topologyPair]int, len(constraints))
|
||||
for _, c := range constraints {
|
||||
tpCounts := make([]topologyCount, 0, len(constraints))
|
||||
for i, c := range constraints {
|
||||
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
|
||||
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
|
||||
continue
|
||||
}
|
||||
|
||||
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
|
||||
value := node.Labels[c.TopologyKey]
|
||||
count := countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace)
|
||||
tpCounts[pair] = count
|
||||
tpCounts = append(tpCounts, topologyCount{
|
||||
topologyValue: value,
|
||||
constraintID: i,
|
||||
count: count,
|
||||
})
|
||||
}
|
||||
tpCountsByNode[i] = tpCounts
|
||||
tpCountsByNode[n] = tpCounts
|
||||
}
|
||||
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
|
||||
|
||||
for _, tpCounts := range tpCountsByNode {
|
||||
for tp, count := range tpCounts {
|
||||
s.TpPairToMatchNum[tp] += count
|
||||
// tpCounts might not hold all the constraints, so index can't be used here as constraintID.
|
||||
for _, tpCount := range tpCounts {
|
||||
s.TpValueToMatchNum[tpCount.constraintID][tpCount.topologyValue] += tpCount.count
|
||||
}
|
||||
}
|
||||
s.TpKeyToDomainsNum = make(map[string]int, len(constraints))
|
||||
for tp := range s.TpPairToMatchNum {
|
||||
s.TpKeyToDomainsNum[tp.key]++
|
||||
}
|
||||
|
||||
// calculate min match for each topology pair
|
||||
// calculate min match for each constraint and topology value
|
||||
for i := 0; i < len(constraints); i++ {
|
||||
key := constraints[i].TopologyKey
|
||||
s.TpKeyToCriticalPaths[key] = newCriticalPaths()
|
||||
}
|
||||
for pair, num := range s.TpPairToMatchNum {
|
||||
s.TpKeyToCriticalPaths[pair.key].update(pair.value, num)
|
||||
s.CriticalPaths[i] = newCriticalPaths()
|
||||
|
||||
for value, num := range s.TpValueToMatchNum[i] {
|
||||
s.CriticalPaths[i].update(value, num)
|
||||
}
|
||||
}
|
||||
|
||||
return &s, nil
|
||||
@ -325,19 +327,19 @@ func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.C
|
||||
|
||||
logger := klog.FromContext(ctx)
|
||||
podLabelSet := labels.Set(pod.Labels)
|
||||
for _, c := range s.Constraints {
|
||||
for i, c := range s.Constraints {
|
||||
tpKey := c.TopologyKey
|
||||
tpVal, ok := node.Labels[c.TopologyKey]
|
||||
tpVal, ok := node.Labels[tpKey]
|
||||
if !ok {
|
||||
logger.V(5).Info("Node doesn't have required label", "node", klog.KObj(node), "label", tpKey)
|
||||
logger.V(5).Info("Node doesn't have required topology label for spread constraint", "node", klog.KObj(node), "topologyKey", tpKey)
|
||||
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonNodeLabelNotMatch)
|
||||
}
|
||||
|
||||
// judging criteria:
|
||||
// 'existing matching num' + 'if self-match (1 or 0)' - 'global minimum' <= 'maxSkew'
|
||||
minMatchNum, err := s.minMatchNum(tpKey, c.MinDomains)
|
||||
minMatchNum, err := s.minMatchNum(i, c.MinDomains)
|
||||
if err != nil {
|
||||
logger.Error(err, "Internal error occurred while retrieving value precalculated in PreFilter", "topologyKey", tpKey, "paths", s.TpKeyToCriticalPaths)
|
||||
logger.Error(err, "Internal error occurred while retrieving value precalculated in PreFilter", "topologyKey", tpKey, "paths", s.CriticalPaths[i])
|
||||
continue
|
||||
}
|
||||
|
||||
@ -346,11 +348,7 @@ func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.C
|
||||
selfMatchNum = 1
|
||||
}
|
||||
|
||||
pair := topologyPair{key: tpKey, value: tpVal}
|
||||
matchNum := 0
|
||||
if tpCount, ok := s.TpPairToMatchNum[pair]; ok {
|
||||
matchNum = tpCount
|
||||
}
|
||||
matchNum := s.TpValueToMatchNum[i][tpVal]
|
||||
skew := matchNum + selfMatchNum - minMatchNum
|
||||
if skew > int(c.MaxSkew) {
|
||||
logger.V(5).Info("Node failed spreadConstraint: matchNum + selfMatchNum - minMatchNum > maxSkew", "node", klog.KObj(node), "topologyKey", tpKey, "matchNum", matchNum, "selfMatchNum", selfMatchNum, "minMatchNum", minMatchNum, "maxSkew", c.MaxSkew)
|
||||
@ -361,11 +359,9 @@ func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.C
|
||||
return nil
|
||||
}
|
||||
|
||||
func sizeHeuristic(nodes int, constraints []topologySpreadConstraint) int {
|
||||
for _, c := range constraints {
|
||||
if c.TopologyKey == v1.LabelHostname {
|
||||
return nodes
|
||||
}
|
||||
func sizeHeuristic(nodes int, constraint topologySpreadConstraint) int {
|
||||
if constraint.TopologyKey == v1.LabelHostname {
|
||||
return nodes
|
||||
}
|
||||
return 0
|
||||
}
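sizeHeuristic now sizes each per-constraint map on its own: only a hostname-keyed constraint pre-allocates one bucket per node, any other topology key starts empty and grows on demand. A quick illustration under those assumptions:

package main

import "fmt"

const labelHostname = "kubernetes.io/hostname" // v1.LabelHostname

// sizeHeuristic mirrors the per-constraint version above.
func sizeHeuristic(nodes int, topologyKey string) int {
	if topologyKey == labelHostname {
		return nodes
	}
	return 0
}

func main() {
	fmt.Println(sizeHeuristic(500, labelHostname))                 // 500 buckets up front
	fmt.Println(sizeHeuristic(500, "topology.kubernetes.io/zone")) // 0, grows on demand
}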
@ -146,8 +146,8 @@ func (pl *PodTopologySpread) EventsToRegister(_ context.Context) ([]framework.Cl
|
||||
//
|
||||
// The Pod rejected by this plugin can be schedulable when the Pod has a spread constraint with NodeTaintsPolicy:Honor
|
||||
// and has got a new toleration.
|
||||
// So, we add UpdatePodTolerations here only when QHint is enabled.
|
||||
podActionType = framework.Add | framework.UpdatePodLabel | framework.UpdatePodTolerations | framework.Delete
|
||||
// So, we add UpdatePodToleration here only when QHint is enabled.
|
||||
podActionType = framework.Add | framework.UpdatePodLabel | framework.UpdatePodToleration | framework.Delete
|
||||
}
|
||||
|
||||
return []framework.ClusterEventWithHint{
|
||||
|
@ -37,8 +37,9 @@ type preScoreState struct {
|
||||
Constraints []topologySpreadConstraint
|
||||
// IgnoredNodes is a set of node names which miss some Constraints[*].topologyKey.
|
||||
IgnoredNodes sets.Set[string]
|
||||
// TopologyPairToPodCounts is keyed with topologyPair, and valued with the number of matching pods.
|
||||
TopologyPairToPodCounts map[topologyPair]*int64
|
||||
// TopologyValueToPodCounts is a slice indexed by constraint index.
|
||||
// Each entry is keyed with topology value, and valued with the number of matching pods.
|
||||
TopologyValueToPodCounts []map[string]*int64
|
||||
// TopologyNormalizingWeight is the weight we give to the counts per topology.
|
||||
// This allows the pod counts of smaller topologies to not be watered down by
|
||||
// bigger ones.
|
||||
@ -76,6 +77,10 @@ func (pl *PodTopologySpread) initPreScoreState(s *preScoreState, pod *v1.Pod, fi
|
||||
if len(s.Constraints) == 0 {
|
||||
return nil
|
||||
}
|
||||
s.TopologyValueToPodCounts = make([]map[string]*int64, len(s.Constraints))
|
||||
for i := 0; i < len(s.Constraints); i++ {
|
||||
s.TopologyValueToPodCounts[i] = make(map[string]*int64)
|
||||
}
|
||||
topoSize := make([]int, len(s.Constraints))
|
||||
for _, node := range filteredNodes {
|
||||
if requireAllTopologies && !nodeLabelsMatchSpreadConstraints(node.Node().Labels, s.Constraints) {
|
||||
@ -89,9 +94,9 @@ func (pl *PodTopologySpread) initPreScoreState(s *preScoreState, pod *v1.Pod, fi
|
||||
if constraint.TopologyKey == v1.LabelHostname {
|
||||
continue
|
||||
}
|
||||
pair := topologyPair{key: constraint.TopologyKey, value: node.Node().Labels[constraint.TopologyKey]}
|
||||
if s.TopologyPairToPodCounts[pair] == nil {
|
||||
s.TopologyPairToPodCounts[pair] = new(int64)
|
||||
value := node.Node().Labels[constraint.TopologyKey]
|
||||
if s.TopologyValueToPodCounts[i][value] == nil {
|
||||
s.TopologyValueToPodCounts[i][value] = new(int64)
|
||||
topoSize[i]++
|
||||
}
|
||||
}
|
||||
@ -126,8 +131,7 @@ func (pl *PodTopologySpread) PreScore(
|
||||
}
|
||||
|
||||
state := &preScoreState{
|
||||
IgnoredNodes: sets.New[string](),
|
||||
TopologyPairToPodCounts: make(map[topologyPair]*int64),
|
||||
IgnoredNodes: sets.New[string](),
|
||||
}
|
||||
// Only require that nodes have all the topology labels if using
|
||||
// non-system-default spreading rules. This allows nodes that don't have a
|
||||
@ -145,8 +149,8 @@ func (pl *PodTopologySpread) PreScore(
|
||||
|
||||
// Ignore parsing errors for backwards compatibility.
|
||||
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
|
||||
processAllNode := func(i int) {
|
||||
nodeInfo := allNodes[i]
|
||||
processAllNode := func(n int) {
|
||||
nodeInfo := allNodes[n]
|
||||
node := nodeInfo.Node()
|
||||
|
||||
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
|
||||
@ -161,17 +165,17 @@ func (pl *PodTopologySpread) PreScore(
|
||||
return
|
||||
}
|
||||
|
||||
for _, c := range state.Constraints {
|
||||
for i, c := range state.Constraints {
|
||||
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
|
||||
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
|
||||
continue
|
||||
}
|
||||
|
||||
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
|
||||
value := node.Labels[c.TopologyKey]
|
||||
// If current topology pair is not associated with any candidate node,
|
||||
// continue to avoid unnecessary calculation.
|
||||
// Per-node counts are also skipped, as they are done during Score.
|
||||
tpCount := state.TopologyPairToPodCounts[pair]
|
||||
tpCount := state.TopologyValueToPodCounts[i][value]
|
||||
if tpCount == nil {
|
||||
continue
|
||||
}
|
||||
@ -188,12 +192,7 @@ func (pl *PodTopologySpread) PreScore(
|
||||
// Score invoked at the Score extension point.
|
||||
// The "score" returned in this function is the matching number of pods on the `nodeName`,
|
||||
// it is normalized later.
|
||||
func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
|
||||
func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
node := nodeInfo.Node()
|
||||
s, err := getPreScoreState(cycleState)
|
||||
if err != nil {
|
||||
@ -214,8 +213,7 @@ func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.Cy
|
||||
if c.TopologyKey == v1.LabelHostname {
|
||||
cnt = int64(countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace))
|
||||
} else {
|
||||
pair := topologyPair{key: c.TopologyKey, value: tpVal}
|
||||
cnt = *s.TopologyPairToPodCounts[pair]
|
||||
cnt = *s.TopologyValueToPodCounts[i][tpVal]
|
||||
}
|
||||
score += scoreForCount(cnt, c.MaxSkew, s.TopologyNormalizingWeight[i])
|
||||
}
|
||||
|
7
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/registry.go
generated
vendored
@ -46,9 +46,12 @@ import (
|
||||
// through the WithFrameworkOutOfTreeRegistry option.
|
||||
func NewInTreeRegistry() runtime.Registry {
|
||||
fts := plfeature.Features{
|
||||
EnableDRAPrioritizedList: feature.DefaultFeatureGate.Enabled(features.DRAPrioritizedList),
|
||||
EnableDRAAdminAccess: feature.DefaultFeatureGate.Enabled(features.DRAAdminAccess),
|
||||
EnableDRADeviceTaints: feature.DefaultFeatureGate.Enabled(features.DRADeviceTaints),
|
||||
EnableDynamicResourceAllocation: feature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation),
|
||||
EnableVolumeCapacityPriority: feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
|
||||
EnableVolumeAttributesClass: feature.DefaultFeatureGate.Enabled(features.VolumeAttributesClass),
|
||||
EnableCSIMigrationPortworx: feature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx),
|
||||
EnableNodeInclusionPolicyInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.NodeInclusionPolicyInPodTopologySpread),
|
||||
EnableMatchLabelKeysInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.MatchLabelKeysInPodTopologySpread),
|
||||
EnableInPlacePodVerticalScaling: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
|
||||
@ -56,6 +59,8 @@ func NewInTreeRegistry() runtime.Registry {
|
||||
EnableSchedulingQueueHint: feature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
|
||||
EnableAsyncPreemption: feature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption),
|
||||
EnablePodLevelResources: feature.DefaultFeatureGate.Enabled(features.PodLevelResources),
|
||||
EnablePartitionableDevices: feature.DefaultFeatureGate.Enabled(features.DRAPartitionableDevices),
|
||||
EnableStorageCapacityScoring: feature.DefaultFeatureGate.Enabled(features.StorageCapacityScoring),
|
||||
}
|
||||
|
||||
registry := runtime.Registry{
|
||||
|
@ -67,7 +67,7 @@ func (pl *TaintToleration) EventsToRegister(_ context.Context) ([]framework.Clus
|
||||
// the scheduling queue uses Pod/Update Queueing Hint
|
||||
// to determine whether a Pod's update makes the Pod schedulable or not.
|
||||
// https://github.com/kubernetes/kubernetes/pull/122234
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
|
||||
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodToleration}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
|
||||
}, nil
|
||||
}
|
||||
|
||||
@ -143,9 +143,6 @@ func getAllTolerationPreferNoSchedule(tolerations []v1.Toleration) (tolerationLi
|
||||
|
||||
// PreScore builds and writes cycle state used by Score and NormalizeScore.
|
||||
func (pl *TaintToleration) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
|
||||
if len(nodes) == 0 {
|
||||
return nil
|
||||
}
|
||||
tolerationsPreferNoSchedule := getAllTolerationPreferNoSchedule(pod.Spec.Tolerations)
|
||||
state := &preScoreState{
|
||||
tolerationsPreferNoSchedule: tolerationsPreferNoSchedule,
|
||||
@ -183,11 +180,7 @@ func countIntolerableTaintsPreferNoSchedule(taints []v1.Taint, tolerations []v1.
|
||||
}
|
||||
|
||||
// Score invoked at the Score extension point.
|
||||
func (pl *TaintToleration) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
|
||||
}
|
||||
func (pl *TaintToleration) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
node := nodeInfo.Node()
|
||||
|
||||
s, err := getPreScoreState(state)
|
||||
|
104
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/binder.go
generated
vendored
@ -33,7 +33,6 @@ import (
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/apiserver/pkg/storage"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
coreinformers "k8s.io/client-go/informers/core/v1"
|
||||
storageinformers "k8s.io/client-go/informers/storage/v1"
|
||||
clientset "k8s.io/client-go/kubernetes"
|
||||
@ -45,7 +44,7 @@ import (
|
||||
csiplugins "k8s.io/csi-translation-lib/plugins"
|
||||
"k8s.io/klog/v2"
|
||||
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
|
||||
)
|
||||
@ -65,7 +64,7 @@ const (
|
||||
// ErrReasonBindConflict is used for VolumeBindingNoMatch predicate error.
|
||||
ErrReasonBindConflict ConflictReason = "node(s) didn't find available persistent volumes to bind"
|
||||
// ErrReasonNodeConflict is used for VolumeNodeAffinityConflict predicate error.
|
||||
ErrReasonNodeConflict ConflictReason = "node(s) had volume node affinity conflict"
|
||||
ErrReasonNodeConflict ConflictReason = "node(s) didn't match PersistentVolume's node affinity"
|
||||
// ErrReasonNotEnoughSpace is used when a pod cannot start on a node because not enough storage space is available.
|
||||
ErrReasonNotEnoughSpace = "node(s) did not have enough free storage"
|
||||
// ErrReasonPVNotExist is used when a pod has one or more PVC(s) bound to non-existent persistent volume(s)"
|
||||
@ -103,13 +102,19 @@ func (b *BindingInfo) StorageResource() *StorageResource {
|
||||
}
|
||||
}
|
||||
|
||||
// DynamicProvision represents a dynamically provisioned volume.
|
||||
type DynamicProvision struct {
|
||||
PVC *v1.PersistentVolumeClaim
|
||||
NodeCapacity *storagev1.CSIStorageCapacity
|
||||
}
|
||||
|
||||
// PodVolumes holds pod's volumes information used in volume scheduling.
|
||||
type PodVolumes struct {
|
||||
// StaticBindings are binding decisions for PVCs which can be bound to
|
||||
// pre-provisioned static PVs.
|
||||
StaticBindings []*BindingInfo
|
||||
// DynamicProvisions are PVCs that require dynamic provisioning
|
||||
DynamicProvisions []*v1.PersistentVolumeClaim
|
||||
DynamicProvisions []*DynamicProvision
|
||||
}
|
||||
|
||||
// InTreeToCSITranslator contains methods required to check migratable status
|
||||
@ -203,7 +208,9 @@ type PodVolumeClaims struct {
|
||||
}
|
||||
|
||||
type volumeBinder struct {
|
||||
kubeClient clientset.Interface
|
||||
kubeClient clientset.Interface
|
||||
enableVolumeAttributesClass bool
|
||||
enableCSIMigrationPortworx bool
|
||||
|
||||
classLister storagelisters.StorageClassLister
|
||||
podLister corelisters.PodLister
|
||||
@ -238,6 +245,7 @@ type CapacityCheck struct {
|
||||
func NewVolumeBinder(
|
||||
logger klog.Logger,
|
||||
kubeClient clientset.Interface,
|
||||
fts feature.Features,
|
||||
podInformer coreinformers.PodInformer,
|
||||
nodeInformer coreinformers.NodeInformer,
|
||||
csiNodeInformer storageinformers.CSINodeInformer,
|
||||
@ -247,15 +255,17 @@ func NewVolumeBinder(
|
||||
capacityCheck CapacityCheck,
|
||||
bindTimeout time.Duration) SchedulerVolumeBinder {
|
||||
b := &volumeBinder{
|
||||
kubeClient: kubeClient,
|
||||
podLister: podInformer.Lister(),
|
||||
classLister: storageClassInformer.Lister(),
|
||||
nodeLister: nodeInformer.Lister(),
|
||||
csiNodeLister: csiNodeInformer.Lister(),
|
||||
pvcCache: NewPVCAssumeCache(logger, pvcInformer.Informer()),
|
||||
pvCache: NewPVAssumeCache(logger, pvInformer.Informer()),
|
||||
bindTimeout: bindTimeout,
|
||||
translator: csitrans.New(),
|
||||
kubeClient: kubeClient,
|
||||
enableVolumeAttributesClass: fts.EnableVolumeAttributesClass,
|
||||
enableCSIMigrationPortworx: fts.EnableCSIMigrationPortworx,
|
||||
podLister: podInformer.Lister(),
|
||||
classLister: storageClassInformer.Lister(),
|
||||
nodeLister: nodeInformer.Lister(),
|
||||
csiNodeLister: csiNodeInformer.Lister(),
|
||||
pvcCache: NewPVCAssumeCache(logger, pvcInformer.Informer()),
|
||||
pvCache: NewPVAssumeCache(logger, pvInformer.Informer()),
|
||||
bindTimeout: bindTimeout,
|
||||
translator: csitrans.New(),
|
||||
}
|
||||
|
||||
b.csiDriverLister = capacityCheck.CSIDriverInformer.Lister()
|
||||
@ -306,7 +316,7 @@ func (b *volumeBinder) FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolume
|
||||
|
||||
var (
|
||||
staticBindings []*BindingInfo
|
||||
dynamicProvisions []*v1.PersistentVolumeClaim
|
||||
dynamicProvisions []*DynamicProvision
|
||||
)
|
||||
defer func() {
|
||||
// Although we do not distinguish nil from empty in this function, for
|
||||
@ -373,6 +383,16 @@ func (b *volumeBinder) FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolume
|
||||
return
|
||||
}
|
||||
|
||||
// ConvertDynamicProvisionsToPVCs converts a slice of *DynamicProvision to a
|
||||
// slice of PersistentVolumeClaim
|
||||
func convertDynamicProvisionsToPVCs(dynamicProvisions []*DynamicProvision) []*v1.PersistentVolumeClaim {
|
||||
pvcs := make([]*v1.PersistentVolumeClaim, 0, len(dynamicProvisions))
|
||||
for _, dynamicProvision := range dynamicProvisions {
|
||||
pvcs = append(pvcs, dynamicProvision.PVC)
|
||||
}
|
||||
return pvcs
|
||||
}
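DynamicProvision pairs each claim with the CSIStorageCapacity that satisfied it, so callers that only need the PVCs (cache revert, API binding) strip the wrapper via convertDynamicProvisionsToPVCs first. A sketch of that conversion with simplified stand-in types (the vendored code uses *v1.PersistentVolumeClaim and *storagev1.CSIStorageCapacity):

package main

import "fmt"

// Simplified stand-ins for the vendored types.
type pvc struct{ name string }
type capacity struct{ bytes int64 }

type dynamicProvision struct {
	PVC          *pvc
	NodeCapacity *capacity // nil when capacity checking is skipped
}

// convert mirrors convertDynamicProvisionsToPVCs: keep the claims, drop the capacity.
func convert(provisions []*dynamicProvision) []*pvc {
	out := make([]*pvc, 0, len(provisions))
	for _, p := range provisions {
		out = append(out, p.PVC)
	}
	return out
}

func main() {
	provisions := []*dynamicProvision{
		{PVC: &pvc{name: "data-0"}, NodeCapacity: &capacity{bytes: 100 << 30}},
		{PVC: &pvc{name: "data-1"}}, // capacity check skipped for this driver
	}
	for _, c := range convert(provisions) {
		fmt.Println(c.name)
	}
}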
// AssumePodVolumes will take the matching PVs and PVCs to provision in pod's
|
||||
// volume information for the chosen node, and:
|
||||
// 1. Update the pvCache with the new prebound PV.
|
||||
@ -419,20 +439,21 @@ func (b *volumeBinder) AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod,
|
||||
}
|
||||
|
||||
// Assume PVCs
|
||||
newProvisionedPVCs := []*v1.PersistentVolumeClaim{}
|
||||
for _, claim := range podVolumes.DynamicProvisions {
|
||||
newProvisionedPVCs := []*DynamicProvision{}
|
||||
for _, dynamicProvision := range podVolumes.DynamicProvisions {
|
||||
// The claims from method args can be pointing to watcher cache. We must not
|
||||
// modify these, therefore create a copy.
|
||||
claimClone := claim.DeepCopy()
|
||||
claimClone := dynamicProvision.PVC.DeepCopy()
|
||||
metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, volume.AnnSelectedNode, nodeName)
|
||||
err = b.pvcCache.Assume(claimClone)
|
||||
if err != nil {
|
||||
pvcs := convertDynamicProvisionsToPVCs(newProvisionedPVCs)
|
||||
b.revertAssumedPVs(newBindings)
|
||||
b.revertAssumedPVCs(newProvisionedPVCs)
|
||||
b.revertAssumedPVCs(pvcs)
|
||||
return
|
||||
}
|
||||
|
||||
newProvisionedPVCs = append(newProvisionedPVCs, claimClone)
|
||||
newProvisionedPVCs = append(newProvisionedPVCs, &DynamicProvision{PVC: claimClone})
|
||||
}
|
||||
|
||||
podVolumes.StaticBindings = newBindings
|
||||
@ -442,8 +463,9 @@ func (b *volumeBinder) AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod,
|
||||
|
||||
// RevertAssumedPodVolumes will revert assumed PV and PVC cache.
|
||||
func (b *volumeBinder) RevertAssumedPodVolumes(podVolumes *PodVolumes) {
|
||||
pvcs := convertDynamicProvisionsToPVCs(podVolumes.DynamicProvisions)
|
||||
b.revertAssumedPVs(podVolumes.StaticBindings)
|
||||
b.revertAssumedPVCs(podVolumes.DynamicProvisions)
|
||||
b.revertAssumedPVCs(pvcs)
|
||||
}
|
||||
|
||||
// BindPodVolumes gets the cached bindings and PVCs to provision in pod's volumes information,
|
||||
@ -460,7 +482,7 @@ func (b *volumeBinder) BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, p
|
||||
}()
|
||||
|
||||
bindings := podVolumes.StaticBindings
|
||||
claimsToProvision := podVolumes.DynamicProvisions
|
||||
claimsToProvision := convertDynamicProvisionsToPVCs(podVolumes.DynamicProvisions)
|
||||
|
||||
// Start API operations
|
||||
err = b.bindAPIUpdate(ctx, assumedPod, bindings, claimsToProvision)
|
||||
@ -855,7 +877,7 @@ func (b *volumeBinder) findMatchingVolumes(logger klog.Logger, pod *v1.Pod, clai
|
||||
pvs := unboundVolumesDelayBinding[storageClassName]
|
||||
|
||||
// Find a matching PV
|
||||
pv, err := volume.FindMatchingVolume(pvc, pvs, node, chosenPVs, true, utilfeature.DefaultFeatureGate.Enabled(features.VolumeAttributesClass))
|
||||
pv, err := volume.FindMatchingVolume(pvc, pvs, node, chosenPVs, true, b.enableVolumeAttributesClass)
|
||||
if err != nil {
|
||||
return false, nil, nil, err
|
||||
}
|
||||
@ -882,8 +904,8 @@ func (b *volumeBinder) findMatchingVolumes(logger klog.Logger, pod *v1.Pod, clai
|
||||
// checkVolumeProvisions checks given unbound claims (the claims have gone through func
|
||||
// findMatchingVolumes, and do not have matching volumes for binding), and return true
|
||||
// if all of the claims are eligible for dynamic provision.
|
||||
func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied, sufficientStorage bool, dynamicProvisions []*v1.PersistentVolumeClaim, err error) {
|
||||
dynamicProvisions = []*v1.PersistentVolumeClaim{}
|
||||
func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied, sufficientStorage bool, dynamicProvisions []*DynamicProvision, err error) {
|
||||
dynamicProvisions = []*DynamicProvision{}
|
||||
|
||||
// We return early with provisionedClaims == nil if a check
|
||||
// fails or we encounter an error.
|
||||
@ -911,7 +933,7 @@ func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, cl
|
||||
}
|
||||
|
||||
// Check storage capacity.
|
||||
sufficient, err := b.hasEnoughCapacity(logger, provisioner, claim, class, node)
|
||||
sufficient, capacity, err := b.hasEnoughCapacity(logger, provisioner, claim, class, node)
|
||||
if err != nil {
|
||||
return false, false, nil, err
|
||||
}
|
||||
@ -920,8 +942,10 @@ func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, cl
|
||||
return true, false, nil, nil
|
||||
}
|
||||
|
||||
dynamicProvisions = append(dynamicProvisions, claim)
|
||||
|
||||
dynamicProvisions = append(dynamicProvisions, &DynamicProvision{
|
||||
PVC: claim,
|
||||
NodeCapacity: capacity,
|
||||
})
|
||||
}
|
||||
logger.V(4).Info("Provisioning for claims of pod that has no matching volumes...", "claimCount", len(claimsToProvision), "pod", klog.KObj(pod), "node", klog.KObj(node))
|
||||
|
||||
@ -941,12 +965,12 @@ func (b *volumeBinder) revertAssumedPVCs(claims []*v1.PersistentVolumeClaim) {
|
||||
}
|
||||
|
||||
// hasEnoughCapacity checks whether the provisioner has enough capacity left for a new volume of the given size
|
||||
// that is available from the node.
|
||||
func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string, claim *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass, node *v1.Node) (bool, error) {
|
||||
// that is available from the node. This function returns the node capacity based on the PVC's storage class.
|
||||
func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string, claim *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass, node *v1.Node) (bool, *storagev1.CSIStorageCapacity, error) {
|
||||
quantity, ok := claim.Spec.Resources.Requests[v1.ResourceStorage]
|
||||
if !ok {
|
||||
// No capacity to check for.
|
||||
return true, nil
|
||||
return true, nil, nil
|
||||
}
|
||||
|
||||
// Only enabled for CSI drivers which opt into it.
|
||||
@ -956,19 +980,19 @@ func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string,
|
||||
// Either the provisioner is not a CSI driver or the driver does not
|
||||
// opt into storage capacity scheduling. Either way, skip
|
||||
// capacity checking.
|
||||
return true, nil
|
||||
return true, nil, nil
|
||||
}
|
||||
return false, err
|
||||
return false, nil, err
|
||||
}
|
||||
if driver.Spec.StorageCapacity == nil || !*driver.Spec.StorageCapacity {
|
||||
return true, nil
|
||||
return true, nil, nil
|
||||
}
|
||||
|
||||
// Look for a matching CSIStorageCapacity object(s).
|
||||
// TODO (for beta): benchmark this and potentially introduce some kind of lookup structure (https://github.com/kubernetes/enhancements/issues/1698#issuecomment-654356718).
|
||||
capacities, err := b.csiStorageCapacityLister.List(labels.Everything())
|
||||
if err != nil {
|
||||
return false, err
|
||||
return false, nil, err
|
||||
}
|
||||
|
||||
sizeInBytes := quantity.Value()
|
||||
@ -977,7 +1001,7 @@ func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string,
|
||||
capacitySufficient(capacity, sizeInBytes) &&
|
||||
b.nodeHasAccess(logger, node, capacity) {
|
||||
// Enough capacity found.
|
||||
return true, nil
|
||||
return true, capacity, nil
|
||||
}
|
||||
}
|
||||
|
||||
@ -985,7 +1009,7 @@ func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string,
|
||||
// they had to be rejected. Log that above? But that might be a lot of log output...
|
||||
logger.V(4).Info("Node has no accessible CSIStorageCapacity with enough capacity for PVC",
|
||||
"node", klog.KObj(node), "PVC", klog.KObj(claim), "size", sizeInBytes, "storageClass", klog.KObj(storageClass))
|
||||
return false, nil
|
||||
return false, nil, nil
|
||||
}
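hasEnoughCapacity now also returns the CSIStorageCapacity object that satisfied the request, so the scorer can reuse the matched capacity later; per candidate object the test is still "accessible from the node and large enough". A minimal sketch of that selection loop with simplified types (illustrative, not the vendored listers):

package main

import "fmt"

type storageCapacity struct {
	storageClass string
	bytes        int64
	accessible   bool // stand-in for nodeHasAccess
}

// pickCapacity returns (true, matching object) when some accessible capacity
// in the requested storage class can hold sizeInBytes, mirroring hasEnoughCapacity.
func pickCapacity(capacities []storageCapacity, class string, sizeInBytes int64) (bool, *storageCapacity) {
	for i := range capacities {
		c := &capacities[i]
		if c.storageClass == class && c.accessible && c.bytes >= sizeInBytes {
			return true, c
		}
	}
	return false, nil
}

func main() {
	capacities := []storageCapacity{
		{storageClass: "fast", bytes: 50 << 30, accessible: true},
		{storageClass: "fast", bytes: 200 << 30, accessible: true},
	}
	ok, c := pickCapacity(capacities, "fast", 100<<30)
	fmt.Println(ok, c.bytes>>30) // true 200
}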
func capacitySufficient(capacity *storagev1.CSIStorageCapacity, sizeInBytes int64) bool {
|
||||
@ -1033,7 +1057,7 @@ func (a byPVCSize) Less(i, j int) bool {
|
||||
}
|
||||
|
||||
// isCSIMigrationOnForPlugin checks if CSI migration is enabled for a given plugin.
|
||||
func isCSIMigrationOnForPlugin(pluginName string) bool {
|
||||
func isCSIMigrationOnForPlugin(pluginName string, enableCSIMigrationPortworx bool) bool {
|
||||
switch pluginName {
|
||||
case csiplugins.AWSEBSInTreePluginName:
|
||||
return true
|
||||
@ -1044,7 +1068,7 @@ func isCSIMigrationOnForPlugin(pluginName string) bool {
|
||||
case csiplugins.CinderInTreePluginName:
|
||||
return true
|
||||
case csiplugins.PortworxVolumePluginName:
|
||||
return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx)
|
||||
return enableCSIMigrationPortworx
|
||||
}
|
||||
return false
|
||||
}
|
||||
@ -1083,7 +1107,7 @@ func (b *volumeBinder) tryTranslatePVToCSI(logger klog.Logger, pv *v1.Persistent
|
||||
return nil, fmt.Errorf("could not get plugin name from pv: %v", err)
|
||||
}
|
||||
|
||||
if !isCSIMigrationOnForPlugin(pluginName) {
|
||||
if !isCSIMigrationOnForPlugin(pluginName, b.enableCSIMigrationPortworx) {
|
||||
return pv, nil
|
||||
}
|
||||
|
||||
|
@ -29,6 +29,7 @@ import (
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
storagelisters "k8s.io/client-go/listers/storage/v1"
|
||||
"k8s.io/component-helpers/storage/ephemeral"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/kubernetes/pkg/scheduler/apis/config"
|
||||
@ -70,10 +71,11 @@ func (d *stateData) Clone() framework.StateData {
|
||||
// In the Filter phase, pod binding cache is created for the pod and used in
|
||||
// Reserve and PreBind phases.
|
||||
type VolumeBinding struct {
|
||||
Binder SchedulerVolumeBinder
|
||||
PVCLister corelisters.PersistentVolumeClaimLister
|
||||
scorer volumeCapacityScorer
|
||||
fts feature.Features
|
||||
Binder SchedulerVolumeBinder
|
||||
PVCLister corelisters.PersistentVolumeClaimLister
|
||||
classLister storagelisters.StorageClassLister
|
||||
scorer volumeCapacityScorer
|
||||
fts feature.Features
|
||||
}
|
||||
|
||||
var _ framework.PreFilterPlugin = &VolumeBinding{}
|
||||
@ -451,14 +453,14 @@ func (pl *VolumeBinding) PreScore(ctx context.Context, cs *framework.CycleState,
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
if state.hasStaticBindings {
|
||||
if state.hasStaticBindings || pl.fts.EnableStorageCapacityScoring {
|
||||
return nil
|
||||
}
|
||||
return framework.NewStatus(framework.Skip)
|
||||
}
|
||||
|
||||
// Score invoked at the score extension point.
|
||||
func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
if pl.scorer == nil {
|
||||
return 0, nil
|
||||
}
|
||||
@ -466,24 +468,49 @@ func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, po
|
||||
if err != nil {
|
||||
return 0, framework.AsStatus(err)
|
||||
}
|
||||
nodeName := nodeInfo.Node().Name
|
||||
podVolumes, ok := state.podVolumesByNode[nodeName]
|
||||
if !ok {
|
||||
return 0, nil
|
||||
}
|
||||
// group by storage class
|
||||
|
||||
classResources := make(classResourceMap)
|
||||
for _, staticBinding := range podVolumes.StaticBindings {
|
||||
class := staticBinding.StorageClassName()
|
||||
storageResource := staticBinding.StorageResource()
|
||||
if _, ok := classResources[class]; !ok {
|
||||
classResources[class] = &StorageResource{
|
||||
Requested: 0,
|
||||
Capacity: 0,
|
||||
if len(podVolumes.StaticBindings) != 0 || !pl.fts.EnableStorageCapacityScoring {
|
||||
// group static binding volumes by storage class
|
||||
for _, staticBinding := range podVolumes.StaticBindings {
|
||||
class := staticBinding.StorageClassName()
|
||||
storageResource := staticBinding.StorageResource()
|
||||
if _, ok := classResources[class]; !ok {
|
||||
classResources[class] = &StorageResource{
|
||||
Requested: 0,
|
||||
Capacity: 0,
|
||||
}
|
||||
}
|
||||
classResources[class].Requested += storageResource.Requested
|
||||
classResources[class].Capacity += storageResource.Capacity
|
||||
}
|
||||
} else {
|
||||
// group dynamic binding volumes by storage class
|
||||
for _, provision := range podVolumes.DynamicProvisions {
|
||||
if provision.NodeCapacity == nil {
|
||||
continue
|
||||
}
|
||||
class := *provision.PVC.Spec.StorageClassName
|
||||
if _, ok := classResources[class]; !ok {
|
||||
classResources[class] = &StorageResource{
|
||||
Requested: 0,
|
||||
Capacity: 0,
|
||||
}
|
||||
}
|
||||
// The following line cannot be +=. For example, if a Pod requests two 50GB volumes from
|
||||
// a StorageClass with 100GB of capacity on a node, this part of the code will be executed twice.
|
||||
// In that case, using += would incorrectly set classResources[class].Capacity to 200GB.
|
||||
classResources[class].Capacity = provision.NodeCapacity.Capacity.Value()
|
||||
requestedQty := provision.PVC.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
|
||||
classResources[class].Requested += requestedQty.Value()
|
||||
}
|
||||
classResources[class].Requested += storageResource.Requested
|
||||
classResources[class].Capacity += storageResource.Capacity
|
||||
}
|
||||
|
||||
return pl.scorer(classResources), nil
|
||||
}
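As the comment above notes, the dynamic-provision branch sums the requests per StorageClass but assigns the node capacity instead of accumulating it, because every claim from the same class reports the same CSIStorageCapacity. A short worked sketch of that accounting with illustrative sizes (two 50GiB claims against 100GiB of class capacity on one node):

package main

import "fmt"

type storageResource struct {
	Requested int64
	Capacity  int64
}

type provision struct {
	class        string
	requested    int64
	nodeCapacity int64 // capacity reported for this class on the candidate node
}

func main() {
	provisions := []provision{
		{class: "fast", requested: 50 << 30, nodeCapacity: 100 << 30},
		{class: "fast", requested: 50 << 30, nodeCapacity: 100 << 30},
	}
	classResources := map[string]*storageResource{}
	for _, p := range provisions {
		r, ok := classResources[p.class]
		if !ok {
			r = &storageResource{}
			classResources[p.class] = r
		}
		r.Requested += p.requested
		// Assignment, not +=: both claims see the same 100GiB capacity,
		// accumulating would wrongly report 200GiB.
		r.Capacity = p.nodeCapacity
	}
	fmt.Println(classResources["fast"].Requested>>30, classResources["fast"].Capacity>>30) // 100 100
}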
@ -565,7 +592,7 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
|
||||
return nil, fmt.Errorf("want args to be of type VolumeBindingArgs, got %T", plArgs)
|
||||
}
|
||||
if err := validation.ValidateVolumeBindingArgsWithOptions(nil, args, validation.VolumeBindingArgsValidationOptions{
|
||||
AllowVolumeCapacityPriority: fts.EnableVolumeCapacityPriority,
|
||||
AllowStorageCapacityScoring: fts.EnableStorageCapacityScoring,
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -579,11 +606,11 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
|
||||
CSIDriverInformer: fh.SharedInformerFactory().Storage().V1().CSIDrivers(),
|
||||
CSIStorageCapacityInformer: fh.SharedInformerFactory().Storage().V1().CSIStorageCapacities(),
|
||||
}
|
||||
binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second)
|
||||
binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), fts, podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second)
|
||||
|
||||
// build score function
|
||||
var scorer volumeCapacityScorer
|
||||
if fts.EnableVolumeCapacityPriority {
|
||||
if fts.EnableStorageCapacityScoring {
|
||||
shape := make(helper.FunctionShape, 0, len(args.Shape))
|
||||
for _, point := range args.Shape {
|
||||
shape = append(shape, helper.FunctionShapePoint{
|
||||
@ -594,9 +621,10 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
|
||||
scorer = buildScorerFunction(shape)
|
||||
}
|
||||
return &VolumeBinding{
|
||||
Binder: binder,
|
||||
PVCLister: pvcInformer.Lister(),
|
||||
scorer: scorer,
|
||||
fts: fts,
|
||||
Binder: binder,
|
||||
PVCLister: pvcInformer.Lister(),
|
||||
classLister: storageClassInformer.Lister(),
|
||||
scorer: scorer,
|
||||
fts: fts,
|
||||
}, nil
|
||||
}
|
||||
|
13
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/preemption/preemption.go
generated
vendored
@ -40,7 +40,6 @@ import (
|
||||
apipod "k8s.io/kubernetes/pkg/api/v1/pod"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
|
||||
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
|
||||
"k8s.io/kubernetes/pkg/scheduler/metrics"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
@ -149,7 +148,7 @@ func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsy
|
||||
pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()
|
||||
|
||||
ev := &Evaluator{
|
||||
PluginName: names.DefaultPreemption,
|
||||
PluginName: pluginName,
|
||||
Handler: fh,
|
||||
PodLister: podLister,
|
||||
PdbLister: pdbLister,
|
||||
@ -172,10 +171,11 @@ func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsy
|
||||
logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(preemptor), "waitingPod", klog.KObj(victim), "node", c.Name())
|
||||
} else {
|
||||
condition := &v1.PodCondition{
|
||||
Type: v1.DisruptionTarget,
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: v1.PodReasonPreemptionByScheduler,
|
||||
Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
|
||||
Type: v1.DisruptionTarget,
|
||||
ObservedGeneration: apipod.GetPodObservedGenerationIfEnabledOnCondition(&victim.Status, victim.Generation, v1.DisruptionTarget),
|
||||
Status: v1.ConditionTrue,
|
||||
Reason: v1.PodReasonPreemptionByScheduler,
|
||||
Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
|
||||
}
|
||||
newStatus := victim.Status.DeepCopy()
|
||||
updated := apipod.UpdatePodCondition(newStatus, condition)
|
||||
@ -261,6 +261,7 @@ func (ev *Evaluator) Preempt(ctx context.Context, state *framework.CycleState, p
|
||||
|
||||
// Return a FitError only when there are no candidates that fit the pod.
|
||||
if len(candidates) == 0 {
|
||||
logger.V(2).Info("No preemption candidate is found; preemption is not helpful for scheduling", "pod", klog.KObj(pod))
|
||||
fitError := &framework.FitError{
|
||||
Pod: pod,
|
||||
NumAllNodes: len(allNodes),
|
||||
|
17
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/runtime/framework.go
generated
vendored
@ -1011,7 +1011,7 @@ func (f *frameworkImpl) RunFilterPluginsWithNominatedPods(ctx context.Context, s
|
||||
nodeInfoToUse := info
|
||||
if i == 0 {
|
||||
var err error
|
||||
podsAdded, stateToUse, nodeInfoToUse, err = addNominatedPods(ctx, f, pod, state, info)
|
||||
podsAdded, stateToUse, nodeInfoToUse, err = addGENominatedPods(ctx, f, pod, state, info)
|
||||
if err != nil {
|
||||
return framework.AsStatus(err)
|
||||
}
|
||||
@ -1028,10 +1028,10 @@ func (f *frameworkImpl) RunFilterPluginsWithNominatedPods(ctx context.Context, s
|
||||
return status
|
||||
}
|
||||
|
||||
// addNominatedPods adds pods with equal or greater priority which are nominated
|
||||
// addGENominatedPods adds pods with equal or greater priority which are nominated
|
||||
// to run on the node. It returns 1) whether any pod was added, 2) augmented cycleState,
|
||||
// 3) augmented nodeInfo.
|
||||
func addNominatedPods(ctx context.Context, fh framework.Handle, pod *v1.Pod, state *framework.CycleState, nodeInfo *framework.NodeInfo) (bool, *framework.CycleState, *framework.NodeInfo, error) {
|
||||
func addGENominatedPods(ctx context.Context, fh framework.Handle, pod *v1.Pod, state *framework.CycleState, nodeInfo *framework.NodeInfo) (bool, *framework.CycleState, *framework.NodeInfo, error) {
|
||||
if fh == nil {
|
||||
// This may happen only in tests.
|
||||
return false, state, nodeInfo, nil
|
||||
@ -1137,7 +1137,8 @@ func (f *frameworkImpl) RunScorePlugins(ctx context.Context, state *framework.Cy
|
||||
}
|
||||
// Run Score method for each node in parallel.
|
||||
f.Parallelizer().Until(ctx, len(nodes), func(index int) {
|
||||
nodeName := nodes[index].Node().Name
|
||||
nodeInfo := nodes[index]
|
||||
nodeName := nodeInfo.Node().Name
|
||||
logger := logger
|
||||
if verboseLogs {
|
||||
logger = klog.LoggerWithValues(logger, "node", klog.ObjectRef{Name: nodeName})
|
||||
@ -1148,7 +1149,7 @@ func (f *frameworkImpl) RunScorePlugins(ctx context.Context, state *framework.Cy
|
||||
logger := klog.LoggerWithName(logger, pl.Name())
|
||||
ctx = klog.NewContext(ctx, logger)
|
||||
}
|
||||
s, status := f.runScorePlugin(ctx, pl, state, pod, nodeName)
|
||||
s, status := f.runScorePlugin(ctx, pl, state, pod, nodeInfo)
|
||||
if !status.IsSuccess() {
|
||||
err := fmt.Errorf("plugin %q failed with: %w", pl.Name(), status.AsError())
|
||||
errCh.SendErrorWithCancel(err, cancel)
|
||||
@ -1217,12 +1218,12 @@ func (f *frameworkImpl) RunScorePlugins(ctx context.Context, state *framework.Cy
|
||||
return allNodePluginScores, nil
|
||||
}
|
||||
|
||||
func (f *frameworkImpl) runScorePlugin(ctx context.Context, pl framework.ScorePlugin, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
|
||||
func (f *frameworkImpl) runScorePlugin(ctx context.Context, pl framework.ScorePlugin, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
|
||||
if !state.ShouldRecordPluginMetrics() {
|
||||
return pl.Score(ctx, state, pod, nodeName)
|
||||
return pl.Score(ctx, state, pod, nodeInfo)
|
||||
}
|
||||
startTime := time.Now()
|
||||
s, status := pl.Score(ctx, state, pod, nodeName)
|
||||
s, status := pl.Score(ctx, state, pod, nodeInfo)
|
||||
f.metricsRecorder.ObservePluginDurationAsync(metrics.Score, pl.Name(), status.Code().String(), metrics.SinceInSeconds(startTime))
|
||||
return s, status
|
||||
}
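The hunks above switch the score path to hand plugins the full *framework.NodeInfo instead of a bare node name. A minimal sketch of a score plugin written against the updated interface as it appears in this vendored tree; the plugin name and scoring logic are illustrative and not part of this commit:

```go
package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// fewestPods is a hypothetical plugin that prefers nodes running fewer pods.
type fewestPods struct{}

var _ framework.ScorePlugin = &fewestPods{}

func (f *fewestPods) Name() string { return "FewestPods" }

// Score now receives the *framework.NodeInfo directly, so the plugin no longer
// needs a snapshot lookup just to get at node-level data such as the pod list.
func (f *fewestPods) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
	score := framework.MaxNodeScore - int64(len(nodeInfo.Pods))
	if score < 0 {
		score = 0
	}
	return score, nil
}

func (f *fewestPods) ScoreExtensions() framework.ScoreExtensions { return nil }
```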
4
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/runtime/instrumented_plugins.go
generated
vendored
@ -77,7 +77,7 @@ type instrumentedScorePlugin struct {

var _ framework.ScorePlugin = &instrumentedScorePlugin{}

func (p *instrumentedScorePlugin) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
func (p *instrumentedScorePlugin) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
p.metric.Inc()
return p.ScorePlugin.Score(ctx, state, pod, nodeName)
return p.ScorePlugin.Score(ctx, state, pod, nodeInfo)
}
106
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/framework/types.go
generated
vendored
@ -72,9 +72,9 @@ const (
UpdatePodLabel
// UpdatePodScaleDown is an update for pod's scale down (i.e., any resource request is reduced).
UpdatePodScaleDown
// UpdatePodTolerations is an addition for pod's tolerations.
// UpdatePodToleration is an addition for pod's tolerations.
// (Due to API validation, we can add, but cannot modify or remove tolerations.)
UpdatePodTolerations
UpdatePodToleration
// UpdatePodSchedulingGatesEliminated is an update for pod's scheduling gates, which eliminates all scheduling gates in the Pod.
UpdatePodSchedulingGatesEliminated
// UpdatePodGeneratedResourceClaim is an update of the list of ResourceClaims generated for the pod.
@ -88,7 +88,7 @@ const (
All ActionType = 1<<iota - 1

// Use the general Update type if you don't either know or care the specific sub-Update type to use.
Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition | UpdateNodeAnnotation | UpdatePodLabel | UpdatePodScaleDown | UpdatePodTolerations | UpdatePodSchedulingGatesEliminated | UpdatePodGeneratedResourceClaim | updatePodOther
Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition | UpdateNodeAnnotation | UpdatePodLabel | UpdatePodScaleDown | UpdatePodToleration | UpdatePodSchedulingGatesEliminated | UpdatePodGeneratedResourceClaim | updatePodOther
// none is a special ActionType that is only used internally.
none ActionType = 0
)
@ -97,7 +97,7 @@ var (
// basicActionTypes is a list of basicActionTypes ActionTypes.
basicActionTypes = []ActionType{Add, Delete, Update}
// podActionTypes is a list of ActionTypes that are only applicable for Pod events.
podActionTypes = []ActionType{UpdatePodLabel, UpdatePodScaleDown, UpdatePodTolerations, UpdatePodSchedulingGatesEliminated, UpdatePodGeneratedResourceClaim}
podActionTypes = []ActionType{UpdatePodLabel, UpdatePodScaleDown, UpdatePodToleration, UpdatePodSchedulingGatesEliminated, UpdatePodGeneratedResourceClaim}
// nodeActionTypes is a list of ActionTypes that are only applicable for Node events.
nodeActionTypes = []ActionType{UpdateNodeAllocatable, UpdateNodeLabel, UpdateNodeTaint, UpdateNodeCondition, UpdateNodeAnnotation}
)
@ -122,8 +122,8 @@ func (a ActionType) String() string {
return "UpdatePodLabel"
case UpdatePodScaleDown:
return "UpdatePodScaleDown"
case UpdatePodTolerations:
return "UpdatePodTolerations"
case UpdatePodToleration:
return "UpdatePodToleration"
case UpdatePodSchedulingGatesEliminated:
return "UpdatePodSchedulingGatesEliminated"
case UpdatePodGeneratedResourceClaim:
@ -366,6 +366,11 @@ type QueuedPodInfo struct {
// Number of schedule attempts before successfully scheduled.
// It's used to record the # attempts metric and calculate the backoff time this Pod is obliged to get before retrying.
Attempts int
// BackoffExpiration is the time when the Pod will complete its backoff.
// If the SchedulerPopFromBackoffQ feature is enabled, the value is aligned to the backoff ordering window.
// Then, two Pods with the same BackoffExpiration (time bucket) are ordered by priority and eventually the timestamp,
// to make sure popping from the backoffQ considers priority of pods that are close to the expiration time.
BackoffExpiration time.Time
// The time when the pod is added to the queue for the first time. The pod may be added
// back to the queue multiple times before it's successfully scheduled.
// It shouldn't be updated once initialized. It's used to record the e2e scheduling
@ -397,6 +402,13 @@ func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo {
}
}

// podResource contains the result of calculateResource and is used only internally.
type podResource struct {
resource Resource
non0CPU int64
non0Mem int64
}

// PodInfo is a wrapper to a Pod with additional pre-computed information to
// accelerate processing. This information is typically immutable (e.g., pre-processed
// inter-pod affinity selectors).
@ -406,6 +418,15 @@ type PodInfo struct {
RequiredAntiAffinityTerms []AffinityTerm
PreferredAffinityTerms []WeightedAffinityTerm
PreferredAntiAffinityTerms []WeightedAffinityTerm
// cachedResource contains precomputed resources for Pod (podResource).
// The value can change only if InPlacePodVerticalScaling is enabled.
// In that case, the whole PodInfo object is recreated (for assigned pods in cache).
// cachedResource contains a podResource, computed when adding a scheduled pod to NodeInfo.
// When removing a pod from a NodeInfo, i.e. finding victims for preemption or removing a pod from a cluster,
// cachedResource is used instead, what provides a noticeable performance boost.
// Note: cachedResource field shouldn't be accessed directly.
// Use calculateResource method to obtain it instead.
cachedResource *podResource
}

// DeepCopy returns a deep copy of the PodInfo object.
@ -416,6 +437,7 @@ func (pi *PodInfo) DeepCopy() *PodInfo {
RequiredAntiAffinityTerms: pi.RequiredAntiAffinityTerms,
PreferredAffinityTerms: pi.PreferredAffinityTerms,
PreferredAntiAffinityTerms: pi.PreferredAntiAffinityTerms,
cachedResource: pi.cachedResource,
}
}

@ -464,6 +486,7 @@ func (pi *PodInfo) Update(pod *v1.Pod) error {
pi.RequiredAntiAffinityTerms = requiredAntiAffinityTerms
pi.PreferredAffinityTerms = weightedAffinityTerms
pi.PreferredAntiAffinityTerms = weightedAntiAffinityTerms
pi.cachedResource = nil
return utilerrors.NewAggregate(parseErrs)
}

@ -963,7 +986,7 @@ func (n *NodeInfo) AddPodInfo(podInfo *PodInfo) {
if podWithRequiredAntiAffinity(podInfo.Pod) {
n.PodsWithRequiredAntiAffinity = append(n.PodsWithRequiredAntiAffinity, podInfo)
}
n.update(podInfo.Pod, 1)
n.update(podInfo, 1)
}

// AddPod is a wrapper around AddPodInfo.
@ -985,8 +1008,8 @@ func podWithRequiredAntiAffinity(p *v1.Pod) bool {
len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0
}

func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, bool) {
var removed bool
func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, *PodInfo) {
var removedPod *PodInfo
for i := range s {
tmpKey, err := GetPodKey(s[i].Pod)
if err != nil {
@ -994,18 +1017,18 @@ func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, bo
continue
}
if k == tmpKey {
removedPod = s[i]
// delete the element
s[i] = s[len(s)-1]
s = s[:len(s)-1]
removed = true
break
}
}
// resets the slices to nil so that we can do DeepEqual in unit tests.
if len(s) == 0 {
return nil, removed
return nil, removedPod
}
return s, removed
return s, removedPod
}
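For reference, the hunk above switches removeFromSlice from returning a bool to returning the removed *PodInfo, so the caller can reuse data attached to the removed entry. A generic sketch of the same swap-remove pattern; names are illustrative and not part of the vendored code:

```go
package example

// removeByKey removes the first element whose key matches and returns both the
// shrunken slice and the removed element (nil if nothing matched).
func removeByKey[T any](s []*T, key string, keyOf func(*T) string) ([]*T, *T) {
	var removed *T
	for i := range s {
		if keyOf(s[i]) != key {
			continue
		}
		removed = s[i]
		// Swap-remove: overwrite with the last element and truncate.
		s[i] = s[len(s)-1]
		s = s[:len(s)-1]
		break
	}
	if len(s) == 0 {
		// Return nil so callers can compare with DeepEqual in tests.
		return nil, removed
	}
	return s, removed
}
```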

// RemovePod subtracts pod information from this NodeInfo.
@ -1021,33 +1044,33 @@ func (n *NodeInfo) RemovePod(logger klog.Logger, pod *v1.Pod) error {
n.PodsWithRequiredAntiAffinity, _ = removeFromSlice(logger, n.PodsWithRequiredAntiAffinity, k)
}

var removed bool
if n.Pods, removed = removeFromSlice(logger, n.Pods, k); removed {
n.update(pod, -1)
var removedPod *PodInfo
if n.Pods, removedPod = removeFromSlice(logger, n.Pods, k); removedPod != nil {
n.update(removedPod, -1)
return nil
}
return fmt.Errorf("no corresponding pod %s in pods of node %s", pod.Name, n.node.Name)
}

// update node info based on the pod and sign.
// update node info based on the pod, and sign.
// The sign will be set to `+1` when AddPod and to `-1` when RemovePod.
func (n *NodeInfo) update(pod *v1.Pod, sign int64) {
res, non0CPU, non0Mem := calculateResource(pod)
n.Requested.MilliCPU += sign * res.MilliCPU
n.Requested.Memory += sign * res.Memory
n.Requested.EphemeralStorage += sign * res.EphemeralStorage
if n.Requested.ScalarResources == nil && len(res.ScalarResources) > 0 {
func (n *NodeInfo) update(podInfo *PodInfo, sign int64) {
podResource := podInfo.calculateResource()
n.Requested.MilliCPU += sign * podResource.resource.MilliCPU
n.Requested.Memory += sign * podResource.resource.Memory
n.Requested.EphemeralStorage += sign * podResource.resource.EphemeralStorage
if n.Requested.ScalarResources == nil && len(podResource.resource.ScalarResources) > 0 {
n.Requested.ScalarResources = map[v1.ResourceName]int64{}
}
for rName, rQuant := range res.ScalarResources {
for rName, rQuant := range podResource.resource.ScalarResources {
n.Requested.ScalarResources[rName] += sign * rQuant
}
n.NonZeroRequested.MilliCPU += sign * non0CPU
n.NonZeroRequested.Memory += sign * non0Mem
n.NonZeroRequested.MilliCPU += sign * podResource.non0CPU
n.NonZeroRequested.Memory += sign * podResource.non0Mem

// Consume ports when pod added or release ports when pod removed.
n.updateUsedPorts(pod, sign > 0)
n.updatePVCRefCounts(pod, sign > 0)
n.updateUsedPorts(podInfo.Pod, sign > 0)
n.updatePVCRefCounts(podInfo.Pod, sign > 0)

n.Generation = nextGeneration()
}
@ -1103,20 +1126,25 @@ func getNonMissingContainerRequests(requests v1.ResourceList, podLevelResourcesS

}

func calculateResource(pod *v1.Pod) (Resource, int64, int64) {
requests := resourcehelper.PodRequests(pod, resourcehelper.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
func (pi *PodInfo) calculateResource() podResource {
if pi.cachedResource != nil {
return *pi.cachedResource
}
inPlacePodVerticalScalingEnabled := utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling)
podLevelResourcesEnabled := utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources)
requests := resourcehelper.PodRequests(pi.Pod, resourcehelper.PodResourcesOptions{
UseStatusResources: inPlacePodVerticalScalingEnabled,
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
SkipPodLevelResources: !podLevelResourcesEnabled,
})
isPodLevelResourcesSet := utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources) && resourcehelper.IsPodLevelRequestsSet(pod)
isPodLevelResourcesSet := podLevelResourcesEnabled && resourcehelper.IsPodLevelRequestsSet(pi.Pod)
nonMissingContainerRequests := getNonMissingContainerRequests(requests, isPodLevelResourcesSet)
non0Requests := requests
if len(nonMissingContainerRequests) > 0 {
non0Requests = resourcehelper.PodRequests(pod, resourcehelper.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
non0Requests = resourcehelper.PodRequests(pi.Pod, resourcehelper.PodResourcesOptions{
UseStatusResources: inPlacePodVerticalScalingEnabled,
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
SkipPodLevelResources: !podLevelResourcesEnabled,
NonMissingContainerRequests: nonMissingContainerRequests,
})
}
@ -1125,7 +1153,13 @@ func calculateResource(pod *v1.Pod) (Resource, int64, int64) {

var res Resource
res.Add(requests)
return res, non0CPU.MilliValue(), non0Mem.Value()
podResource := podResource{
resource: res,
non0CPU: non0CPU.MilliValue(),
non0Mem: non0Mem.Value(),
}
pi.cachedResource = &podResource
return podResource
}

// updateUsedPorts updates the UsedPorts of NodeInfo.
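The net effect of the hunks above is that per-pod resource totals are computed once per PodInfo and memoized in cachedResource, so RemovePod subtracts exactly what AddPodInfo added without re-walking the pod spec. A stripped-down sketch of that memoization; the types and field names below are illustrative, and the real code derives requests via resourcehelper.PodRequests with feature gates rather than this simplified loop:

```go
package example

import (
	v1 "k8s.io/api/core/v1"
)

// cachedRequests is an illustrative stand-in for the vendored podResource struct.
type cachedRequests struct {
	milliCPU int64
	memory   int64
}

// podInfo mirrors the caching idea: compute on first use, reuse afterwards.
type podInfo struct {
	pod    *v1.Pod
	cached *cachedRequests
}

func (pi *podInfo) calculateResource() cachedRequests {
	if pi.cached != nil {
		// Add and remove paths see identical values.
		return *pi.cached
	}
	var r cachedRequests
	for _, c := range pi.pod.Spec.Containers {
		r.milliCPU += c.Resources.Requests.Cpu().MilliValue()
		r.memory += c.Resources.Requests.Memory().Value()
	}
	pi.cached = &r
	return r
}
```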
37
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/metrics/metrics.go
generated
vendored
@ -102,16 +102,16 @@ var (
InFlightEvents *metrics.GaugeVec
Goroutines *metrics.GaugeVec

// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
// in v1.31. Please use PodSchedulingSLIDuration instead.
PodSchedulingDuration *metrics.HistogramVec
PodSchedulingSLIDuration *metrics.HistogramVec
PodSchedulingAttempts *metrics.Histogram
FrameworkExtensionPointDuration *metrics.HistogramVec
PluginExecutionDuration *metrics.HistogramVec

PermitWaitDuration *metrics.HistogramVec
CacheSize *metrics.GaugeVec
PermitWaitDuration *metrics.HistogramVec
CacheSize *metrics.GaugeVec
// Deprecated: SchedulerCacheSize is deprecated,
// and will be removed at v1.34. Please use CacheSize instead.
SchedulerCacheSize *metrics.GaugeVec
unschedulableReasons *metrics.GaugeVec
PluginEvaluationTotal *metrics.CounterVec

@ -220,20 +220,6 @@ func InitMetrics() {
StabilityLevel: metrics.ALPHA,
}, []string{"operation"})

// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
// in v1.31. Please use PodSchedulingSLIDuration instead.
PodSchedulingDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "pod_scheduling_duration_seconds",
Help: "E2e latency for a pod being scheduled which may include multiple scheduling attempts.",
// Start with 10ms with the last bucket being [~88m, Inf).
Buckets: metrics.ExponentialBuckets(0.01, 2, 20),
StabilityLevel: metrics.STABLE,
DeprecatedVersion: "1.29.0",
},
[]string{"attempts"})

PodSchedulingSLIDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
@ -308,10 +294,19 @@ func InitMetrics() {
},
[]string{"result"})

SchedulerCacheSize = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduler_cache_size",
Help: "Number of nodes, pods, and assumed (bound) pods in the scheduler cache.",
StabilityLevel: metrics.ALPHA,
DeprecatedVersion: "1.33.0",
}, []string{"type"})

CacheSize = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduler_cache_size",
Name: "cache_size",
Help: "Number of nodes, pods, and assumed (bound) pods in the scheduler cache.",
StabilityLevel: metrics.ALPHA,
}, []string{"type"})
@ -359,7 +354,6 @@ func InitMetrics() {
PreemptionVictims,
PreemptionAttempts,
pendingPods,
PodSchedulingDuration,
PodSchedulingSLIDuration,
PodSchedulingAttempts,
FrameworkExtensionPointDuration,
@ -368,6 +362,7 @@ func InitMetrics() {
Goroutines,
PermitWaitDuration,
CacheSize,
SchedulerCacheSize,
unschedulableReasons,
PluginEvaluationTotal,
}
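As the hunk above shows, the old gauge is kept under its legacy name only as a deprecated alias (removal planned for v1.34), while the replacement is registered as scheduler_cache_size. A hedged sketch of the same dual-registration pattern with k8s.io/component-base/metrics; the subsystem, metric names, and variables below are illustrative, not the scheduler's own:

```go
package example

import (
	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

var (
	// oldGauge keeps the legacy name alive for one release, but marks it
	// deprecated so the metrics framework flags it in its help text.
	oldGauge = metrics.NewGaugeVec(&metrics.GaugeOpts{
		Subsystem:         "example",
		Name:              "legacy_cache_size",
		Help:              "Deprecated alias kept for one release.",
		StabilityLevel:    metrics.ALPHA,
		DeprecatedVersion: "1.33.0",
	}, []string{"type"})

	// newGauge is the replacement; dashboards should migrate to this name.
	newGauge = metrics.NewGaugeVec(&metrics.GaugeOpts{
		Subsystem:      "example",
		Name:           "cache_size",
		Help:           "Number of cached objects, keyed by type.",
		StabilityLevel: metrics.ALPHA,
	}, []string{"type"})
)

// register exposes both gauges so existing scrapes keep working during the migration.
func register() {
	legacyregistry.MustRegister(oldGauge, newGauge)
}
```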
2
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/profile/profile.go
generated
vendored
@ -22,7 +22,7 @@ import (
"errors"
"fmt"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp" //nolint:depguard

"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/scheme"
37
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/schedule_one.go
generated
vendored
@ -292,30 +292,19 @@ func (sched *Scheduler) bindingCycle(
return status
}

// Run "prebind" plugins.
if status := fwk.RunPreBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost); !status.IsSuccess() {
if status.IsRejected() {
fitErr := &framework.FitError{
NumAllNodes: 1,
Pod: assumedPodInfo.Pod,
Diagnosis: framework.Diagnosis{
NodeToStatus: framework.NewDefaultNodeToStatus(),
UnschedulablePlugins: sets.New(status.Plugin()),
},
}
fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, status)
return framework.NewStatus(status.Code()).WithError(fitErr)
}
return status
}

// Any failures after this point cannot lead to the Pod being considered unschedulable.
// We define the Pod as "unschedulable" only when Pods are rejected at specific extension points, and PreBind is the last one in the scheduling/binding cycle.
// We define the Pod as "unschedulable" only when Pods are rejected at specific extension points, and Permit is the last one in the scheduling/binding cycle.
// If a Pod fails on PreBind or Bind, it should be moved to BackoffQ for retry.
//
// We can call Done() here because
// we can free the cluster events stored in the scheduling queue sonner, which is worth for busy clusters memory consumption wise.
// we can free the cluster events stored in the scheduling queue sooner, which is worth for busy clusters memory consumption wise.
sched.SchedulingQueue.Done(assumedPod.UID)

// Run "prebind" plugins.
if status := fwk.RunPreBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost); !status.IsSuccess() {
return status
}

// Run "bind" plugins.
if status := sched.bind(ctx, fwk, assumedPod, scheduleResult.SuggestedHost, state); !status.IsSuccess() {
return status
@ -326,7 +315,6 @@ func (sched *Scheduler) bindingCycle(
metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(assumedPodInfo.Attempts))
if assumedPodInfo.InitialAttemptTimestamp != nil {
metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
}
// Run "postbind" plugins.
@ -1098,10 +1086,11 @@ func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framewo
msg := truncateMessage(errMsg)
fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)
if err := updatePod(ctx, sched.client, pod, &v1.PodCondition{
Type: v1.PodScheduled,
Status: v1.ConditionFalse,
Reason: reason,
Message: errMsg,
Type: v1.PodScheduled,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(&pod.Status, pod.Generation, v1.PodScheduled),
Status: v1.ConditionFalse,
Reason: reason,
Message: errMsg,
}, nominatingInfo); err != nil {
logger.Error(err, "Error updating pod", "pod", klog.KObj(pod))
}
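With the reordering above, SchedulingQueue.Done runs before PreBind, and a PreBind rejection is wrapped in a FitError so it is reported like an unschedulable pod rather than a plain internal error. An illustrative PreBind plugin whose rejection would take that new path; the plugin itself is hypothetical and not part of this commit:

```go
package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// capacityGate is a hypothetical PreBind plugin that re-checks a condition
// right before binding.
type capacityGate struct{}

var _ framework.PreBindPlugin = &capacityGate{}

func (c *capacityGate) Name() string { return "CapacityGate" }

func (c *capacityGate) PreBind(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
	// Returning an Unschedulable status counts as a rejection (status.IsRejected()),
	// which bindingCycle now converts into a FitError for the suggested node.
	return framework.NewStatus(framework.Unschedulable, "capacity changed before binding")
}
```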
32
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/scheduler.go
generated
vendored
@ -33,6 +33,7 @@ import (
clientset "k8s.io/client-go/kubernetes"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker"
"k8s.io/klog/v2"
configv1 "k8s.io/kube-scheduler/config/v1"
"k8s.io/kubernetes/pkg/features"
@ -50,6 +51,7 @@ import (
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/profile"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
"k8s.io/utils/clock"
)

const (
@ -116,6 +118,7 @@ func (sched *Scheduler) applyDefaultHandlers() {
}

type schedulerOptions struct {
clock clock.WithTicker
componentConfigVersion string
kubeConfig *restclient.Config
// Overridden by profile level percentageOfNodesToScore if set in v1.
@ -227,6 +230,13 @@ func WithExtenders(e ...schedulerapi.Extender) Option {
}
}

// WithClock sets clock for PriorityQueue, the default clock is clock.RealClock.
func WithClock(clock clock.WithTicker) Option {
return func(o *schedulerOptions) {
o.clock = clock
}
}

// FrameworkCapturer is used for registering a notify function in building framework.
type FrameworkCapturer func(schedulerapi.KubeSchedulerProfile)

@ -238,6 +248,7 @@ func WithBuildFrameworkCapturer(fc FrameworkCapturer) Option {
}

var defaultSchedulerOptions = schedulerOptions{
clock: clock.RealClock{},
percentageOfNodesToScore: schedulerapi.DefaultPercentageOfNodesToScore,
podInitialBackoffSeconds: int64(internalqueue.DefaultPodInitialBackoffDuration.Seconds()),
podMaxBackoffSeconds: int64(internalqueue.DefaultPodMaxBackoffDuration.Seconds()),
@ -297,11 +308,27 @@ func New(ctx context.Context,
waitingPods := frameworkruntime.NewWaitingPodsMap()

var resourceClaimCache *assumecache.AssumeCache
var resourceSliceTracker *resourceslicetracker.Tracker
var draManager framework.SharedDRAManager
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
resourceClaimInformer := informerFactory.Resource().V1beta1().ResourceClaims().Informer()
resourceClaimCache = assumecache.NewAssumeCache(logger, resourceClaimInformer, "ResourceClaim", "", nil)
draManager = dynamicresources.NewDRAManager(ctx, resourceClaimCache, informerFactory)
resourceSliceTrackerOpts := resourceslicetracker.Options{
EnableDeviceTaints: utilfeature.DefaultFeatureGate.Enabled(features.DRADeviceTaints),
SliceInformer: informerFactory.Resource().V1beta1().ResourceSlices(),
KubeClient: client,
}
// If device taints are disabled, the additional informers are not needed and
// the tracker turns into a simple wrapper around the slice informer.
if resourceSliceTrackerOpts.EnableDeviceTaints {
resourceSliceTrackerOpts.TaintInformer = informerFactory.Resource().V1alpha3().DeviceTaintRules()
resourceSliceTrackerOpts.ClassInformer = informerFactory.Resource().V1beta1().DeviceClasses()
}
resourceSliceTracker, err = resourceslicetracker.StartTracker(ctx, resourceSliceTrackerOpts)
if err != nil {
return nil, fmt.Errorf("couldn't start resource slice tracker: %w", err)
}
draManager = dynamicresources.NewDRAManager(ctx, resourceClaimCache, resourceSliceTracker, informerFactory)
}

profiles, err := profile.NewMap(ctx, options.profiles, registry, recorderFactory,
@ -343,6 +370,7 @@ func New(ctx context.Context,
podQueue := internalqueue.NewSchedulingQueue(
profiles[options.profiles[0].SchedulerName].QueueSortFunc(),
informerFactory,
internalqueue.WithClock(options.clock),
internalqueue.WithPodInitialBackoffDuration(time.Duration(options.podInitialBackoffSeconds)*time.Second),
internalqueue.WithPodMaxBackoffDuration(time.Duration(options.podMaxBackoffSeconds)*time.Second),
internalqueue.WithPodLister(podLister),
@ -378,7 +406,7 @@ func New(ctx context.Context,
sched.NextPod = podQueue.Pop
sched.applyDefaultHandlers()

if err = addAllEventHandlers(sched, informerFactory, dynInformerFactory, resourceClaimCache, unionedGVKs(queueingHintsPerProfile)); err != nil {
if err = addAllEventHandlers(sched, informerFactory, dynInformerFactory, resourceClaimCache, resourceSliceTracker, unionedGVKs(queueingHintsPerProfile)); err != nil {
return nil, fmt.Errorf("adding event handlers: %w", err)
}
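The new exported WithClock option above lets callers inject the clock used by the scheduling queue, which is mainly useful in tests. A minimal sketch, assuming the fake clock from k8s.io/utils/clock/testing satisfies clock.WithTicker; the helper function and its name are illustrative, and the remaining scheduler.New arguments are omitted:

```go
package example

import (
	"time"

	"k8s.io/kubernetes/pkg/scheduler"
	testingclock "k8s.io/utils/clock/testing"
)

// testClockOption returns the new option with a fake clock plugged in, which is
// handy when tests need to advance backoff timers deterministically.
func testClockOption() scheduler.Option {
	return scheduler.WithClock(testingclock.NewFakeClock(time.Now()))
}
```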