rebase: update replaced k8s.io modules to v0.33.0

Signed-off-by: Niels de Vos <ndevos@ibm.com>
This commit is contained in:
Niels de Vos
2025-05-07 13:13:33 +02:00
committed by mergify[bot]
parent dd77e72800
commit 107407b44b
1723 changed files with 65035 additions and 175239 deletions

View File

@ -17,4 +17,4 @@ limitations under the License.
// +k8s:deepcopy-gen=package
// +groupName=kubescheduler.config.k8s.io
package config // import "k8s.io/kubernetes/pkg/scheduler/apis/config"
package config

View File

@ -163,7 +163,7 @@ type VolumeBindingArgs struct {
// 1) 0 for 0 utilization
// 2) 10 for 100 utilization
// All points must be sorted in increasing order by utilization.
// +featureGate=VolumeCapacityPriority
// +featureGate=StorageCapacityScoring
// +optional
Shape []UtilizationShapePoint
}

View File

@ -192,15 +192,15 @@ func SetDefaults_VolumeBindingArgs(obj *configv1.VolumeBindingArgs) {
if obj.BindTimeoutSeconds == nil {
obj.BindTimeoutSeconds = ptr.To[int64](600)
}
if len(obj.Shape) == 0 && feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority) {
if len(obj.Shape) == 0 && feature.DefaultFeatureGate.Enabled(features.StorageCapacityScoring) {
obj.Shape = []configv1.UtilizationShapePoint{
{
Utilization: 0,
Score: 0,
Score: int32(config.MaxCustomPriorityScore),
},
{
Utilization: 100,
Score: int32(config.MaxCustomPriorityScore),
Score: 0,
},
}
}

View File

@ -21,4 +21,4 @@ limitations under the License.
// +k8s:defaulter-gen-input=k8s.io/kube-scheduler/config/v1
// +groupName=kubescheduler.config.k8s.io
package v1 // import "k8s.io/kubernetes/pkg/scheduler/apis/config/v1"
package v1

View File

@ -261,13 +261,13 @@ func ValidateNodeAffinityArgs(path *field.Path, args *config.NodeAffinityArgs) e
// VolumeBindingArgsValidationOptions contains the different settings for validation.
type VolumeBindingArgsValidationOptions struct {
AllowVolumeCapacityPriority bool
AllowStorageCapacityScoring bool
}
// ValidateVolumeBindingArgs validates that VolumeBindingArgs are set correctly.
func ValidateVolumeBindingArgs(path *field.Path, args *config.VolumeBindingArgs) error {
return ValidateVolumeBindingArgsWithOptions(path, args, VolumeBindingArgsValidationOptions{
AllowVolumeCapacityPriority: utilfeature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
AllowStorageCapacityScoring: utilfeature.DefaultFeatureGate.Enabled(features.StorageCapacityScoring),
})
}
@ -279,13 +279,13 @@ func ValidateVolumeBindingArgsWithOptions(path *field.Path, args *config.VolumeB
allErrs = append(allErrs, field.Invalid(path.Child("bindTimeoutSeconds"), args.BindTimeoutSeconds, "invalid BindTimeoutSeconds, should not be a negative value"))
}
if opts.AllowVolumeCapacityPriority {
if opts.AllowStorageCapacityScoring {
allErrs = append(allErrs, validateFunctionShape(args.Shape, path.Child("shape"))...)
} else if args.Shape != nil {
// When the feature is off, return an error if the config is not nil.
// This prevents unexpected configuration from taking effect when the
// feature turns on in the future.
allErrs = append(allErrs, field.Invalid(path.Child("shape"), args.Shape, "unexpected field `shape`, remove it or turn on the feature gate VolumeCapacityPriority"))
allErrs = append(allErrs, field.Invalid(path.Child("shape"), args.Shape, "unexpected field `shape`, remove it or turn on the feature gate StorageCapacityScoring"))
}
return allErrs.ToAggregate()
}

View File

@ -757,4 +757,12 @@ func (cache *cacheImpl) updateMetrics() {
metrics.CacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
metrics.CacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
metrics.CacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
// we intentionally keep them with the deprecation and will remove at v1.34.
//nolint:staticcheck
metrics.SchedulerCacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
//nolint:staticcheck
metrics.SchedulerCacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
//nolint:staticcheck
metrics.SchedulerCacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
}

View File

@ -20,6 +20,7 @@ import (
"container/list"
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
@ -61,14 +62,63 @@ type activeQueuer interface {
// underLock() method should be used to protect these methods.
type unlockedActiveQueuer interface {
unlockedActiveQueueReader
AddOrUpdate(pInfo *framework.QueuedPodInfo)
// add adds a new pod to the activeQ.
// The event should show which event triggered this addition and is used for the metric recording.
// This method should be called in activeQueue.underLock().
add(pInfo *framework.QueuedPodInfo, event string)
}
// unlockedActiveQueueReader defines activeQ read-only methods that are not protected by the lock itself.
// underLock() or underRLock() method should be used to protect these methods.
type unlockedActiveQueueReader interface {
Get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
Has(pInfo *framework.QueuedPodInfo) bool
// get returns the pod matching pInfo inside the activeQ.
// Returns false if the pInfo doesn't exist in the queue.
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
// has returns if pInfo exists in the queue.
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
has(pInfo *framework.QueuedPodInfo) bool
}
// unlockedActiveQueue defines activeQ methods that are not protected by the lock itself.
// The activeQueue.underLock() or activeQueue.underRLock() method should be used to protect these methods.
type unlockedActiveQueue struct {
// queue is the heap that the add, get and has methods delegate to.
queue *heap.Heap[*framework.QueuedPodInfo]
}
// newUnlockedActiveQueue wraps the given heap in an unlockedActiveQueue.
// The returned wrapper does no locking of its own.
func newUnlockedActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo]) *unlockedActiveQueue {
	uaq := new(unlockedActiveQueue)
	uaq.queue = queue
	return uaq
}
// add adds a new pod to the activeQ, or updates it if it is already there.
// The event should show which event triggered this addition and is used for the metric recording.
// This method should be called in activeQueue.underLock().
func (uaq *unlockedActiveQueue) add(pInfo *framework.QueuedPodInfo, event string) {
uaq.queue.AddOrUpdate(pInfo)
// Count the pod as incoming to the "active" queue, labelled with the triggering event.
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
}
// get returns the pod matching pInfo inside the activeQ.
// The boolean result is false if the pInfo doesn't exist in the queue.
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
func (uaq *unlockedActiveQueue) get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool) {
return uaq.queue.Get(pInfo)
}
// has reports whether pInfo exists in the queue.
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
func (uaq *unlockedActiveQueue) has(pInfo *framework.QueuedPodInfo) bool {
return uaq.queue.Has(pInfo)
}
// backoffQPopper defines the methods that are used to pop from the backoffQ when the activeQ is empty.
type backoffQPopper interface {
// popBackoff pops the pInfo from the podBackoffQ.
popBackoff() (*framework.QueuedPodInfo, error)
// lenBackoff returns the length of the podBackoffQ queue.
lenBackoff() int
}
// activeQueue implements activeQueuer. All of the fields have to be protected using the lock.
@ -77,15 +127,21 @@ type activeQueue struct {
// It protects activeQ, inFlightPods, inFlightEvents, schedulingCycle and closed fields.
// Caution: DO NOT take "SchedulingQueue.lock" after taking "lock".
// You should always take "SchedulingQueue.lock" first, otherwise the queue could end up in deadlock.
// "lock" should not be taken after taking "nLock".
// Correct locking order is: SchedulingQueue.lock > lock > nominator.nLock.
// "lock" should not be taken after taking "backoffQueue.lock" or "nominator.nLock".
// Correct locking order is: SchedulingQueue.lock > lock > backoffQueue.lock > nominator.nLock.
lock sync.RWMutex
// activeQ is heap structure that scheduler actively looks at to find pods to
// schedule. Head of heap is the highest priority pod.
queue *heap.Heap[*framework.QueuedPodInfo]
// unlockedQueue is a wrapper of queue providing methods that are not locked themselves
// and can be used in the underLock() or underRLock().
unlockedQueue *unlockedActiveQueue
// cond is a condition that is notified when the pod is added to activeQ.
// When SchedulerPopFromBackoffQ feature is enabled,
// condition is also notified when the pod is added to backoffQ.
// It is used with lock.
cond sync.Cond
@ -125,15 +181,21 @@ type activeQueue struct {
isSchedulingQueueHintEnabled bool
metricsRecorder metrics.MetricAsyncRecorder
// backoffQPopper is used to pop from backoffQ when activeQ is empty.
// It is non-nil only when SchedulerPopFromBackoffQ feature is enabled.
backoffQPopper backoffQPopper
}
func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueueHintEnabled bool, metricRecorder metrics.MetricAsyncRecorder) *activeQueue {
func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueueHintEnabled bool, metricRecorder metrics.MetricAsyncRecorder, backoffQPopper backoffQPopper) *activeQueue {
aq := &activeQueue{
queue: queue,
inFlightPods: make(map[types.UID]*list.Element),
inFlightEvents: list.New(),
isSchedulingQueueHintEnabled: isSchedulingQueueHintEnabled,
metricsRecorder: metricRecorder,
unlockedQueue: newUnlockedActiveQueue(queue),
backoffQPopper: backoffQPopper,
}
aq.cond.L = &aq.lock
@ -146,7 +208,7 @@ func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueu
func (aq *activeQueue) underLock(fn func(unlockedActiveQ unlockedActiveQueuer)) {
aq.lock.Lock()
defer aq.lock.Unlock()
fn(aq.queue)
fn(aq.unlockedQueue)
}
// underLock runs the fn function under the lock.RLock.
@ -155,7 +217,7 @@ func (aq *activeQueue) underLock(fn func(unlockedActiveQ unlockedActiveQueuer))
func (aq *activeQueue) underRLock(fn func(unlockedActiveQ unlockedActiveQueueReader)) {
aq.lock.RLock()
defer aq.lock.RUnlock()
fn(aq.queue)
fn(aq.unlockedQueue)
}
// update updates the pod in activeQ if oldPodInfo is already in the queue.
@ -191,7 +253,13 @@ func (aq *activeQueue) pop(logger klog.Logger) (*framework.QueuedPodInfo, error)
}
func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo, error) {
var pInfo *framework.QueuedPodInfo
for aq.queue.Len() == 0 {
// backoffQPopper is non-nil only if SchedulerPopFromBackoffQ feature is enabled.
// In case of non-empty backoffQ, try popping from there.
if aq.backoffQPopper != nil && aq.backoffQPopper.lenBackoff() != 0 {
break
}
// When the queue is empty, invocation of Pop() is blocked until new item is enqueued.
// When Close() is called, the p.closed is set and the condition is broadcast,
// which causes this loop to continue and return from the Pop().
@ -203,9 +271,18 @@ func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo
}
pInfo, err := aq.queue.Pop()
if err != nil {
return nil, err
if aq.backoffQPopper == nil {
return nil, err
}
// Try to pop from backoffQ when activeQ is empty.
pInfo, err = aq.backoffQPopper.popBackoff()
if err != nil {
return nil, err
}
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", framework.PopFromBackoffQ).Inc()
}
pInfo.Attempts++
pInfo.BackoffExpiration = time.Time{}
// In flight, no concurrent events yet.
if aq.isSchedulingQueueHintEnabled {
// If the pod is already in the map, we shouldn't overwrite the inFlightPods otherwise it'd lead to a memory leak.
@ -354,6 +431,12 @@ func (aq *activeQueue) done(pod types.UID) {
aq.lock.Lock()
defer aq.lock.Unlock()
aq.unlockedDone(pod)
}
// unlockedDone is used by the activeQueue internally and doesn't take the lock itself.
// It assumes the lock is already taken outside before the method is called.
func (aq *activeQueue) unlockedDone(pod types.UID) {
inFlightPod, ok := aq.inFlightPods[pod]
if !ok {
// This Pod is already done()ed.
@ -398,15 +481,15 @@ func (aq *activeQueue) done(pod types.UID) {
// close closes the activeQueue.
func (aq *activeQueue) close() {
aq.lock.Lock()
defer aq.lock.Unlock()
// We should call done() for all in-flight pods to clean up the inFlightEvents metrics.
// It's safe even if the binding cycle running asynchronously calls done() afterwards
// done() will just be a no-op.
for pod := range aq.inFlightPods {
aq.done(pod)
aq.unlockedDone(pod)
}
aq.lock.Lock()
aq.closed = true
aq.lock.Unlock()
}
// broadcast notifies the pop() operation that new pod(s) was added to the activeQueue.

View File

@ -0,0 +1,405 @@
/*
Copyright 2025 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/backend/heap"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/utils/clock"
)
// backoffQOrderingWindowDuration is the duration of an ordering window in the podBackoffQ.
// In each window, represented as a whole second, pods are ordered by priority.
// It is the same as the interval of flushing the pods from the podBackoffQ to the activeQ,
// so that whole windows are flushed at once.
// This applies only if the SchedulerPopFromBackoffQ feature is enabled.
// See the KEP-5142 (http://kep.k8s.io/5142) for rationale.
const backoffQOrderingWindowDuration = time.Second
// backoffQueuer is a wrapper for backoffQ related operations.
// Its methods that rely on the queues take the lock inside.
type backoffQueuer interface {
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
// If the pod backoff time is in the actual ordering window, it should still be backing off.
isPodBackingoff(podInfo *framework.QueuedPodInfo) bool
// popAllBackoffCompleted pops all pods from podBackoffQ and podErrorBackoffQ that completed backoff.
popAllBackoffCompleted(logger klog.Logger) []*framework.QueuedPodInfo
// podInitialBackoffDuration returns the initial backoff duration that a pod can get.
podInitialBackoffDuration() time.Duration
// podMaxBackoffDuration returns the maximum backoff duration that a pod can get.
podMaxBackoffDuration() time.Duration
// waitUntilAlignedWithOrderingWindow waits until the time reaches a multiple of backoffQOrderingWindowDuration.
// It then runs the f function at the backoffQOrderingWindowDuration interval using a ticker.
// It's important to align the flushing time, because podBackoffQ's ordering is based on the windows
// and whole windows have to be flushed at one time without a visible latency.
waitUntilAlignedWithOrderingWindow(f func(), stopCh <-chan struct{})
// add adds the pInfo to backoffQueue.
// The event should show which event triggered this addition and is used for the metric recording.
// It also ensures that pInfo is not in both queues.
add(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string)
// update updates the pod in backoffQueue if oldPodInfo is already in the queue.
// It returns the new pod info if updated, nil otherwise.
update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo
// delete deletes the pInfo from backoffQueue.
// It returns true if the pod was deleted.
delete(pInfo *framework.QueuedPodInfo) bool
// get returns the pInfo matching the given pInfoLookup, if it exists.
get(pInfoLookup *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
// has reports whether pInfo exists in the queue.
has(pInfo *framework.QueuedPodInfo) bool
// list returns all pods that are in the queue.
list() []*v1.Pod
// len returns the length of the queue.
len() int
}
// backoffQueue implements backoffQueuer and wraps two queues inside,
// providing seamless access as if it were one queue.
type backoffQueue struct {
// lock synchronizes all operations related to backoffQ.
// It protects both podBackoffQ and podErrorBackoffQ.
// Caution: DO NOT take "SchedulingQueue.lock" or "activeQueue.lock" after taking "lock".
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" first, otherwise the queue could end up in deadlock.
// "lock" should not be taken after taking "nominator.nLock".
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock > lock > nominator.nLock.
lock sync.RWMutex
// clock supplies the current time as well as the timer and ticker used to align flushing.
clock clock.WithTicker
// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
// are popped from this heap before the scheduler looks at activeQ.
// When isPopFromBackoffQEnabled is set, pods with equal backoff expiry are further ordered by activeQLessFn.
podBackoffQ *heap.Heap[*framework.QueuedPodInfo]
// podErrorBackoffQ is a heap ordered by error backoff expiry. Pods which have completed backoff
// are popped from this heap before the scheduler looks at activeQ.
podErrorBackoffQ *heap.Heap[*framework.QueuedPodInfo]
// podInitialBackoff is the backoff duration applied after the first scheduling attempt.
podInitialBackoff time.Duration
// podMaxBackoff is the value the backoff duration is capped at while it doubles per attempt.
podMaxBackoff time.Duration
// activeQLessFn is used as an eventual less function if two backoff times are equal,
// when the SchedulerPopFromBackoffQ feature is enabled.
activeQLessFn framework.LessFunc
// isPopFromBackoffQEnabled indicates whether the feature gate SchedulerPopFromBackoffQ is enabled.
isPopFromBackoffQEnabled bool
}
// newBackoffQueue builds a backoffQueue holding two heaps: podBackoffQ and podErrorBackoffQ.
// podErrorBackoffQ is always ordered by backoff expiry only. podBackoffQ uses the same
// ordering unless popFromBackoffQEnabled is set, in which case ties on the backoff expiry
// are broken with activeQLessFn.
func newBackoffQueue(clock clock.WithTicker, podInitialBackoffDuration time.Duration, podMaxBackoffDuration time.Duration, activeQLessFn framework.LessFunc, popFromBackoffQEnabled bool) *backoffQueue {
	bq := &backoffQueue{
		clock:                    clock,
		podInitialBackoff:        podInitialBackoffDuration,
		podMaxBackoff:            podMaxBackoffDuration,
		activeQLessFn:            activeQLessFn,
		isPopFromBackoffQEnabled: popFromBackoffQEnabled,
	}

	if popFromBackoffQEnabled {
		bq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, bq.lessBackoffCompletedWithPriority, metrics.NewBackoffPodsRecorder())
	} else {
		bq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, bq.lessBackoffCompleted, metrics.NewBackoffPodsRecorder())
	}
	bq.podErrorBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, bq.lessBackoffCompleted, metrics.NewBackoffPodsRecorder())

	return bq
}
// podInitialBackoffDuration returns the initial backoff duration that a pod can get,
// as configured when the backoffQueue was created.
func (bq *backoffQueue) podInitialBackoffDuration() time.Duration {
return bq.podInitialBackoff
}
// podMaxBackoffDuration returns the maximum backoff duration that a pod can get,
// as configured when the backoffQueue was created.
func (bq *backoffQueue) podMaxBackoffDuration() time.Duration {
return bq.podMaxBackoff
}
// alignToWindow maps t to the start of its podBackoffQ ordering window, i.e. it truncates
// t to a multiple of backoffQOrderingWindowDuration.
// When the SchedulerPopFromBackoffQ feature is disabled there are no windows
// and t is returned unchanged.
func (bq *backoffQueue) alignToWindow(t time.Time) time.Time {
	if bq.isPopFromBackoffQEnabled {
		return t.Truncate(backoffQOrderingWindowDuration)
	}
	return t
}
// waitUntilAlignedWithOrderingWindow waits until the time reaches a multiple of backoffQOrderingWindowDuration.
// It then runs the f function at the backoffQOrderingWindowDuration interval using a ticker.
// It's important to align the flushing time, because podBackoffQ's ordering is based on the windows
// and whole windows have to be flushed at one time without a visible latency.
//
// The call blocks until stopCh is closed. If stopCh is closed during the initial wait,
// f is never invoked. The ticker is stopped on every exit path.
func (bq *backoffQueue) waitUntilAlignedWithOrderingWindow(f func(), stopCh <-chan struct{}) {
	now := bq.clock.Now()
	// Wait until the time reaches the multiple of backoffQOrderingWindowDuration.
	// When windows are disabled, alignToWindow returns its argument unchanged,
	// so this waits exactly one backoffQOrderingWindowDuration.
	durationToNextWindow := bq.alignToWindow(now.Add(backoffQOrderingWindowDuration)).Sub(now)
	timer := bq.clock.NewTimer(durationToNextWindow)
	select {
	case <-stopCh:
		timer.Stop()
		return
	case <-timer.C():
	}

	// Run a ticker to make sure the invocations of f function
	// are aligned with the backoffQ's ordering window.
	ticker := bq.clock.NewTicker(backoffQOrderingWindowDuration)
	// Release the ticker no matter which return below is taken (or if f panics).
	// Previously the stop check at the top of the loop returned without stopping it.
	defer ticker.Stop()

	for {
		select {
		case <-stopCh:
			return
		default:
		}

		f()

		// NOTE: b/c there is no priority selection in golang
		// it is possible for this to race, meaning we could
		// trigger ticker.C and stopCh, and ticker.C select falls through.
		// In order to mitigate we re-check stopCh at the beginning
		// of every loop to prevent extra executions of f().
		select {
		case <-stopCh:
			return
		case <-ticker.C():
		}
	}
}
// lessBackoffCompletedWithPriority is the less function of podBackoffQ when the
// SchedulerPopFromBackoffQ feature is enabled.
// Pods are ordered by backoff expiry first. Pods whose backoff expiry is equal
// (i.e. that fall into the same ordering window) are ordered the same way the activeQ orders them,
// which improves the popping order from backoffQ when activeQ is empty.
func (bq *backoffQueue) lessBackoffCompletedWithPriority(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
	expiry1, expiry2 := bq.getBackoffTime(pInfo1), bq.getBackoffTime(pInfo2)
	if expiry1.Equal(expiry2) {
		// Same backoff time: fall back to the activeQ ordering.
		return bq.activeQLessFn(pInfo1, pInfo2)
	}
	return expiry1.Before(expiry2)
}
// lessBackoffCompleted orders pods purely by the time their backoff completes.
// It is the less function of podErrorBackoffQ, and of podBackoffQ
// when the SchedulerPopFromBackoffQ feature is disabled.
func (bq *backoffQueue) lessBackoffCompleted(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
	return bq.getBackoffTime(pInfo1).Before(bq.getBackoffTime(pInfo2))
}
// isPodBackingoff reports whether a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
// A pod whose backoff time falls into the current ordering window is still considered backing off.
func (bq *backoffQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
	backoffExpiry := bq.getBackoffTime(podInfo)
	currentWindow := bq.alignToWindow(bq.clock.Now())
	// "Not before" rather than "after": equality with the current window must count as backing off.
	return !backoffExpiry.Before(currentWindow)
}
// getBackoffTime returns the time at which podInfo completes its backoff.
// The result is cached in podInfo.BackoffExpiration and returned from there in subsequent calls.
// The cache is cleared when this pod is popped from the scheduling queue again (i.e., at activeQ's pop),
// because the backoff time is calculated based on podInfo.Attempts,
// which doesn't get changed until the pod's scheduling is retried.
func (bq *backoffQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
	if podInfo.Attempts == 0 {
		// A pod that was never tried has no backoff. Nothing is cached, so that
		// isPodBackingoff treats such a pod as not backing off.
		return time.Time{}
	}
	if !podInfo.BackoffExpiration.IsZero() {
		return podInfo.BackoffExpiration
	}
	expiry := bq.alignToWindow(podInfo.Timestamp.Add(bq.calculateBackoffDuration(podInfo)))
	podInfo.BackoffExpiration = expiry
	return expiry
}
// calculateBackoffDuration computes the backoff duration from the number of scheduling
// attempts the pod has made: podInitialBackoff for the first attempt, doubled for each
// further attempt, and capped at podMaxBackoff once doubling would exceed it.
func (bq *backoffQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
	attempts := podInfo.Attempts
	if attempts == 0 {
		// A pod without any scheduling attempt gets no backoff penalty at all.
		return 0
	}

	backoff := bq.podInitialBackoff
	for n := 1; n < attempts; n++ {
		// Compare against (max - backoff) instead of doubling first, to avoid overflow.
		if backoff > bq.podMaxBackoff-backoff {
			return bq.podMaxBackoff
		}
		backoff *= 2
	}
	return backoff
}
// popAllBackoffCompletedWithQueue pops pods from the head of the given queue for as long as
// the head pod has completed its backoff, and returns them in popping order.
// It stops at the first pod that is still backing off, when the queue is empty,
// or when popping fails. It does not take the lock itself.
func (bq *backoffQueue) popAllBackoffCompletedWithQueue(logger klog.Logger, queue *heap.Heap[*framework.QueuedPodInfo]) []*framework.QueuedPodInfo {
	var completed []*framework.QueuedPodInfo
	for {
		head, ok := queue.Peek()
		if !ok || head == nil || bq.isPodBackingoff(head) {
			break
		}
		if _, err := queue.Pop(); err != nil {
			logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(head.Pod))
			break
		}
		completed = append(completed, head)
	}
	return completed
}
// popAllBackoffCompleted pops all pods that completed backoff from podBackoffQ and podErrorBackoffQ.
// The returned slice lists the pods from podBackoffQ first, followed by those from podErrorBackoffQ.
func (bq *backoffQueue) popAllBackoffCompleted(logger klog.Logger) []*framework.QueuedPodInfo {
	bq.lock.Lock()
	defer bq.lock.Unlock()

	// Both queues are always drained, regardless of what the first one yields.
	fromBackoffQ := bq.popAllBackoffCompletedWithQueue(logger, bq.podBackoffQ)
	fromErrorBackoffQ := bq.popAllBackoffCompletedWithQueue(logger, bq.podErrorBackoffQ)
	return append(fromBackoffQ, fromErrorBackoffQ...)
}
// add adds the pInfo to backoffQueue.
// A pod with neither unschedulable plugins nor pending plugins failed because of an error
// and goes to podErrorBackoffQ; any other pod goes to podBackoffQ.
// The event should show which event triggered this addition and is used for the metric recording.
// It also ensures that pInfo is not in both queues: if the pod was found in the other queue,
// it is removed from there, an error is logged and the incoming-pods metric is not recorded.
func (bq *backoffQueue) add(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) {
	bq.lock.Lock()
	defer bq.lock.Unlock()

	// Default: regular backoff. The "other" queue is the one that must not hold the pod afterwards.
	target, other := bq.podBackoffQ, bq.podErrorBackoffQ
	alreadyQueuedMsg := "BackoffQueue add() was called with a pod that was already in the podErrorBackoffQ"
	if pInfo.UnschedulablePlugins.Len() == 0 && pInfo.PendingPlugins.Len() == 0 {
		target, other = bq.podErrorBackoffQ, bq.podBackoffQ
		alreadyQueuedMsg = "BackoffQueue add() was called with a pod that was already in the podBackoffQ"
	}

	target.AddOrUpdate(pInfo)
	// A successful delete means the pod was unexpectedly present in the other queue.
	if err := other.Delete(pInfo); err == nil {
		logger.Error(nil, alreadyQueuedMsg, "pod", klog.KObj(pInfo.Pod))
		return
	}
	metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
}
// update updates the pod in backoffQueue if oldPodInfo is already in the queue.
// podBackoffQ is searched first, then podErrorBackoffQ; the pod is updated in the queue it was found in.
// It returns the updated pod info, or nil if the pod is in neither queue.
func (bq *backoffQueue) update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo {
	bq.lock.Lock()
	defer bq.lock.Unlock()

	for _, q := range [...]*heap.Heap[*framework.QueuedPodInfo]{bq.podBackoffQ, bq.podErrorBackoffQ} {
		found, exists := q.Get(oldPodInfo)
		if !exists {
			continue
		}
		_ = found.Update(newPod)
		q.AddOrUpdate(found)
		return found
	}
	return nil
}
// delete removes the pInfo from backoffQueue.
// podBackoffQ is tried first; podErrorBackoffQ is only tried if that deletion failed.
// It returns true if the pod was deleted from either queue.
func (bq *backoffQueue) delete(pInfo *framework.QueuedPodInfo) bool {
	bq.lock.Lock()
	defer bq.lock.Unlock()

	return bq.podBackoffQ.Delete(pInfo) == nil || bq.podErrorBackoffQ.Delete(pInfo) == nil
}
// popBackoff pops the pInfo from the podBackoffQ, taking the backoffQueue lock for the duration of the call.
// It returns an error if the queue is empty.
// This doesn't pop the pods from the podErrorBackoffQ.
func (bq *backoffQueue) popBackoff() (*framework.QueuedPodInfo, error) {
bq.lock.Lock()
defer bq.lock.Unlock()
return bq.podBackoffQ.Pop()
}
// get returns the pInfo matching the given pInfoLookup, if it exists.
// podBackoffQ is consulted first and podErrorBackoffQ second.
func (bq *backoffQueue) get(pInfoLookup *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool) {
	bq.lock.RLock()
	defer bq.lock.RUnlock()

	if found, ok := bq.podBackoffQ.Get(pInfoLookup); ok {
		return found, true
	}
	return bq.podErrorBackoffQ.Get(pInfoLookup)
}
// has reports whether pInfo exists in either podBackoffQ or podErrorBackoffQ.
func (bq *backoffQueue) has(pInfo *framework.QueuedPodInfo) bool {
	bq.lock.RLock()
	defer bq.lock.RUnlock()

	if bq.podBackoffQ.Has(pInfo) {
		return true
	}
	return bq.podErrorBackoffQ.Has(pInfo)
}
// list returns all pods that are in the queue: those of podBackoffQ first,
// followed by those of podErrorBackoffQ. The result is nil when both queues are empty.
func (bq *backoffQueue) list() []*v1.Pod {
	bq.lock.RLock()
	defer bq.lock.RUnlock()

	var pods []*v1.Pod
	for _, q := range [...]*heap.Heap[*framework.QueuedPodInfo]{bq.podBackoffQ, bq.podErrorBackoffQ} {
		for _, queued := range q.List() {
			pods = append(pods, queued.Pod)
		}
	}
	return pods
}
// len returns the length of the queue, i.e. the number of pods in podBackoffQ and podErrorBackoffQ combined.
func (bq *backoffQueue) len() int {
bq.lock.RLock()
defer bq.lock.RUnlock()
return bq.podBackoffQ.Len() + bq.podErrorBackoffQ.Len()
}
// lenBackoff returns the length of the podBackoffQ only; pods in podErrorBackoffQ are not counted.
func (bq *backoffQueue) lenBackoff() int {
bq.lock.RLock()
defer bq.lock.RUnlock()
return bq.podBackoffQ.Len()
}

View File

@ -35,10 +35,10 @@ import (
type nominator struct {
// nLock synchronizes all operations related to nominator.
// It should not be used anywhere else.
// Caution: DO NOT take ("SchedulingQueue.lock" or "activeQueue.lock") after taking "nLock".
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" first,
// Caution: DO NOT take ("SchedulingQueue.lock" or "activeQueue.lock" or "backoffQueue.lock") after taking "nLock".
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" and "backoffQueue.lock" first,
// otherwise the nominator could end up in deadlock.
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock > nLock.
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock > backoffQueue.lock > nLock.
nLock sync.RWMutex
// podLister is used to verify if the given pod is alive.

View File

@ -132,6 +132,9 @@ type SchedulingQueue interface {
PendingPods() ([]*v1.Pod, string)
InFlightPods() []*v1.Pod
PodsInActiveQ() []*v1.Pod
// PodsInBackoffQ returns all the Pods in the backoffQ.
PodsInBackoffQ() []*v1.Pod
UnschedulablePods() []*v1.Pod
}
// NewSchedulingQueue initializes a priority queue as a new scheduling queue.
@ -155,24 +158,18 @@ type PriorityQueue struct {
*nominator
stop chan struct{}
clock clock.Clock
clock clock.WithTicker
// lock takes precedence and should be taken first,
// before any other locks in the queue (activeQueue.lock or nominator.nLock).
// Correct locking order is: lock > activeQueue.lock > nominator.nLock.
// before any other locks in the queue (activeQueue.lock or backoffQueue.lock or nominator.nLock).
// Correct locking order is: lock > activeQueue.lock > backoffQueue.lock > nominator.nLock.
lock sync.RWMutex
// pod initial backoff duration.
podInitialBackoffDuration time.Duration
// pod maximum backoff duration.
podMaxBackoffDuration time.Duration
// the maximum time a pod can stay in the unschedulablePods.
podMaxInUnschedulablePodsDuration time.Duration
activeQ activeQueuer
// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
// are popped from this heap before the scheduler looks at activeQ
podBackoffQ *heap.Heap[*framework.QueuedPodInfo]
activeQ activeQueuer
backoffQ backoffQueuer
// unschedulablePods holds pods that have been tried and determined unschedulable.
unschedulablePods *UnschedulablePods
// moveRequestCycle caches the sequence number of scheduling cycle when we
@ -195,6 +192,8 @@ type PriorityQueue struct {
// isSchedulingQueueHintEnabled indicates whether the feature gate for the scheduling queue is enabled.
isSchedulingQueueHintEnabled bool
// isPopFromBackoffQEnabled indicates whether the feature gate SchedulerPopFromBackoffQ is enabled.
isPopFromBackoffQEnabled bool
}
// QueueingHintFunction is the wrapper of QueueingHintFn that has PluginName.
@ -213,7 +212,7 @@ type clusterEvent struct {
}
type priorityQueueOptions struct {
clock clock.Clock
clock clock.WithTicker
podInitialBackoffDuration time.Duration
podMaxBackoffDuration time.Duration
podMaxInUnschedulablePodsDuration time.Duration
@ -228,7 +227,7 @@ type priorityQueueOptions struct {
type Option func(*priorityQueueOptions)
// WithClock sets clock for PriorityQueue, the default clock is clock.RealClock.
func WithClock(clock clock.Clock) Option {
func WithClock(clock clock.WithTicker) Option {
return func(o *priorityQueueOptions) {
o.clock = clock
}
@ -331,14 +330,14 @@ func NewPriorityQueue(
}
isSchedulingQueueHintEnabled := utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints)
isPopFromBackoffQEnabled := utilfeature.DefaultFeatureGate.Enabled(features.SchedulerPopFromBackoffQ)
backoffQ := newBackoffQueue(options.clock, options.podInitialBackoffDuration, options.podMaxBackoffDuration, lessFn, isPopFromBackoffQEnabled)
pq := &PriorityQueue{
clock: options.clock,
stop: make(chan struct{}),
podInitialBackoffDuration: options.podInitialBackoffDuration,
podMaxBackoffDuration: options.podMaxBackoffDuration,
podMaxInUnschedulablePodsDuration: options.podMaxInUnschedulablePodsDuration,
activeQ: newActiveQueue(heap.NewWithRecorder(podInfoKeyFunc, heap.LessFunc[*framework.QueuedPodInfo](lessFn), metrics.NewActivePodsRecorder()), isSchedulingQueueHintEnabled, options.metricsRecorder),
backoffQ: backoffQ,
unschedulablePods: newUnschedulablePods(metrics.NewUnschedulablePodsRecorder(), metrics.NewGatedPodsRecorder()),
preEnqueuePluginMap: options.preEnqueuePluginMap,
queueingHintMap: options.queueingHintMap,
@ -346,19 +345,24 @@ func NewPriorityQueue(
pluginMetricsSamplePercent: options.pluginMetricsSamplePercent,
moveRequestCycle: -1,
isSchedulingQueueHintEnabled: isSchedulingQueueHintEnabled,
isPopFromBackoffQEnabled: isPopFromBackoffQEnabled,
}
pq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, pq.podsCompareBackoffCompleted, metrics.NewBackoffPodsRecorder())
var backoffQPopper backoffQPopper
if isPopFromBackoffQEnabled {
backoffQPopper = backoffQ
}
pq.activeQ = newActiveQueue(heap.NewWithRecorder(podInfoKeyFunc, heap.LessFunc[*framework.QueuedPodInfo](lessFn), metrics.NewActivePodsRecorder()), isSchedulingQueueHintEnabled, options.metricsRecorder, backoffQPopper)
pq.nsLister = informerFactory.Core().V1().Namespaces().Lister()
pq.nominator = newPodNominator(options.podLister)
return pq
}
// Run starts the goroutine to pump from podBackoffQ to activeQ
// Run starts the goroutine to pump from backoffQ to activeQ
func (p *PriorityQueue) Run(logger klog.Logger) {
go wait.Until(func() {
go p.backoffQ.waitUntilAlignedWithOrderingWindow(func() {
p.flushBackoffQCompleted(logger)
}, 1.0*time.Second, p.stop)
}, p.stop)
go wait.Until(func() {
p.flushUnschedulablePodsLeftover(logger)
}, 30*time.Second, p.stop)
@ -553,25 +557,33 @@ func (p *PriorityQueue) runPreEnqueuePlugin(ctx context.Context, pl framework.Pr
return s
}
// moveToActiveQ tries to add pod to active queue and remove it from unschedulable and backoff queues.
// It returns 2 parameters:
// 1. a boolean flag to indicate whether the pod is added successfully.
// 2. an error for the caller to act on.
// moveToActiveQ tries to add the pod to the active queue.
// If the pod doesn't pass PreEnqueue plugins, it gets added to unschedulablePods instead.
// It returns a boolean flag to indicate whether the pod is added successfully.
func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) bool {
gatedBefore := pInfo.Gated
pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
// If SchedulerPopFromBackoffQ feature gate is enabled,
// PreEnqueue plugins were called when the pod was added to the backoffQ.
// Don't need to repeat it here when the pod is directly moved from the backoffQ.
if !p.isPopFromBackoffQEnabled || event != framework.BackoffComplete {
pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
}
added := false
p.activeQ.underLock(func(unlockedActiveQ unlockedActiveQueuer) {
if pInfo.Gated {
// Add the Pod to unschedulablePods if it's not passing PreEnqueuePlugins.
if unlockedActiveQ.Has(pInfo) {
if unlockedActiveQ.has(pInfo) {
return
}
if p.podBackoffQ.Has(pInfo) {
if p.backoffQ.has(pInfo) {
return
}
p.unschedulablePods.addOrUpdate(pInfo)
if p.unschedulablePods.get(pInfo.Pod) != nil {
return
}
p.unschedulablePods.addOrUpdate(pInfo, event)
logger.V(5).Info("Pod moved to an internal scheduling queue, because the pod is gated", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", unschedulablePods)
return
}
if pInfo.InitialAttemptTimestamp == nil {
@ -579,13 +591,12 @@ func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.Queue
pInfo.InitialAttemptTimestamp = &now
}
unlockedActiveQ.AddOrUpdate(pInfo)
unlockedActiveQ.add(pInfo, event)
added = true
p.unschedulablePods.delete(pInfo.Pod, gatedBefore)
_ = p.podBackoffQ.Delete(pInfo) // Don't need to react when pInfo is not found.
p.backoffQ.delete(pInfo)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", activeQ)
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
if event == framework.EventUnscheduledPodAdd.Label() || event == framework.EventUnscheduledPodUpdate.Label() {
p.AddNominatedPod(logger, pInfo.PodInfo, nil)
}
@ -593,6 +604,28 @@ func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.Queue
return added
}
// moveToBackoffQ attempts to place pInfo into the backoff queue and reports
// whether it ended up there.
//
// When the SchedulerPopFromBackoffQ feature gate is enabled, the PreEnqueue
// plugins are evaluated here, at insertion time. A pod rejected by them is
// marked as gated and parked in unschedulablePods (unless it is already
// tracked there), and false is returned.
// The event is forwarded to the queue the pod lands in and included in the log entry.
func (p *PriorityQueue) moveToBackoffQ(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) bool {
	if p.isPopFromBackoffQEnabled {
		// Run PreEnqueue now; the gated state is recorded on the pod info.
		passed := p.runPreEnqueuePlugins(context.Background(), pInfo)
		pInfo.Gated = !passed
		if !passed {
			alreadyTracked := p.unschedulablePods.get(pInfo.Pod) != nil
			if !alreadyTracked {
				p.unschedulablePods.addOrUpdate(pInfo, event)
				logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", unschedulablePods)
			}
			return false
		}
	}

	p.backoffQ.add(logger, pInfo, event)
	logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", backoffQ)
	return true
}
// Add adds a pod to the active queue. It should be called only when a new pod
// is added so there is no chance the pod is already in active/unschedulable/backoff queues
func (p *PriorityQueue) Add(logger klog.Logger, pod *v1.Pod) {
@ -641,10 +674,16 @@ func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
// If the pod doesn't belong to unschedulablePods or backoffQ, don't activate it.
// The pod can be already in activeQ.
var exists bool
pInfo, exists = p.podBackoffQ.Get(newQueuedPodInfoForLookup(pod))
pInfo, exists = p.backoffQ.get(newQueuedPodInfoForLookup(pod))
if !exists {
return false
}
// Delete pod from the backoffQ now to make sure it won't be popped from the backoffQ
// just before moving it to the activeQ
if deleted := p.backoffQ.delete(pInfo); !deleted {
// Pod was popped from the backoffQ in the meantime. Don't activate it.
return false
}
}
if pInfo == nil {
@ -656,13 +695,6 @@ func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
return p.moveToActiveQ(logger, pInfo, framework.ForceActivate)
}
// isPodBackingoff reports whether the pod's backoff period, as computed by
// getBackoffTime, ends after the current time of the queue's clock.
// A true result means the pod should not be re-tried yet.
func (p *PriorityQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
	backoffEnds := p.getBackoffTime(podInfo)
	now := p.clock.Now()
	return now.Before(backoffEnds)
}
// SchedulingCycle returns current scheduling cycle.
func (p *PriorityQueue) SchedulingCycle() int64 {
return p.activeQ.schedulingCycle()
@ -712,7 +744,7 @@ func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger
// addUnschedulableWithoutQueueingHint inserts a pod that cannot be scheduled into
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
// request, then the pod is put in `podBackoffQ`.
// request, then the pod is put in `backoffQ`.
// TODO: This function is called only when p.isSchedulingQueueHintEnabled is false,
// and this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
@ -736,13 +768,14 @@ func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger,
// - No unschedulable plugins are associated with this Pod,
// meaning something unusual (a temporal failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue.
// In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
p.podBackoffQ.AddOrUpdate(pInfo)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", backoffQ)
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", framework.ScheduleAttemptFailure).Inc()
if added := p.moveToBackoffQ(logger, pInfo, framework.ScheduleAttemptFailure); added {
if p.isPopFromBackoffQEnabled {
p.activeQ.broadcast()
}
}
} else {
p.unschedulablePods.addOrUpdate(pInfo)
p.unschedulablePods.addOrUpdate(pInfo, framework.ScheduleAttemptFailure)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", unschedulablePods)
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", framework.ScheduleAttemptFailure).Inc()
}
return nil
@ -751,7 +784,7 @@ func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger,
// AddUnschedulableIfNotPresent inserts a pod that cannot be scheduled into
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
// request, then the pod is put in `podBackoffQ`.
// request, then the pod is put in `backoffQ`.
func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
p.lock.Lock()
defer p.lock.Unlock()
@ -767,7 +800,7 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *
if p.activeQ.has(pInfo) {
return fmt.Errorf("Pod %v is already present in the active queue", klog.KObj(pod))
}
if p.podBackoffQ.Has(pInfo) {
if p.backoffQ.has(pInfo) {
return fmt.Errorf("Pod %v is already present in the backoff queue", klog.KObj(pod))
}
@ -792,7 +825,7 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *
// In this case, we try to requeue this Pod to activeQ/backoffQ.
queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, framework.ScheduleAttemptFailure)
logger.V(3).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", queue, "schedulingCycle", podSchedulingCycle, "hint", schedulingHint, "unschedulable plugins", rejectorPlugins)
if queue == activeQ {
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
// When the Pod is moved to activeQ (or to backoffQ while popping from backoffQ is enabled), need to let p.cond know so that the Pod will be pop()ed out.
p.activeQ.broadcast()
}
@ -805,25 +838,12 @@ func (p *PriorityQueue) flushBackoffQCompleted(logger klog.Logger) {
p.lock.Lock()
defer p.lock.Unlock()
activated := false
for {
pInfo, ok := p.podBackoffQ.Peek()
if !ok || pInfo == nil {
break
}
pod := pInfo.Pod
if p.isPodBackingoff(pInfo) {
break
}
_, err := p.podBackoffQ.Pop()
if err != nil {
logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
break
}
podsCompletedBackoff := p.backoffQ.popAllBackoffCompleted(logger)
for _, pInfo := range podsCompletedBackoff {
if added := p.moveToActiveQ(logger, pInfo, framework.BackoffComplete); added {
activated = true
}
}
if activated {
p.activeQ.broadcast()
}
@ -928,10 +948,8 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
}
// If the pod is in the backoff queue, update it there.
if pInfo, exists := p.podBackoffQ.Get(oldPodInfo); exists {
_ = pInfo.Update(newPod)
if pInfo := p.backoffQ.update(newPod, oldPodInfo); pInfo != nil {
p.UpdateNominatedPod(logger, oldPod, pInfo.PodInfo)
p.podBackoffQ.AddOrUpdate(pInfo)
return
}
}
@ -953,7 +971,7 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
logger.V(5).Info("Pod moved to an internal scheduling queue because the Pod is updated", "pod", klog.KObj(newPod), "event", evt.Label(), "queue", queue)
p.unschedulablePods.delete(pInfo.Pod, gated)
}
if queue == activeQ {
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
p.activeQ.broadcast()
break
}
@ -961,21 +979,26 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
return
}
if isPodUpdated(oldPod, newPod) {
if p.isPodBackingoff(pInfo) {
p.podBackoffQ.AddOrUpdate(pInfo)
p.unschedulablePods.delete(pInfo.Pod, gated)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", framework.EventUnscheduledPodUpdate.Label(), "queue", backoffQ)
// Pod might have completed its backoff time while being in unschedulablePods,
// so we should check isPodBackingoff before moving the pod to backoffQ.
if p.backoffQ.isPodBackingoff(pInfo) {
if added := p.moveToBackoffQ(logger, pInfo, framework.EventUnscheduledPodUpdate.Label()); added {
p.unschedulablePods.delete(pInfo.Pod, gated)
if p.isPopFromBackoffQEnabled {
p.activeQ.broadcast()
}
}
return
}
if added := p.moveToActiveQ(logger, pInfo, framework.BackoffComplete); added {
if added := p.moveToActiveQ(logger, pInfo, framework.EventUnscheduledPodUpdate.Label()); added {
p.activeQ.broadcast()
}
return
}
// Pod update didn't make it schedulable, keep it in the unschedulable queue.
p.unschedulablePods.addOrUpdate(pInfo)
p.unschedulablePods.addOrUpdate(pInfo, framework.EventUnscheduledPodUpdate.Label())
return
}
// If pod is not in any of the queues, we put it in the active queue.
@ -992,12 +1015,14 @@ func (p *PriorityQueue) Delete(pod *v1.Pod) {
defer p.lock.Unlock()
p.DeleteNominatedPodIfExists(pod)
pInfo := newQueuedPodInfoForLookup(pod)
if err := p.activeQ.delete(pInfo); err != nil {
// The item was probably not found in the activeQ.
p.podBackoffQ.Delete(pInfo)
if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
p.unschedulablePods.delete(pod, pInfo.Gated)
}
if err := p.activeQ.delete(pInfo); err == nil {
return
}
if deleted := p.backoffQ.delete(pInfo); deleted {
return
}
if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
p.unschedulablePods.delete(pod, pInfo.Gated)
}
}
@ -1065,28 +1090,24 @@ func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(logger klog.Logger, event
// NOTE: this function assumes lock has been acquired in caller
func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, strategy queueingStrategy, event string) string {
if strategy == queueSkip {
p.unschedulablePods.addOrUpdate(pInfo)
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
p.unschedulablePods.addOrUpdate(pInfo, event)
return unschedulablePods
}
if strategy == queueAfterBackoff && p.isPodBackingoff(pInfo) {
p.podBackoffQ.AddOrUpdate(pInfo)
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
return backoffQ
// Pod might have completed its backoff time while being in unschedulablePods,
// so we should check isPodBackingoff before moving the pod to backoffQ.
if strategy == queueAfterBackoff && p.backoffQ.isPodBackingoff(pInfo) {
if added := p.moveToBackoffQ(logger, pInfo, event); added {
return backoffQ
}
return unschedulablePods
}
// Reach here if schedulingHint is QueueImmediately, or schedulingHint is Queue but the pod is not backing off.
if added := p.moveToActiveQ(logger, pInfo, event); added {
return activeQ
}
if pInfo.Gated {
// In case the pod is gated, the Pod is pushed back to unschedulable Pods pool in moveToActiveQ.
return unschedulablePods
}
p.unschedulablePods.addOrUpdate(pInfo)
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", framework.ScheduleAttemptFailure).Inc()
// Pod is gated. We don't have to push it back to unschedulable queue, because moveToActiveQ should already have done that.
return unschedulablePods
}
@ -1128,7 +1149,7 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn
p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated)
queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, event.Label())
logger.V(4).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event.Label(), "queue", queue, "hint", schedulingHint)
if queue == activeQ {
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
activated = true
}
}
@ -1180,6 +1201,20 @@ func (p *PriorityQueue) PodsInActiveQ() []*v1.Pod {
return p.activeQ.list()
}
// PodsInBackoffQ returns every Pod currently listed by the backoffQ.
func (p *PriorityQueue) PodsInBackoffQ() []*v1.Pod {
	backoffPods := p.backoffQ.list()
	return backoffPods
}
// UnschedulablePods returns the Pods of all entries held in the
// unschedulablePods map. The result is nil when the map is empty, and its
// order follows Go map iteration order (i.e. it is not stable).
// NOTE(review): this method does not take p.lock itself — confirm callers hold it.
func (p *PriorityQueue) UnschedulablePods() []*v1.Pod {
	var pods []*v1.Pod
	for key := range p.unschedulablePods.podInfoMap {
		pods = append(pods, p.unschedulablePods.podInfoMap[key].Pod)
	}
	return pods
}
var pendingPodsSummary = "activeQ:%v; backoffQ:%v; unschedulablePods:%v"
// GetPod searches for a pod in the activeQ, backoffQ, and unschedulablePods.
@ -1197,7 +1232,7 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
},
},
}
if pInfo, ok = p.podBackoffQ.Get(pInfoLookup); ok {
if pInfo, ok = p.backoffQ.get(pInfoLookup); ok {
return pInfo, true
}
if pInfo = p.unschedulablePods.get(pInfoLookup.Pod); pInfo != nil {
@ -1205,7 +1240,7 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
}
p.activeQ.underRLock(func(unlockedActiveQ unlockedActiveQueueReader) {
pInfo, ok = unlockedActiveQ.Get(pInfoLookup)
pInfo, ok = unlockedActiveQ.get(pInfoLookup)
})
return
}
@ -1216,15 +1251,15 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
func (p *PriorityQueue) PendingPods() ([]*v1.Pod, string) {
p.lock.RLock()
defer p.lock.RUnlock()
result := p.activeQ.list()
result := p.PodsInActiveQ()
activeQLen := len(result)
for _, pInfo := range p.podBackoffQ.List() {
result = append(result, pInfo.Pod)
}
backoffQPods := p.PodsInBackoffQ()
backoffQLen := len(backoffQPods)
result = append(result, backoffQPods...)
for _, pInfo := range p.unschedulablePods.podInfoMap {
result = append(result, pInfo.Pod)
}
return result, fmt.Sprintf(pendingPodsSummary, activeQLen, p.podBackoffQ.Len(), len(p.unschedulablePods.podInfoMap))
return result, fmt.Sprintf(pendingPodsSummary, activeQLen, backoffQLen, len(p.unschedulablePods.podInfoMap))
}
// Note: this function assumes the caller locks both p.lock.RLock and p.activeQ.getLock().RLock.
@ -1232,7 +1267,7 @@ func (p *PriorityQueue) nominatedPodToInfo(np podRef, unlockedActiveQ unlockedAc
pod := np.toPod()
pInfoLookup := newQueuedPodInfoForLookup(pod)
queuedPodInfo, exists := unlockedActiveQ.Get(pInfoLookup)
queuedPodInfo, exists := unlockedActiveQ.get(pInfoLookup)
if exists {
return queuedPodInfo.PodInfo
}
@ -1242,7 +1277,7 @@ func (p *PriorityQueue) nominatedPodToInfo(np podRef, unlockedActiveQ unlockedAc
return queuedPodInfo.PodInfo
}
queuedPodInfo, exists = p.podBackoffQ.Get(pInfoLookup)
queuedPodInfo, exists = p.backoffQ.get(pInfoLookup)
if exists {
return queuedPodInfo.PodInfo
}
@ -1276,12 +1311,6 @@ func (p *PriorityQueue) NominatedPodsForNode(nodeName string) []*framework.PodIn
return pods
}
// podsCompareBackoffCompleted is a less-function over queued pods: it reports
// whether pInfo1 completes its backoff strictly before pInfo2 does.
func (p *PriorityQueue) podsCompareBackoffCompleted(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
	first, second := p.getBackoffTime(pInfo1), p.getBackoffTime(pInfo2)
	return second.After(first)
}
// newQueuedPodInfo builds a QueuedPodInfo object.
func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
now := p.clock.Now()
@ -1296,33 +1325,6 @@ func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framew
}
}
// getBackoffTime returns the instant at which podInfo finishes its backoff:
// the pod's Timestamp shifted forward by its calculated backoff duration.
func (p *PriorityQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
	return podInfo.Timestamp.Add(p.calculateBackoffDuration(podInfo))
}
// calculateBackoffDuration derives the backoff duration of a pod from the
// number of scheduling attempts it has made so far.
//
// A pod with zero attempts gets no backoff. Otherwise the duration starts at
// podInitialBackoffDuration and is doubled once for every attempt after the
// first; as soon as another doubling would exceed podMaxBackoffDuration, the
// maximum is returned instead.
func (p *PriorityQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
	if podInfo.Attempts == 0 {
		// Pods that were never attempted carry no backoff penalty.
		return 0
	}

	backoff := p.podInitialBackoffDuration
	maxBackoff := p.podMaxBackoffDuration
	for doublings := podInfo.Attempts - 1; doublings > 0; doublings-- {
		// Compare against the remaining headroom (a subtraction) so that the
		// check itself cannot overflow the way backoff*2 could.
		if backoff > maxBackoff-backoff {
			return maxBackoff
		}
		backoff *= 2
	}
	return backoff
}
// UnschedulablePods holds pods that cannot be scheduled. This data structure
// is used to implement unschedulablePods.
type UnschedulablePods struct {
@ -1335,7 +1337,8 @@ type UnschedulablePods struct {
}
// addOrUpdate adds a pod to the unschedulable podInfoMap.
func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
// The event should show which event triggered the addition and is used for the metric recording.
func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo, event string) {
podID := u.keyFunc(pInfo.Pod)
if _, exists := u.podInfoMap[podID]; !exists {
if pInfo.Gated && u.gatedRecorder != nil {
@ -1343,6 +1346,7 @@ func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
} else if !pInfo.Gated && u.unschedulableRecorder != nil {
u.unschedulableRecorder.Inc()
}
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
}
u.podInfoMap[podID] = pInfo
}

View File

@ -33,6 +33,7 @@ import (
"k8s.io/client-go/tools/cache"
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
corev1nodeaffinity "k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/backend/queue"
@ -160,6 +161,16 @@ func (sched *Scheduler) updatePodInSchedulingQueue(oldObj, newObj interface{}) {
logger.V(4).Info("Update event for unscheduled pod", "pod", klog.KObj(newPod))
sched.SchedulingQueue.Update(logger, oldPod, newPod)
if hasNominatedNodeNameChanged(oldPod, newPod) {
// Nominated node changed in pod, so we need to treat it as if the pod was deleted from the old nominated node,
// because the scheduler treats such a pod as if it was already assigned when scheduling lower or equal priority pods.
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodDelete, oldPod, nil, getLEPriorityPreCheck(corev1helpers.PodPriority(oldPod)))
}
}
// hasNominatedNodeNameChanged reports whether oldPod carried a non-empty
// nominated node name that differs from the one set on newPod.
// A pod that had no nominated node before is never reported as changed.
func hasNominatedNodeNameChanged(oldPod, newPod *v1.Pod) bool {
	previous := oldPod.Status.NominatedNodeName
	if previous == "" {
		return false
	}
	return previous != newPod.Status.NominatedNodeName
}
func (sched *Scheduler) deletePodFromSchedulingQueue(obj interface{}) {
@ -195,8 +206,21 @@ func (sched *Scheduler) deletePodFromSchedulingQueue(obj interface{}) {
// If a waiting pod is rejected, it indicates it's previously assumed and we're
// removing it from the scheduler cache. In this case, signal a AssignedPodDelete
// event to immediately retry some unscheduled Pods.
// Similarly when a pod that had nominated node is deleted, it can unblock scheduling of other pods,
// because the lower or equal priority pods treat such a pod as if it was assigned.
if fwk.RejectWaitingPod(pod.UID) {
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodDelete, pod, nil, nil)
} else if pod.Status.NominatedNodeName != "" {
// Note that a nominated pod can fall into `RejectWaitingPod` case as well,
// but in that case the `MoveAllToActiveOrBackoffQueue` already covered lower priority pods.
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodDelete, pod, nil, getLEPriorityPreCheck(corev1helpers.PodPriority(pod)))
}
}
// getLEPriorityPreCheck builds a PreEnqueueCheck that accepts only pods whose
// priority is lower than or equal to the given priority.
func getLEPriorityPreCheck(priority int32) queue.PreEnqueueCheck {
	return func(pod *v1.Pod) bool {
		podPriority := corev1helpers.PodPriority(pod)
		return priority >= podPriority
	}
}
@ -343,6 +367,7 @@ func addAllEventHandlers(
informerFactory informers.SharedInformerFactory,
dynInformerFactory dynamicinformer.DynamicSharedInformerFactory,
resourceClaimCache *assumecache.AssumeCache,
resourceSliceTracker *resourceslicetracker.Tracker,
gvkMap map[framework.EventResource]framework.ActionType,
) error {
var (
@ -532,7 +557,7 @@ func addAllEventHandlers(
}
case framework.ResourceSlice:
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
if handlerRegistration, err = informerFactory.Resource().V1beta1().ResourceSlices().Informer().AddEventHandler(
if handlerRegistration, err = resourceSliceTracker.AddEventHandler(
buildEvtResHandler(at, framework.ResourceSlice),
); err != nil {
return err

View File

@ -31,6 +31,8 @@ const (
ScheduleAttemptFailure = "ScheduleAttemptFailure"
// BackoffComplete is the event when a pod finishes backoff.
BackoffComplete = "BackoffComplete"
// PopFromBackoffQ is the event when a pod is popped from backoffQ when activeQ is empty.
PopFromBackoffQ = "PopFromBackoffQ"
// ForceActivate is the event when a pod is moved from unschedulablePods/backoffQ
// to activeQ. Usually it's triggered by plugin implementations.
ForceActivate = "ForceActivate"
@ -130,7 +132,7 @@ func extractPodTolerationChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
// Due to API validation, the user can add, but cannot modify or remove tolerations.
// So, it's enough to just check the length of tolerations to notice the update.
// And, any updates in tolerations could make Pod schedulable.
return UpdatePodTolerations
return UpdatePodToleration
}
return none

View File

@ -26,8 +26,8 @@ import (
"sync"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/google/go-cmp/cmp" //nolint:depguard
"github.com/google/go-cmp/cmp/cmpopts" //nolint:depguard
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
@ -227,8 +227,8 @@ const (
// Pending means that the scheduling process is finished successfully,
// but the plugin wants to stop the scheduling cycle/binding cycle here.
//
// For example, the DRA plugin sometimes needs to wait for the external device driver
// to provision the resource for the Pod.
// For example, your plugin may have to report the scheduling result to an external component
// and wait for it to complete something **before** binding.
// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
// because in this case, the scheduler decides where the Pod can go successfully,
// but we need to wait for the external component to do something based on that scheduling result.
@ -609,7 +609,7 @@ type ScorePlugin interface {
// Score is called on each filtered node. It must return success and an integer
// indicating the rank of the node. All scoring plugins must return success or
// the pod will be rejected.
Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)
Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeInfo *NodeInfo) (int64, *Status)
// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if does not.
ScoreExtensions() ScoreExtensions

View File

@ -50,8 +50,13 @@ type SharedLister interface {
// ResourceSliceLister can be used to obtain ResourceSlices.
type ResourceSliceLister interface {
// List returns a list of all ResourceSlices.
List() ([]*resourceapi.ResourceSlice, error)
// ListWithDeviceTaintRules returns a list of all ResourceSlices with DeviceTaintRules applied
// if the DRADeviceTaints feature is enabled, otherwise without them.
//
// k8s.io/dynamic-resource-allocation/resourceslice/tracker provides an implementation
// of the necessary logic. That tracker can be instantiated as a replacement for
// a normal ResourceSlice informer and provides a ListPatchedResourceSlices method.
ListWithDeviceTaintRules() ([]*resourceapi.ResourceSlice, error)
}
// DeviceClassLister can be used to obtain DeviceClasses.

View File

@ -51,15 +51,28 @@ func chunkSizeFor(n, parallelism int) int {
return s
}
// numWorkersForChunkSize returns the number of workers (goroutines)
// that will be created in workqueue.ParallelizeUntil
// for the given parallelism, pieces and chunkSize values.
//
// The work is split into ceil(pieces/chunkSize) chunks and at most one worker
// per chunk is started, capped at parallelism. A chunkSize below 1 is treated
// as 1, matching how workqueue.ParallelizeUntil normalizes its chunk size;
// this also keeps the division below from panicking on a zero chunkSize.
func numWorkersForChunkSize(parallelism, pieces, chunkSize int) int {
	if chunkSize < 1 {
		chunkSize = 1
	}
	// Ceiling division: a partially filled last chunk still needs a worker.
	chunks := (pieces + chunkSize - 1) / chunkSize
	if chunks < parallelism {
		return chunks
	}
	return parallelism
}
// Until is a wrapper around workqueue.ParallelizeUntil to use in scheduling algorithms.
// A given operation will be a label that is recorded in the goroutine metric.
func (p Parallelizer) Until(ctx context.Context, pieces int, doWorkPiece workqueue.DoWorkPieceFunc, operation string) {
goroutinesMetric := metrics.Goroutines.WithLabelValues(operation)
withMetrics := func(piece int) {
goroutinesMetric.Inc()
doWorkPiece(piece)
goroutinesMetric.Dec()
}
chunkSize := chunkSizeFor(pieces, p.parallelism)
workers := numWorkersForChunkSize(p.parallelism, pieces, chunkSize)
workqueue.ParallelizeUntil(ctx, p.parallelism, pieces, withMetrics, workqueue.WithChunkSize(chunkSizeFor(pieces, p.parallelism)))
goroutinesMetric := metrics.Goroutines.WithLabelValues(operation)
// Calling single Add with workers' count is more efficient than calling Inc or Dec per each work piece.
// This approach improves performance of some plugins (affinity, topology spreading) as well as preemption.
goroutinesMetric.Add(float64(workers))
defer goroutinesMetric.Add(float64(-workers))
workqueue.ParallelizeUntil(ctx, p.parallelism, pieces, doWorkPiece, workqueue.WithChunkSize(chunkSize))
}

View File

@ -136,10 +136,14 @@ func (pl *DefaultPreemption) calculateNumCandidates(numNodes int32) int32 {
return n
}
// getOffsetRand is a dedicated random source for GetOffsetAndNumCandidates calls.
// It defaults to rand.Int31n, but is a package variable so it can be overridden to make unit tests deterministic.
var getOffsetRand = rand.Int31n
// GetOffsetAndNumCandidates chooses a random offset and calculates the number
// of candidates that should be shortlisted for dry running preemption.
func (pl *DefaultPreemption) GetOffsetAndNumCandidates(numNodes int32) (int32, int32) {
return rand.Int31n(numNodes), pl.calculateNumCandidates(numNodes)
return getOffsetRand(numNodes), pl.calculateNumCandidates(numNodes)
}
// This function is not applicable for out-of-tree preemption plugins that exercise

View File

@ -27,6 +27,7 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/informers"
resourcelisters "k8s.io/client-go/listers/resource/v1beta1"
resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker"
"k8s.io/dynamic-resource-allocation/structured"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
@ -44,8 +45,9 @@ type DefaultDRAManager struct {
deviceClassLister *deviceClassLister
}
func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, informerFactory informers.SharedInformerFactory) *DefaultDRAManager {
func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, resourceSliceTracker *resourceslicetracker.Tracker, informerFactory informers.SharedInformerFactory) *DefaultDRAManager {
logger := klog.FromContext(ctx)
manager := &DefaultDRAManager{
resourceClaimTracker: &claimTracker{
cache: claimsCache,
@ -53,7 +55,7 @@ func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, in
allocatedDevices: newAllocatedDevices(logger),
logger: logger,
},
resourceSliceLister: &resourceSliceLister{sliceLister: informerFactory.Resource().V1beta1().ResourceSlices().Lister()},
resourceSliceLister: &resourceSliceLister{tracker: resourceSliceTracker},
deviceClassLister: &deviceClassLister{classLister: informerFactory.Resource().V1beta1().DeviceClasses().Lister()},
}
@ -79,11 +81,11 @@ func (s *DefaultDRAManager) DeviceClasses() framework.DeviceClassLister {
var _ framework.ResourceSliceLister = &resourceSliceLister{}
type resourceSliceLister struct {
sliceLister resourcelisters.ResourceSliceLister
tracker *resourceslicetracker.Tracker
}
func (l *resourceSliceLister) List() ([]*resourceapi.ResourceSlice, error) {
return l.sliceLister.List(labels.Everything())
func (l *resourceSliceLister) ListWithDeviceTaintRules() ([]*resourceapi.ResourceSlice, error) {
return l.tracker.ListPatchedResourceSlices()
}
var _ framework.DeviceClassLister = &deviceClassLister{}

View File

@ -21,9 +21,10 @@ import (
"errors"
"fmt"
"slices"
"strings"
"sync"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp" //nolint:depguard
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
@ -101,9 +102,12 @@ type informationForClaim struct {
// DynamicResources is a plugin that ensures that ResourceClaims are allocated.
type DynamicResources struct {
enabled bool
enableAdminAccess bool
enableSchedulingQueueHint bool
enabled bool
enableAdminAccess bool
enablePrioritizedList bool
enableSchedulingQueueHint bool
enablePartitionableDevices bool
enableDeviceTaints bool
fh framework.Handle
clientset kubernetes.Interface
@ -119,9 +123,12 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
}
pl := &DynamicResources{
enabled: true,
enableAdminAccess: fts.EnableDRAAdminAccess,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
enabled: true,
enableAdminAccess: fts.EnableDRAAdminAccess,
enableDeviceTaints: fts.EnableDRADeviceTaints,
enablePrioritizedList: fts.EnableDRAPrioritizedList,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
enablePartitionableDevices: fts.EnablePartitionableDevices,
fh: fh,
clientset: fh.ClientSet(),
@ -176,7 +183,7 @@ func (pl *DynamicResources) EventsToRegister(_ context.Context) ([]framework.Clu
// A pod might be waiting for a class to get created or modified.
{Event: framework.ClusterEvent{Resource: framework.DeviceClass, ActionType: framework.Add | framework.Update}},
// Adding or updating a ResourceSlice might make a pod schedulable because new resources became available.
{Event: framework.ClusterEvent{Resource: framework.ResourceSlice, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterResourceSliceChange},
{Event: framework.ClusterEvent{Resource: framework.ResourceSlice, ActionType: framework.Add | framework.Update}},
}
return events, nil
@ -288,38 +295,6 @@ func (pl *DynamicResources) isSchedulableAfterPodChange(logger klog.Logger, pod
return framework.Queue, nil
}
// isSchedulableAfterResourceSliceChange is the queueing hint for add and update
// ResourceSlice events reported by an informer. A new or modified slice can turn an
// unschedulable pod into a schedulable one when the pod requests a device and the
// change provides a suitable one.
//
// To stay fast and avoid duplicating allocation logic, the hint only looks at
// whether the pod uses claims; every more detailed check is left to the next
// scheduling attempt.
//
// It returns Queue together with an error when the event objects are not
// ResourceSlices, QueueSkip when foreachPodResourceClaim rejects the pod, and
// Queue otherwise.
func (pl *DynamicResources) isSchedulableAfterResourceSliceChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	_, slice, convErr := schedutil.As[*resourceapi.ResourceSlice](oldObj, newObj)
	if convErr != nil {
		// Shouldn't happen.
		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterResourceSliceChange: %w", convErr)
	}

	claimErr := pl.foreachPodResourceClaim(pod, nil)
	if claimErr == nil {
		// We could inspect what exactly changed in the slice, or whether all
		// claims use classic DRA, but neither seems worth it: simply assume
		// that the changed slice may make the pod schedulable.
		logger.V(5).Info("ResourceSlice change might make pod schedulable", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(slice))
		return framework.Queue, nil
	}

	// Not an unexpected error: foreachPodResourceClaim is known to only
	// return errors for "not schedulable".
	logger.V(6).Info("pod is not schedulable after resource slice change", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(slice), "reason", claimErr.Error())
	return framework.QueueSkip, nil
}
// podResourceClaims returns the ResourceClaims for all pod.Spec.PodResourceClaims.
func (pl *DynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourceapi.ResourceClaim, error) {
claims := make([]*resourceapi.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
@ -437,20 +412,22 @@ func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
// initial set of potential nodes before we ask the
// driver(s) for information about the specific pod.
for _, request := range claim.Spec.Devices.Requests {
if request.DeviceClassName == "" {
return nil, statusError(logger, fmt.Errorf("request %s: unsupported request type", request.Name))
}
_, err := pl.draManager.DeviceClasses().Get(request.DeviceClassName)
if err != nil {
// If the class cannot be retrieved, allocation cannot proceed.
if apierrors.IsNotFound(err) {
// Here we mark the pod as "unschedulable", so it'll sleep in
// the unscheduleable queue until a DeviceClass event occurs.
return nil, statusUnschedulable(logger, fmt.Sprintf("request %s: device class %s does not exist", request.Name, request.DeviceClassName))
// The requirements differ depending on whether the request has a list of
// alternative subrequests defined in the firstAvailable field.
if len(request.FirstAvailable) == 0 {
if status := pl.validateDeviceClass(logger, request.DeviceClassName, request.Name); status != nil {
return nil, status
}
} else {
if !pl.enablePrioritizedList {
return nil, statusUnschedulable(logger, fmt.Sprintf("resource claim %s, request %s: has subrequests, but the DRAPrioritizedList feature is disabled", klog.KObj(claim), request.Name))
}
for _, subRequest := range request.FirstAvailable {
qualRequestName := strings.Join([]string{request.Name, subRequest.Name}, "/")
if status := pl.validateDeviceClass(logger, subRequest.DeviceClassName, qualRequestName); status != nil {
return nil, status
}
}
// Other error, retry with backoff.
return nil, statusError(logger, fmt.Errorf("request %s: look up device class: %w", request.Name, err))
}
}
}
@ -475,11 +452,17 @@ func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
if err != nil {
return nil, statusError(logger, err)
}
slices, err := pl.draManager.ResourceSlices().List()
slices, err := pl.draManager.ResourceSlices().ListWithDeviceTaintRules()
if err != nil {
return nil, statusError(logger, err)
}
allocator, err := structured.NewAllocator(ctx, pl.enableAdminAccess, allocateClaims, allAllocatedDevices, pl.draManager.DeviceClasses(), slices, pl.celCache)
features := structured.Features{
AdminAccess: pl.enableAdminAccess,
PrioritizedList: pl.enablePrioritizedList,
PartitionableDevices: pl.enablePartitionableDevices,
DeviceTaints: pl.enableDeviceTaints,
}
allocator, err := structured.NewAllocator(ctx, features, allocateClaims, allAllocatedDevices, pl.draManager.DeviceClasses(), slices, pl.celCache)
if err != nil {
return nil, statusError(logger, err)
}
@ -491,6 +474,23 @@ func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.Cycl
return nil, nil
}
// validateDeviceClass checks that the device class referenced by a request
// (or subrequest) is named and can be looked up through the DRA manager.
//
// deviceClassName is the class to look up; requestName is only used to
// identify the request in the returned status message.
//
// It returns nil when the class was retrieved successfully. Otherwise it returns:
//   - a statusError when deviceClassName is empty (unsupported request type),
//   - a statusUnschedulable when the class does not exist,
//   - a statusError for any other lookup failure.
func (pl *DynamicResources) validateDeviceClass(logger klog.Logger, deviceClassName, requestName string) *framework.Status {
	if deviceClassName == "" {
		return statusError(logger, fmt.Errorf("request %s: unsupported request type", requestName))
	}

	if _, err := pl.draManager.DeviceClasses().Get(deviceClassName); err != nil {
		// If the class cannot be retrieved, allocation cannot proceed.
		if apierrors.IsNotFound(err) {
			// Here we mark the pod as "unschedulable", so it'll sleep in
			// the unschedulable queue until a DeviceClass event occurs.
			return statusUnschedulable(logger, fmt.Sprintf("request %s: device class %s does not exist", requestName, deviceClassName))
		}
		// Other error, retry with backoff. Without this branch a failed
		// lookup would be silently treated as "class is valid".
		return statusError(logger, fmt.Errorf("request %s: look up device class: %w", requestName, err))
	}
	return nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *DynamicResources) PreFilterExtensions() framework.PreFilterExtensions {
return nil
@ -608,6 +608,11 @@ func (pl *DynamicResources) PostFilter(ctx context.Context, cs *framework.CycleS
if !pl.enabled {
return nil, framework.NewStatus(framework.Unschedulable, "plugin disabled")
}
// If a Pod doesn't have any resource claims attached to it, there is no need for further processing.
// Thus we provide a fast path for this case to avoid unnecessary computations.
if len(pod.Spec.ResourceClaims) == 0 {
return nil, framework.NewStatus(framework.Unschedulable)
}
logger := klog.FromContext(ctx)
state, err := getStateData(cs)
if err != nil {

View File

@ -20,9 +20,12 @@ package feature
// This struct allows us to break the dependency of the plugins on
// the internal k8s features pkg.
type Features struct {
EnableDRAPrioritizedList bool
EnableDRAAdminAccess bool
EnableDRADeviceTaints bool
EnableDynamicResourceAllocation bool
EnableVolumeCapacityPriority bool
EnableVolumeAttributesClass bool
EnableCSIMigrationPortworx bool
EnableNodeInclusionPolicyInPodTopologySpread bool
EnableMatchLabelKeysInPodTopologySpread bool
EnableInPlacePodVerticalScaling bool
@ -30,4 +33,6 @@ type Features struct {
EnableSchedulingQueueHint bool
EnableAsyncPreemption bool
EnablePodLevelResources bool
EnablePartitionableDevices bool
EnableStorageCapacityScoring bool
}

View File

@ -18,7 +18,6 @@ package imagelocality
import (
"context"
"fmt"
"strings"
v1 "k8s.io/api/core/v1"
@ -51,12 +50,7 @@ func (pl *ImageLocality) Name() string {
}
// Score invoked at the score extension point.
func (pl *ImageLocality) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
func (pl *ImageLocality) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
nodeInfos, err := pl.handle.SnapshotSharedLister().NodeInfos().List()
if err != nil {
return 0, framework.AsStatus(err)

View File

@ -89,15 +89,21 @@ type topologyPair struct {
}
type topologyToMatchedTermCount map[topologyPair]int64
func (m topologyToMatchedTermCount) append(toAppend topologyToMatchedTermCount) {
for pair := range toAppend {
m[pair] += toAppend[pair]
func (m topologyToMatchedTermCount) merge(toMerge topologyToMatchedTermCount) {
for pair, count := range toMerge {
m[pair] += count
}
}
// mergeWithList folds a topologyToMatchedTermCountList into m: the count of
// every list entry is added to m's counter for that entry's topology pair.
// Pairs that occur several times in the list are accumulated.
func (m topologyToMatchedTermCount) mergeWithList(toMerge topologyToMatchedTermCountList) {
	for i := range toMerge {
		entry := &toMerge[i]
		m[entry.topologyPair] += entry.count
	}
}
func (m topologyToMatchedTermCount) clone() topologyToMatchedTermCount {
copy := make(topologyToMatchedTermCount, len(m))
copy.append(m)
copy.merge(m)
return copy
}
@ -134,6 +140,48 @@ func (m topologyToMatchedTermCount) updateWithAntiAffinityTerms(terms []framewor
}
}
// topologyToMatchedTermCountList is a slice equivalent of topologyToMatchedTermCount map.
// The use of slice improves the performance of PreFilter,
// especially due to faster iteration when merging than with topologyToMatchedTermCount.
// Entries are only ever appended; the same topology pair may therefore appear
// more than once, and the counts are summed when the list is merged into a
// topologyToMatchedTermCount.
type topologyToMatchedTermCountList []topologyPairCount
// topologyPairCount is a single entry of a topologyToMatchedTermCountList:
// a topology key/value pair together with the count recorded for it.
type topologyPairCount struct {
// topologyPair identifies the topology domain (label key and value) of the entry.
topologyPair topologyPair
// count is the amount added for topologyPair when the list is merged.
count int64
}
// append records value for the topology pair formed by the key tk and the
// node's label value for that key. Nodes that do not carry the label tk are
// ignored and leave the list untouched.
func (m *topologyToMatchedTermCountList) append(node *v1.Node, tk string, value int64) {
	topologyValue, found := node.Labels[tk]
	if !found {
		return
	}
	*m = append(*m, topologyPairCount{
		topologyPair: topologyPair{key: tk, value: topologyValue},
		count:        value,
	})
}
// appendWithAffinityTerms appends the specified value to the
// topologyToMatchedTermCountList once per affinity term, but only if the
// target pod matches ALL of the terms. Each entry uses the term's topology
// key and the corresponding label value of node.
func (m *topologyToMatchedTermCountList) appendWithAffinityTerms(
	terms []framework.AffinityTerm, pod *v1.Pod, node *v1.Node, value int64) {
	if !podMatchesAllAffinityTerms(terms, pod) {
		return
	}
	for i := range terms {
		m.append(node, terms[i].TopologyKey, value)
	}
}
// appendWithAntiAffinityTerms appends the specified value to the
// topologyToMatchedTermCountList for every anti-affinity term that matches
// the target pod (evaluated together with nsLabels). Terms are checked
// independently of each other; non-matching terms contribute nothing.
func (m *topologyToMatchedTermCountList) appendWithAntiAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, value int64) {
	for _, term := range terms {
		if !term.Matches(pod, nsLabels) {
			continue
		}
		m.append(node, term.TopologyKey, value)
	}
}
// returns true IFF the given pod matches all the given terms.
func podMatchesAllAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod) bool {
if len(terms) == 0 {
@ -153,25 +201,26 @@ func podMatchesAllAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod) boo
// 1. Whether it has PodAntiAffinity
// 2. Whether any AntiAffinityTerm matches the incoming pod
func (pl *InterPodAffinity) getExistingAntiAffinityCounts(ctx context.Context, pod *v1.Pod, nsLabels labels.Set, nodes []*framework.NodeInfo) topologyToMatchedTermCount {
topoMaps := make([]topologyToMatchedTermCount, len(nodes))
antiAffinityCountsList := make([]topologyToMatchedTermCountList, len(nodes))
index := int32(-1)
processNode := func(i int) {
nodeInfo := nodes[i]
node := nodeInfo.Node()
topoMap := make(topologyToMatchedTermCount)
antiAffinityCounts := make(topologyToMatchedTermCountList, 0)
for _, existingPod := range nodeInfo.PodsWithRequiredAntiAffinity {
topoMap.updateWithAntiAffinityTerms(existingPod.RequiredAntiAffinityTerms, pod, nsLabels, node, 1)
antiAffinityCounts.appendWithAntiAffinityTerms(existingPod.RequiredAntiAffinityTerms, pod, nsLabels, node, 1)
}
if len(topoMap) != 0 {
topoMaps[atomic.AddInt32(&index, 1)] = topoMap
if len(antiAffinityCounts) != 0 {
antiAffinityCountsList[atomic.AddInt32(&index, 1)] = antiAffinityCounts
}
}
pl.parallelizer.Until(ctx, len(nodes), processNode, pl.Name())
result := make(topologyToMatchedTermCount)
// Traditional for loop is slightly faster in this case than its "for range" equivalent.
for i := 0; i <= int(index); i++ {
result.append(topoMaps[i])
result.mergeWithList(antiAffinityCountsList[i])
}
return result
@ -188,20 +237,20 @@ func (pl *InterPodAffinity) getIncomingAffinityAntiAffinityCounts(ctx context.Co
return affinityCounts, antiAffinityCounts
}
affinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
antiAffinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
affinityCountsList := make([]topologyToMatchedTermCountList, len(allNodes))
antiAffinityCountsList := make([]topologyToMatchedTermCountList, len(allNodes))
index := int32(-1)
processNode := func(i int) {
nodeInfo := allNodes[i]
node := nodeInfo.Node()
affinity := make(topologyToMatchedTermCount)
antiAffinity := make(topologyToMatchedTermCount)
affinity := make(topologyToMatchedTermCountList, 0)
antiAffinity := make(topologyToMatchedTermCountList, 0)
for _, existingPod := range nodeInfo.Pods {
affinity.updateWithAffinityTerms(podInfo.RequiredAffinityTerms, existingPod.Pod, node, 1)
affinity.appendWithAffinityTerms(podInfo.RequiredAffinityTerms, existingPod.Pod, node, 1)
// The incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
antiAffinity.updateWithAntiAffinityTerms(podInfo.RequiredAntiAffinityTerms, existingPod.Pod, nil, node, 1)
antiAffinity.appendWithAntiAffinityTerms(podInfo.RequiredAntiAffinityTerms, existingPod.Pod, nil, node, 1)
}
if len(affinity) > 0 || len(antiAffinity) > 0 {
@ -213,8 +262,8 @@ func (pl *InterPodAffinity) getIncomingAffinityAntiAffinityCounts(ctx context.Co
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
for i := 0; i <= int(index); i++ {
affinityCounts.append(affinityCountsList[i])
antiAffinityCounts.append(antiAffinityCountsList[i])
affinityCounts.mergeWithList(affinityCountsList[i])
antiAffinityCounts.mergeWithList(antiAffinityCountsList[i])
}
return affinityCounts, antiAffinityCounts

View File

@ -211,7 +211,7 @@ func (pl *InterPodAffinity) isSchedulableAfterPodChange(logger klog.Logger, pod
}
func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
@ -221,11 +221,35 @@ func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod
return framework.Queue, err
}
// When queuing this Pod:
// - 1. A new node is added with the pod affinity topologyKey, the pod may become schedulable.
// - 2. The original node does not have the pod affinity topologyKey but the modified node does, the pod may become schedulable.
// - 3. Both the original and modified nodes have the pod affinity topologyKey and they differ, the pod may become schedulable.
for _, term := range terms {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
logger.V(5).Info("a node with matched pod affinity topologyKey was added/updated and it may make pod schedulable",
if originalNode == nil {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
// Case 1: A new node is added with the pod affinity topologyKey.
logger.V(5).Info("A node with a matched pod affinity topologyKey was added and it may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
continue
}
originalTopologyValue, originalHasKey := originalNode.Labels[term.TopologyKey]
modifiedTopologyValue, modifiedHasKey := modifiedNode.Labels[term.TopologyKey]
if !originalHasKey && modifiedHasKey {
// Case 2: Original node does not have the pod affinity topologyKey, but the modified node does.
logger.V(5).Info("A node got updated to have the topology key of pod affinity, which may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, err
return framework.Queue, nil
}
if originalHasKey && modifiedHasKey && (originalTopologyValue != modifiedTopologyValue) {
// Case 3: Both nodes have the pod affinity topologyKey, but the values differ.
logger.V(5).Info("A node is moved to a different domain of pod affinity, which may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
}
@ -234,11 +258,39 @@ func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod
return framework.Queue, err
}
// When queuing this Pod:
// - 1. A new node is added, the pod may become schedulable.
// - 2. The original node has the pod anti-affinity topologyKey but the modified node does not, the pod may become schedulable.
// - 3. Both the original and modified nodes have the pod anti-affinity topologyKey and they differ, the pod may become schedulable.
for _, term := range antiTerms {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
logger.V(5).Info("a node with matched pod anti-affinity topologyKey was added/updated and it may make pod schedulable",
if originalNode == nil {
// Case 1: A new node is added.
// We always requeue the Pod with anti-affinity because:
// - the node without the topology key is always allowed to have a Pod with anti-affinity.
// - the addition of a node with the topology key makes Pods schedulable only when the topology it joins doesn't have any Pods that the Pod hates.
// But, it's out-of-scope of this QHint to check which Pods are in the topology this Node is in.
logger.V(5).Info("A node was added and it may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, err
return framework.Queue, nil
}
originalTopologyValue, originalHasKey := originalNode.Labels[term.TopologyKey]
modifiedTopologyValue, modifiedHasKey := modifiedNode.Labels[term.TopologyKey]
if originalHasKey && !modifiedHasKey {
// Case 2: The original node has the pod anti-affinity topologyKey but the modified node does not.
// Note that we don't need to check the opposite case (!originalHasKey && modifiedHasKey)
// because the node without the topology label can always accept pods with pod anti-affinity.
logger.V(5).Info("A node got updated to not have the topology key of pod anti-affinity, which may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
if originalHasKey && modifiedHasKey && (originalTopologyValue != modifiedTopologyValue) {
// Case 3: Both nodes have the pod anti-affinity topologyKey, but the values differ.
logger.V(5).Info("A node is moved to a different domain of pod anti-affinity, which may make the pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
}
logger.V(5).Info("a node is added/updated but doesn't have any topologyKey which matches pod affinity/anti-affinity",

View File

@ -130,10 +130,6 @@ func (pl *InterPodAffinity) PreScore(
pod *v1.Pod,
nodes []*framework.NodeInfo,
) *framework.Status {
if len(nodes) == 0 {
// No nodes to score.
return framework.NewStatus(framework.Skip)
}
if pl.sharedLister == nil {
return framework.NewStatus(framework.Error, "empty shared lister in InterPodAffinity PreScore")
@ -240,11 +236,7 @@ func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error)
// The "score" returned in this function is the sum of weights got from cycleState which have its topologyKey matching with the node's labels.
// it is normalized later.
// Note: the returned "score" is positive for pod-affinity, and negative for pod-antiaffinity.
func (pl *InterPodAffinity) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("failed to get node %q from Snapshot: %w", nodeName, err))
}
func (pl *InterPodAffinity) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
node := nodeInfo.Node()
s, err := getPreScoreState(cycleState)

View File

@ -238,9 +238,6 @@ func (s *preScoreState) Clone() framework.StateData {
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *NodeAffinity) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
if len(nodes) == 0 {
return nil
}
preferredNodeAffinity, err := getPodPreferredNodeAffinity(pod)
if err != nil {
return framework.AsStatus(err)
@ -259,12 +256,7 @@ func (pl *NodeAffinity) PreScore(ctx context.Context, cycleState *framework.Cycl
// Score returns the sum of the weights of the terms that match the Node.
// Terms came from the Pod .spec.affinity.nodeAffinity and from the plugin's
// default affinity.
func (pl *NodeAffinity) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
func (pl *NodeAffinity) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
node := nodeInfo.Node()
var count int64

View File

@ -143,7 +143,7 @@ func (pl *NodePorts) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Po
}
// If the deleted pod is unscheduled, it doesn't make the target pod schedulable.
if deletedPod.Spec.NodeName == "" {
if deletedPod.Spec.NodeName == "" && deletedPod.Status.NominatedNodeName == "" {
logger.V(4).Info("the deleted pod is unscheduled and it doesn't make the target pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}

View File

@ -63,8 +63,15 @@ func (s *balancedAllocationPreScoreState) Clone() framework.StateData {
// PreScore calculates incoming pod's resource requests and writes them to the cycle state used.
func (ba *BalancedAllocation) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
podRequests := ba.calculatePodResourceRequestList(pod, ba.resources)
if ba.isBestEffortPod(podRequests) {
// Skip BalancedAllocation scoring for best-effort pods to
// prevent a large number of pods from being scheduled to the same node.
// See https://github.com/kubernetes/kubernetes/issues/129138 for details.
return framework.NewStatus(framework.Skip)
}
state := &balancedAllocationPreScoreState{
podRequests: ba.calculatePodResourceRequestList(pod, ba.resources),
podRequests: podRequests,
}
cycleState.Write(balancedAllocationPreScoreStateKey, state)
return nil
@ -89,15 +96,13 @@ func (ba *BalancedAllocation) Name() string {
}
// Score invoked at the score extension point.
func (ba *BalancedAllocation) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := ba.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
func (ba *BalancedAllocation) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
s, err := getBalancedAllocationPreScoreState(state)
if err != nil {
s = &balancedAllocationPreScoreState{podRequests: ba.calculatePodResourceRequestList(pod, ba.resources)}
if ba.isBestEffortPod(s.podRequests) {
return 0, nil
}
}
// ba.score favors nodes with balanced resource usage rate.
@ -127,10 +132,12 @@ func NewBalancedAllocation(_ context.Context, baArgs runtime.Object, h framework
return &BalancedAllocation{
handle: h,
resourceAllocationScorer: resourceAllocationScorer{
Name: BalancedAllocationName,
scorer: balancedResourceScorer,
useRequested: true,
resources: args.Resources,
Name: BalancedAllocationName,
enableInPlacePodVerticalScaling: fts.EnableInPlacePodVerticalScaling,
enablePodLevelResources: fts.EnablePodLevelResources,
scorer: balancedResourceScorer,
useRequested: true,
resources: args.Resources,
},
}, nil
}
@ -157,7 +164,6 @@ func balancedResourceScorer(requested, allocable []int64) int64 {
// Otherwise, set the std to zero is enough.
if len(resourceToFractions) == 2 {
std = math.Abs((resourceToFractions[0] - resourceToFractions[1]) / 2)
} else if len(resourceToFractions) > 2 {
mean := totalFraction / float64(len(resourceToFractions))
var sum float64

View File

@ -21,7 +21,7 @@ import (
"fmt"
"strings"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp" //nolint:depguard
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
@ -294,7 +294,7 @@ func (f *Fit) isSchedulableAfterPodEvent(logger klog.Logger, pod *v1.Pod, oldObj
}
if modifiedPod == nil {
if originalPod.Spec.NodeName == "" {
if originalPod.Spec.NodeName == "" && originalPod.Status.NominatedNodeName == "" {
logger.V(5).Info("the deleted pod was unscheduled and it wouldn't make the unscheduled pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.QueueSkip, nil
}
@ -579,12 +579,7 @@ func fitsRequest(podRequest *preFilterState, nodeInfo *framework.NodeInfo, ignor
}
// Score invoked at the Score extension point.
func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := f.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
s, err := getPreScoreState(state)
if err != nil {
s = &preScoreState{

View File

@ -21,11 +21,9 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
resourcehelper "k8s.io/component-helpers/resource"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
@ -36,7 +34,9 @@ type scorer func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer
// resourceAllocationScorer contains information to calculate resource allocation score.
type resourceAllocationScorer struct {
Name string
Name string
enableInPlacePodVerticalScaling bool
enablePodLevelResources bool
// used to decide whether to use Requested or NonZeroRequested for
// cpu and memory.
useRequested bool
@ -118,9 +118,9 @@ func (r *resourceAllocationScorer) calculateResourceAllocatableRequest(logger kl
func (r *resourceAllocationScorer) calculatePodResourceRequest(pod *v1.Pod, resourceName v1.ResourceName) int64 {
opts := resourcehelper.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
UseStatusResources: r.enableInPlacePodVerticalScaling,
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
SkipPodLevelResources: !r.enablePodLevelResources,
}
if !r.useRequested {
@ -146,3 +146,12 @@ func (r *resourceAllocationScorer) calculatePodResourceRequestList(pod *v1.Pod,
}
return podRequests
}
// isBestEffortPod reports whether every entry of podRequests is zero, i.e.
// the pod requests none of the scored resources. An empty list is treated as
// best-effort as well.
func (r *resourceAllocationScorer) isBestEffortPod(podRequests []int64) bool {
	for i := range podRequests {
		if podRequests[i] != 0 {
			return false
		}
	}
	return true
}

View File

@ -17,7 +17,7 @@ limitations under the License.
package noderesources
import (
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/google/go-cmp/cmp/cmpopts" //nolint:depguard
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"

View File

@ -68,7 +68,7 @@ func (pl *NodeUnschedulable) EventsToRegister(_ context.Context) ([]framework.Cl
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodToleration}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
}, nil
}

View File

@ -63,7 +63,8 @@ type CSILimits struct {
scLister storagelisters.StorageClassLister
vaLister storagelisters.VolumeAttachmentLister
randomVolumeIDPrefix string
enableCSIMigrationPortworx bool
randomVolumeIDPrefix string
translator InTreeToCSITranslator
}
@ -87,9 +88,10 @@ func (pl *CSILimits) EventsToRegister(_ context.Context) ([]framework.ClusterEve
// We don't register any `QueueingHintFn` intentionally
// because any new CSINode could make pods that were rejected by CSI volumes schedulable.
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add}},
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSINodeUpdated},
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}, QueueingHintFn: pl.isSchedulableAfterPVCAdded},
{Event: framework.ClusterEvent{Resource: framework.VolumeAttachment, ActionType: framework.Delete}},
{Event: framework.ClusterEvent{Resource: framework.VolumeAttachment, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterVolumeAttachmentDeleted},
}, nil
}
@ -103,7 +105,7 @@ func (pl *CSILimits) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Po
return framework.QueueSkip, nil
}
if deletedPod.Spec.NodeName == "" {
if deletedPod.Spec.NodeName == "" && deletedPod.Status.NominatedNodeName == "" {
return framework.QueueSkip, nil
}
@ -149,6 +151,85 @@ func (pl *CSILimits) isSchedulableAfterPVCAdded(logger klog.Logger, pod *v1.Pod,
return framework.QueueSkip, nil
}
// isSchedulableAfterVolumeAttachmentDeleted is the queueing hint used for
// VolumeAttachment deletion events.
//
// It returns Queue when the pod has at least one volume that the deleted
// VolumeAttachment could be related to:
//   - any volume backed by a PersistentVolumeClaim, or
//   - an inline volume that the translator reports as migratable and whose
//     translated CSI driver equals the deleted VolumeAttachment's attacher.
//
// Otherwise it returns QueueSkip. On any failure (unexpected object types, or
// a failed inline-to-CSI translation) it returns Queue together with an error.
func (pl *CSILimits) isSchedulableAfterVolumeAttachmentDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	deletedVolumeAttachment, _, err := util.As[*storagev1.VolumeAttachment](oldObj, newObj)
	if err != nil {
		return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterVolumeAttachmentDeleted: %w", err)
	}

	for _, vol := range pod.Spec.Volumes {
		// A PVC-backed volume is not inspected any further here: its mere
		// presence is enough to requeue the pod.
		if vol.PersistentVolumeClaim != nil {
			logger.V(5).Info("Pod volume uses PersistentVolumeClaim, which might make this pod schedulable due to VolumeAttachment deletion", "pod", klog.KObj(pod), "volumeAttachment", klog.KObj(deletedVolumeAttachment), "volume", vol.Name)
			return framework.Queue, nil
		}

		if !pl.translator.IsInlineMigratable(&vol) {
			continue
		}

		translatedPV, err := pl.translator.TranslateInTreeInlineVolumeToCSI(logger, &vol, pod.Namespace)
		if err != nil {
			return framework.Queue, fmt.Errorf("converting volume(%s) from inline to csi: %w", vol.Name, err)
		}
		if translatedPV == nil {
			// There is no underlying error to wrap in this case; wrapping a
			// nil error with %w would render as "%!w(<nil>)" in the message.
			return framework.Queue, fmt.Errorf("converting volume(%s) from inline to csi: translation returned no PersistentVolume", vol.Name)
		}

		if translatedPV.Spec.CSI != nil && deletedVolumeAttachment.Spec.Attacher == translatedPV.Spec.CSI.Driver {
			// deleted VolumeAttachment Attacher matches the translated PV CSI driver
			logger.V(5).Info("Pod volume is an Inline Migratable volume that matches the CSI driver, which might make this pod schedulable due to VolumeAttachment deletion",
				"pod", klog.KObj(pod), "volumeAttachment", klog.KObj(deletedVolumeAttachment),
				"volume", vol.Name, "csiDriver", translatedPV.Spec.CSI.Driver,
			)
			return framework.Queue, nil
		}
	}

	logger.V(5).Info("the VolumeAttachment deletion wouldn't make this pod schedulable because the pod has no volume related to a deleted VolumeAttachment",
		"pod", klog.KObj(pod), "volumeAttachment", klog.KObj(deletedVolumeAttachment))
	return framework.QueueSkip, nil
}
// isSchedulableAfterCSINodeUpdated is the queueing hint used for CSINode
// update events.
//
// It compares the per-driver Allocatable.Count of the old and new CSINode and
// returns Queue as soon as one driver's count in the new object is greater
// than its count in the old object. A driver without Allocatable or without
// Allocatable.Count is treated as having a count of 0, as is a driver that is
// absent from the old object. If no driver's count grew, it returns QueueSkip.
// If the objects are not CSINodes it returns Queue together with an error.
func (pl *CSILimits) isSchedulableAfterCSINodeUpdated(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	oldCSINode, newCSINode, err := util.As[*storagev1.CSINode](oldObj, newObj)
	if err != nil {
		return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterCSINodeUpdated: %w", err)
	}

	// Snapshot the limits advertised before the update, keyed by driver name.
	previous := make(map[string]int32)
	for i := range oldCSINode.Spec.Drivers {
		drv := &oldCSINode.Spec.Drivers[i]
		previous[drv.Name] = 0
		if drv.Allocatable != nil && drv.Allocatable.Count != nil {
			previous[drv.Name] = *drv.Allocatable.Count
		}
	}

	for i := range newCSINode.Spec.Drivers {
		drv := &newCSINode.Spec.Drivers[i]

		var current int32
		if alloc := drv.Allocatable; alloc != nil && alloc.Count != nil {
			current = *alloc.Count
		}

		// A driver that was not listed before reads as 0 from the map.
		before := previous[drv.Name]
		if current <= before {
			continue
		}

		logger.V(5).Info("CSINode driver limit increased, might make this pod schedulable",
			"pod", klog.KObj(pod),
			"driver", drv.Name,
			"oldLimit", before,
			"newLimit", current,
		)
		return framework.Queue, nil
	}

	// No driver limit grew, so this update cannot help the pod.
	return framework.QueueSkip, nil
}
// PreFilter invoked at the prefilter extension point
//
// If the pod doesn't have those types of volumes, we'll skip the Filter phase
@ -339,7 +420,7 @@ func (pl *CSILimits) checkAttachableInlineVolume(logger klog.Logger, vol *v1.Vol
if err != nil {
return fmt.Errorf("looking up provisioner name for volume %s: %w", vol.Name, err)
}
if !isCSIMigrationOn(csiNode, inTreeProvisionerName) {
if !isCSIMigrationOn(csiNode, inTreeProvisionerName, pl.enableCSIMigrationPortworx) {
csiNodeName := ""
if csiNode != nil {
csiNodeName = csiNode.Name
@ -400,7 +481,7 @@ func (pl *CSILimits) getCSIDriverInfo(logger klog.Logger, csiNode *storagev1.CSI
return "", ""
}
if !isCSIMigrationOn(csiNode, pluginName) {
if !isCSIMigrationOn(csiNode, pluginName, pl.enableCSIMigrationPortworx) {
logger.V(5).Info("CSI Migration of plugin is not enabled", "plugin", pluginName)
return "", ""
}
@ -448,7 +529,7 @@ func (pl *CSILimits) getCSIDriverInfoFromSC(logger klog.Logger, csiNode *storage
provisioner := storageClass.Provisioner
if pl.translator.IsMigratableIntreePluginByName(provisioner) {
if !isCSIMigrationOn(csiNode, provisioner) {
if !isCSIMigrationOn(csiNode, provisioner, pl.enableCSIMigrationPortworx) {
logger.V(5).Info("CSI Migration of provisioner is not enabled", "provisioner", provisioner)
return "", ""
}
@ -475,13 +556,14 @@ func NewCSI(_ context.Context, _ runtime.Object, handle framework.Handle, fts fe
csiTranslator := csitrans.New()
return &CSILimits{
csiNodeLister: csiNodesLister,
pvLister: pvLister,
pvcLister: pvcLister,
scLister: scLister,
vaLister: vaLister,
randomVolumeIDPrefix: rand.String(32),
translator: csiTranslator,
csiNodeLister: csiNodesLister,
pvLister: pvLister,
pvcLister: pvcLister,
scLister: scLister,
vaLister: vaLister,
enableCSIMigrationPortworx: fts.EnableCSIMigrationPortworx,
randomVolumeIDPrefix: rand.String(32),
translator: csiTranslator,
}, nil
}

View File

@ -22,14 +22,12 @@ import (
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
csilibplugins "k8s.io/csi-translation-lib/plugins"
"k8s.io/kubernetes/pkg/features"
)
// isCSIMigrationOn returns a boolean value indicating whether
// the CSI migration has been enabled for a particular storage plugin.
func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string) bool {
func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string, enableCSIMigrationPortworx bool) bool {
if csiNode == nil || len(pluginName) == 0 {
return false
}
@ -40,7 +38,7 @@ func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string) bool {
case csilibplugins.AWSEBSInTreePluginName:
return true
case csilibplugins.PortworxVolumePluginName:
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx) {
if !enableCSIMigrationPortworx {
return false
}
case csilibplugins.GCEPDInTreePluginName:

View File

@ -27,11 +27,6 @@ import (
"k8s.io/utils/ptr"
)
type topologyPair struct {
key string
value string
}
// topologySpreadConstraint is an internal version for v1.TopologySpreadConstraint
// and where the selector is parsed.
// Fields are exported for comparison during testing.

View File

@ -19,6 +19,7 @@ package podtopologyspread
import (
"context"
"fmt"
"maps"
"math"
v1 "k8s.io/api/core/v1"
@ -31,7 +32,7 @@ import (
const preFilterStateKey = "PreFilter" + Name
// preFilterState computed at PreFilter and used at Filter.
// It combines TpKeyToCriticalPaths and TpPairToMatchNum to represent:
// It combines CriticalPaths and TpValueToMatchNum to represent:
// (1) critical paths where the least pods are matched on each spread constraint.
// (2) number of pods matched on each spread constraint.
// A nil preFilterState denotes it's not set at all (in PreFilter phase);
@ -39,29 +40,23 @@ const preFilterStateKey = "PreFilter" + Name
// Fields are exported for comparison during testing.
type preFilterState struct {
Constraints []topologySpreadConstraint
// We record 2 critical paths instead of all critical paths here.
// criticalPaths[0].MatchNum always holds the minimum matching number.
// criticalPaths[1].MatchNum is always greater or equal to criticalPaths[0].MatchNum, but
// CriticalPaths is a slice indexed by constraint index.
// Per each entry, we record 2 critical paths instead of all critical paths.
// CriticalPaths[i][0].MatchNum always holds the minimum matching number.
// CriticalPaths[i][1].MatchNum is always greater or equal to CriticalPaths[i][0].MatchNum, but
// it's not guaranteed to be the 2nd minimum match number.
TpKeyToCriticalPaths map[string]*criticalPaths
// TpKeyToDomainsNum is keyed with topologyKey, and valued with the number of domains.
TpKeyToDomainsNum map[string]int
// TpPairToMatchNum is keyed with topologyPair, and valued with the number of matching pods.
TpPairToMatchNum map[topologyPair]int
CriticalPaths []*criticalPaths
// TpValueToMatchNum is a slice indexed by constraint index.
// Each entry is keyed with topology value, and valued with the number of matching pods.
TpValueToMatchNum []map[string]int
}
// minMatchNum returns the global minimum for the calculation of skew while taking MinDomains into account.
func (s *preFilterState) minMatchNum(tpKey string, minDomains int32) (int, error) {
paths, ok := s.TpKeyToCriticalPaths[tpKey]
if !ok {
return 0, fmt.Errorf("failed to retrieve path by topology key")
}
func (s *preFilterState) minMatchNum(constraintID int, minDomains int32) (int, error) {
paths := s.CriticalPaths[constraintID]
minMatchNum := paths[0].MatchNum
domainsNum, ok := s.TpKeyToDomainsNum[tpKey]
if !ok {
return 0, fmt.Errorf("failed to retrieve the number of domains by topology key")
}
domainsNum := len(s.TpValueToMatchNum[constraintID])
if domainsNum < int(minDomains) {
// When the number of eligible domains with matching topology keys is less than `minDomains`,
@ -79,17 +74,15 @@ func (s *preFilterState) Clone() framework.StateData {
}
copy := preFilterState{
// Constraints are shared because they don't change.
Constraints: s.Constraints,
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(s.TpKeyToCriticalPaths)),
// The number of domains does not change as a result of AddPod/RemovePod methods on PreFilter Extensions
TpKeyToDomainsNum: s.TpKeyToDomainsNum,
TpPairToMatchNum: make(map[topologyPair]int, len(s.TpPairToMatchNum)),
Constraints: s.Constraints,
CriticalPaths: make([]*criticalPaths, len(s.CriticalPaths)),
TpValueToMatchNum: make([]map[string]int, len(s.TpValueToMatchNum)),
}
for tpKey, paths := range s.TpKeyToCriticalPaths {
copy.TpKeyToCriticalPaths[tpKey] = &criticalPaths{paths[0], paths[1]}
for i, paths := range s.CriticalPaths {
copy.CriticalPaths[i] = &criticalPaths{paths[0], paths[1]}
}
for tpPair, matchNum := range s.TpPairToMatchNum {
copy.TpPairToMatchNum[tpPair] = matchNum
for i, tpMap := range s.TpValueToMatchNum {
copy.TpValueToMatchNum[i] = maps.Clone(tpMap)
}
return &copy
}
@ -200,7 +193,7 @@ func (pl *PodTopologySpread) updateWithPod(s *preFilterState, updatedPod, preemp
}
podLabelSet := labels.Set(updatedPod.Labels)
for _, constraint := range s.Constraints {
for i, constraint := range s.Constraints {
if !constraint.Selector.Matches(podLabelSet) {
continue
}
@ -210,10 +203,9 @@ func (pl *PodTopologySpread) updateWithPod(s *preFilterState, updatedPod, preemp
continue
}
k, v := constraint.TopologyKey, node.Labels[constraint.TopologyKey]
pair := topologyPair{key: k, value: v}
s.TpPairToMatchNum[pair] += delta
s.TpKeyToCriticalPaths[k].update(v, s.TpPairToMatchNum[pair])
v := node.Labels[constraint.TopologyKey]
s.TpValueToMatchNum[i][v] += delta
s.CriticalPaths[i].update(v, s.TpValueToMatchNum[i][v])
}
}
@ -232,6 +224,12 @@ func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error
return s, nil
}
// topologyCount is a single per-node, per-constraint tally. calPreFilterState
// builds one slice of these for each node inside the function it hands to
// pl.parallelizer.Until, and afterwards sums them into
// preFilterState.TpValueToMatchNum.
type topologyCount struct {
// topologyValue is the node's label value for the constraint's TopologyKey.
topologyValue string
// constraintID is the index of the constraint in the constraints slice; it is
// stored explicitly because a node's tally list may skip some constraints.
constraintID int
// count is the result of countPodsMatchSelector for the node's pods and the
// constraint's selector.
count int
}
// calPreFilterState computes preFilterState describing how pods are spread on topologies.
func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod) (*preFilterState, error) {
constraints, err := pl.getConstraints(pod)
@ -248,15 +246,18 @@ func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod)
}
s := preFilterState{
Constraints: constraints,
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(constraints)),
TpPairToMatchNum: make(map[topologyPair]int, sizeHeuristic(len(allNodes), constraints)),
Constraints: constraints,
CriticalPaths: make([]*criticalPaths, len(constraints)),
TpValueToMatchNum: make([]map[string]int, len(constraints)),
}
for i := 0; i < len(constraints); i++ {
s.TpValueToMatchNum[i] = make(map[string]int, sizeHeuristic(len(allNodes), constraints[i]))
}
tpCountsByNode := make([]map[topologyPair]int, len(allNodes))
tpCountsByNode := make([][]topologyCount, len(allNodes))
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
processNode := func(i int) {
nodeInfo := allNodes[i]
processNode := func(n int) {
nodeInfo := allNodes[n]
node := nodeInfo.Node()
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
@ -272,38 +273,39 @@ func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod)
return
}
tpCounts := make(map[topologyPair]int, len(constraints))
for _, c := range constraints {
tpCounts := make([]topologyCount, 0, len(constraints))
for i, c := range constraints {
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
continue
}
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
value := node.Labels[c.TopologyKey]
count := countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace)
tpCounts[pair] = count
tpCounts = append(tpCounts, topologyCount{
topologyValue: value,
constraintID: i,
count: count,
})
}
tpCountsByNode[i] = tpCounts
tpCountsByNode[n] = tpCounts
}
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
for _, tpCounts := range tpCountsByNode {
for tp, count := range tpCounts {
s.TpPairToMatchNum[tp] += count
// tpCounts might not hold all the constraints, so index can't be used here as constraintID.
for _, tpCount := range tpCounts {
s.TpValueToMatchNum[tpCount.constraintID][tpCount.topologyValue] += tpCount.count
}
}
s.TpKeyToDomainsNum = make(map[string]int, len(constraints))
for tp := range s.TpPairToMatchNum {
s.TpKeyToDomainsNum[tp.key]++
}
// calculate min match for each topology pair
// calculate min match for each constraint and topology value
for i := 0; i < len(constraints); i++ {
key := constraints[i].TopologyKey
s.TpKeyToCriticalPaths[key] = newCriticalPaths()
}
for pair, num := range s.TpPairToMatchNum {
s.TpKeyToCriticalPaths[pair.key].update(pair.value, num)
s.CriticalPaths[i] = newCriticalPaths()
for value, num := range s.TpValueToMatchNum[i] {
s.CriticalPaths[i].update(value, num)
}
}
return &s, nil
@ -325,19 +327,19 @@ func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.C
logger := klog.FromContext(ctx)
podLabelSet := labels.Set(pod.Labels)
for _, c := range s.Constraints {
for i, c := range s.Constraints {
tpKey := c.TopologyKey
tpVal, ok := node.Labels[c.TopologyKey]
tpVal, ok := node.Labels[tpKey]
if !ok {
logger.V(5).Info("Node doesn't have required label", "node", klog.KObj(node), "label", tpKey)
logger.V(5).Info("Node doesn't have required topology label for spread constraint", "node", klog.KObj(node), "topologyKey", tpKey)
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonNodeLabelNotMatch)
}
// judging criteria:
// 'existing matching num' + 'if self-match (1 or 0)' - 'global minimum' <= 'maxSkew'
minMatchNum, err := s.minMatchNum(tpKey, c.MinDomains)
minMatchNum, err := s.minMatchNum(i, c.MinDomains)
if err != nil {
logger.Error(err, "Internal error occurred while retrieving value precalculated in PreFilter", "topologyKey", tpKey, "paths", s.TpKeyToCriticalPaths)
logger.Error(err, "Internal error occurred while retrieving value precalculated in PreFilter", "topologyKey", tpKey, "paths", s.CriticalPaths[i])
continue
}
@ -346,11 +348,7 @@ func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.C
selfMatchNum = 1
}
pair := topologyPair{key: tpKey, value: tpVal}
matchNum := 0
if tpCount, ok := s.TpPairToMatchNum[pair]; ok {
matchNum = tpCount
}
matchNum := s.TpValueToMatchNum[i][tpVal]
skew := matchNum + selfMatchNum - minMatchNum
if skew > int(c.MaxSkew) {
logger.V(5).Info("Node failed spreadConstraint: matchNum + selfMatchNum - minMatchNum > maxSkew", "node", klog.KObj(node), "topologyKey", tpKey, "matchNum", matchNum, "selfMatchNum", selfMatchNum, "minMatchNum", minMatchNum, "maxSkew", c.MaxSkew)
@ -361,11 +359,9 @@ func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.C
return nil
}
func sizeHeuristic(nodes int, constraints []topologySpreadConstraint) int {
for _, c := range constraints {
if c.TopologyKey == v1.LabelHostname {
return nodes
}
func sizeHeuristic(nodes int, constraint topologySpreadConstraint) int {
if constraint.TopologyKey == v1.LabelHostname {
return nodes
}
return 0
}

View File

@ -146,8 +146,8 @@ func (pl *PodTopologySpread) EventsToRegister(_ context.Context) ([]framework.Cl
//
// The Pod rejected by this plugin can be schedulable when the Pod has a spread constraint with NodeTaintsPolicy:Honor
// and has got a new toleration.
// So, we add UpdatePodTolerations here only when QHint is enabled.
podActionType = framework.Add | framework.UpdatePodLabel | framework.UpdatePodTolerations | framework.Delete
// So, we add UpdatePodToleration here only when QHint is enabled.
podActionType = framework.Add | framework.UpdatePodLabel | framework.UpdatePodToleration | framework.Delete
}
return []framework.ClusterEventWithHint{

View File

@ -37,8 +37,9 @@ type preScoreState struct {
Constraints []topologySpreadConstraint
// IgnoredNodes is a set of node names which miss some Constraints[*].topologyKey.
IgnoredNodes sets.Set[string]
// TopologyPairToPodCounts is keyed with topologyPair, and valued with the number of matching pods.
TopologyPairToPodCounts map[topologyPair]*int64
// TopologyValueToPodCounts is a slice indexed by constraint index.
// Each entry is keyed with topology value, and valued with the number of matching pods.
TopologyValueToPodCounts []map[string]*int64
// TopologyNormalizingWeight is the weight we give to the counts per topology.
// This allows the pod counts of smaller topologies to not be watered down by
// bigger ones.
@ -76,6 +77,10 @@ func (pl *PodTopologySpread) initPreScoreState(s *preScoreState, pod *v1.Pod, fi
if len(s.Constraints) == 0 {
return nil
}
s.TopologyValueToPodCounts = make([]map[string]*int64, len(s.Constraints))
for i := 0; i < len(s.Constraints); i++ {
s.TopologyValueToPodCounts[i] = make(map[string]*int64)
}
topoSize := make([]int, len(s.Constraints))
for _, node := range filteredNodes {
if requireAllTopologies && !nodeLabelsMatchSpreadConstraints(node.Node().Labels, s.Constraints) {
@ -89,9 +94,9 @@ func (pl *PodTopologySpread) initPreScoreState(s *preScoreState, pod *v1.Pod, fi
if constraint.TopologyKey == v1.LabelHostname {
continue
}
pair := topologyPair{key: constraint.TopologyKey, value: node.Node().Labels[constraint.TopologyKey]}
if s.TopologyPairToPodCounts[pair] == nil {
s.TopologyPairToPodCounts[pair] = new(int64)
value := node.Node().Labels[constraint.TopologyKey]
if s.TopologyValueToPodCounts[i][value] == nil {
s.TopologyValueToPodCounts[i][value] = new(int64)
topoSize[i]++
}
}
@ -126,8 +131,7 @@ func (pl *PodTopologySpread) PreScore(
}
state := &preScoreState{
IgnoredNodes: sets.New[string](),
TopologyPairToPodCounts: make(map[topologyPair]*int64),
IgnoredNodes: sets.New[string](),
}
// Only require that nodes have all the topology labels if using
// non-system-default spreading rules. This allows nodes that don't have a
@ -145,8 +149,8 @@ func (pl *PodTopologySpread) PreScore(
// Ignore parsing errors for backwards compatibility.
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
processAllNode := func(i int) {
nodeInfo := allNodes[i]
processAllNode := func(n int) {
nodeInfo := allNodes[n]
node := nodeInfo.Node()
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
@ -161,17 +165,17 @@ func (pl *PodTopologySpread) PreScore(
return
}
for _, c := range state.Constraints {
for i, c := range state.Constraints {
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
continue
}
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
value := node.Labels[c.TopologyKey]
// If current topology pair is not associated with any candidate node,
// continue to avoid unnecessary calculation.
// Per-node counts are also skipped, as they are done during Score.
tpCount := state.TopologyPairToPodCounts[pair]
tpCount := state.TopologyValueToPodCounts[i][value]
if tpCount == nil {
continue
}
@ -188,12 +192,7 @@ func (pl *PodTopologySpread) PreScore(
// Score invoked at the Score extension point.
// The "score" returned in this function is the matching number of pods on the `nodeName`,
// it is normalized later.
func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
node := nodeInfo.Node()
s, err := getPreScoreState(cycleState)
if err != nil {
@ -214,8 +213,7 @@ func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.Cy
if c.TopologyKey == v1.LabelHostname {
cnt = int64(countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace))
} else {
pair := topologyPair{key: c.TopologyKey, value: tpVal}
cnt = *s.TopologyPairToPodCounts[pair]
cnt = *s.TopologyValueToPodCounts[i][tpVal]
}
score += scoreForCount(cnt, c.MaxSkew, s.TopologyNormalizingWeight[i])
}

View File

@ -46,9 +46,12 @@ import (
// through the WithFrameworkOutOfTreeRegistry option.
func NewInTreeRegistry() runtime.Registry {
fts := plfeature.Features{
EnableDRAPrioritizedList: feature.DefaultFeatureGate.Enabled(features.DRAPrioritizedList),
EnableDRAAdminAccess: feature.DefaultFeatureGate.Enabled(features.DRAAdminAccess),
EnableDRADeviceTaints: feature.DefaultFeatureGate.Enabled(features.DRADeviceTaints),
EnableDynamicResourceAllocation: feature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation),
EnableVolumeCapacityPriority: feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
EnableVolumeAttributesClass: feature.DefaultFeatureGate.Enabled(features.VolumeAttributesClass),
EnableCSIMigrationPortworx: feature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx),
EnableNodeInclusionPolicyInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.NodeInclusionPolicyInPodTopologySpread),
EnableMatchLabelKeysInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.MatchLabelKeysInPodTopologySpread),
EnableInPlacePodVerticalScaling: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
@ -56,6 +59,8 @@ func NewInTreeRegistry() runtime.Registry {
EnableSchedulingQueueHint: feature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
EnableAsyncPreemption: feature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption),
EnablePodLevelResources: feature.DefaultFeatureGate.Enabled(features.PodLevelResources),
EnablePartitionableDevices: feature.DefaultFeatureGate.Enabled(features.DRAPartitionableDevices),
EnableStorageCapacityScoring: feature.DefaultFeatureGate.Enabled(features.StorageCapacityScoring),
}
registry := runtime.Registry{

View File

@ -67,7 +67,7 @@ func (pl *TaintToleration) EventsToRegister(_ context.Context) ([]framework.Clus
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodToleration}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
}, nil
}
@ -143,9 +143,6 @@ func getAllTolerationPreferNoSchedule(tolerations []v1.Toleration) (tolerationLi
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *TaintToleration) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
if len(nodes) == 0 {
return nil
}
tolerationsPreferNoSchedule := getAllTolerationPreferNoSchedule(pod.Spec.Tolerations)
state := &preScoreState{
tolerationsPreferNoSchedule: tolerationsPreferNoSchedule,
@ -183,11 +180,7 @@ func countIntolerableTaintsPreferNoSchedule(taints []v1.Taint, tolerations []v1.
}
// Score invoked at the Score extension point.
func (pl *TaintToleration) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
func (pl *TaintToleration) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
node := nodeInfo.Node()
s, err := getPreScoreState(state)

View File

@ -33,7 +33,6 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/storage"
utilfeature "k8s.io/apiserver/pkg/util/feature"
coreinformers "k8s.io/client-go/informers/core/v1"
storageinformers "k8s.io/client-go/informers/storage/v1"
clientset "k8s.io/client-go/kubernetes"
@ -45,7 +44,7 @@ import (
csiplugins "k8s.io/csi-translation-lib/plugins"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
)
@ -65,7 +64,7 @@ const (
// ErrReasonBindConflict is used for VolumeBindingNoMatch predicate error.
ErrReasonBindConflict ConflictReason = "node(s) didn't find available persistent volumes to bind"
// ErrReasonNodeConflict is used for VolumeNodeAffinityConflict predicate error.
ErrReasonNodeConflict ConflictReason = "node(s) had volume node affinity conflict"
ErrReasonNodeConflict ConflictReason = "node(s) didn't match PersistentVolume's node affinity"
// ErrReasonNotEnoughSpace is used when a pod cannot start on a node because not enough storage space is available.
ErrReasonNotEnoughSpace = "node(s) did not have enough free storage"
// ErrReasonPVNotExist is used when a pod has one or more PVC(s) bound to non-existent persistent volume(s)"
@ -103,13 +102,19 @@ func (b *BindingInfo) StorageResource() *StorageResource {
}
}
// DynamicProvision represents a dynamically provisioned volume.
type DynamicProvision struct {
// PVC is the claim that requires dynamic provisioning.
PVC *v1.PersistentVolumeClaim
// NodeCapacity is the CSIStorageCapacity value returned by hasEnoughCapacity
// when checkVolumeProvisions evaluated the claim against a node. It is left
// unset on the entries that AssumePodVolumes creates for cloned claims.
NodeCapacity *storagev1.CSIStorageCapacity
}
// PodVolumes holds pod's volumes information used in volume scheduling.
type PodVolumes struct {
// StaticBindings are binding decisions for PVCs which can be bound to
// pre-provisioned static PVs.
StaticBindings []*BindingInfo
// DynamicProvisions are PVCs that require dynamic provisioning
DynamicProvisions []*v1.PersistentVolumeClaim
DynamicProvisions []*DynamicProvision
}
// InTreeToCSITranslator contains methods required to check migratable status
@ -203,7 +208,9 @@ type PodVolumeClaims struct {
}
type volumeBinder struct {
kubeClient clientset.Interface
kubeClient clientset.Interface
enableVolumeAttributesClass bool
enableCSIMigrationPortworx bool
classLister storagelisters.StorageClassLister
podLister corelisters.PodLister
@ -238,6 +245,7 @@ type CapacityCheck struct {
func NewVolumeBinder(
logger klog.Logger,
kubeClient clientset.Interface,
fts feature.Features,
podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer,
csiNodeInformer storageinformers.CSINodeInformer,
@ -247,15 +255,17 @@ func NewVolumeBinder(
capacityCheck CapacityCheck,
bindTimeout time.Duration) SchedulerVolumeBinder {
b := &volumeBinder{
kubeClient: kubeClient,
podLister: podInformer.Lister(),
classLister: storageClassInformer.Lister(),
nodeLister: nodeInformer.Lister(),
csiNodeLister: csiNodeInformer.Lister(),
pvcCache: NewPVCAssumeCache(logger, pvcInformer.Informer()),
pvCache: NewPVAssumeCache(logger, pvInformer.Informer()),
bindTimeout: bindTimeout,
translator: csitrans.New(),
kubeClient: kubeClient,
enableVolumeAttributesClass: fts.EnableVolumeAttributesClass,
enableCSIMigrationPortworx: fts.EnableCSIMigrationPortworx,
podLister: podInformer.Lister(),
classLister: storageClassInformer.Lister(),
nodeLister: nodeInformer.Lister(),
csiNodeLister: csiNodeInformer.Lister(),
pvcCache: NewPVCAssumeCache(logger, pvcInformer.Informer()),
pvCache: NewPVAssumeCache(logger, pvInformer.Informer()),
bindTimeout: bindTimeout,
translator: csitrans.New(),
}
b.csiDriverLister = capacityCheck.CSIDriverInformer.Lister()
@ -306,7 +316,7 @@ func (b *volumeBinder) FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolume
var (
staticBindings []*BindingInfo
dynamicProvisions []*v1.PersistentVolumeClaim
dynamicProvisions []*DynamicProvision
)
defer func() {
// Although we do not distinguish nil from empty in this function, for
@ -373,6 +383,16 @@ func (b *volumeBinder) FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolume
return
}
// convertDynamicProvisionsToPVCs returns the PVC of every entry in
// dynamicProvisions, in the same order. The result is always a non-nil slice,
// even when the input is nil or empty.
func convertDynamicProvisionsToPVCs(dynamicProvisions []*DynamicProvision) []*v1.PersistentVolumeClaim {
	claims := make([]*v1.PersistentVolumeClaim, len(dynamicProvisions))
	for idx := range dynamicProvisions {
		claims[idx] = dynamicProvisions[idx].PVC
	}
	return claims
}
// AssumePodVolumes will take the matching PVs and PVCs to provision in pod's
// volume information for the chosen node, and:
// 1. Update the pvCache with the new prebound PV.
@ -419,20 +439,21 @@ func (b *volumeBinder) AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod,
}
// Assume PVCs
newProvisionedPVCs := []*v1.PersistentVolumeClaim{}
for _, claim := range podVolumes.DynamicProvisions {
newProvisionedPVCs := []*DynamicProvision{}
for _, dynamicProvision := range podVolumes.DynamicProvisions {
// The claims from method args can be pointing to watcher cache. We must not
// modify these, therefore create a copy.
claimClone := claim.DeepCopy()
claimClone := dynamicProvision.PVC.DeepCopy()
metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, volume.AnnSelectedNode, nodeName)
err = b.pvcCache.Assume(claimClone)
if err != nil {
pvcs := convertDynamicProvisionsToPVCs(newProvisionedPVCs)
b.revertAssumedPVs(newBindings)
b.revertAssumedPVCs(newProvisionedPVCs)
b.revertAssumedPVCs(pvcs)
return
}
newProvisionedPVCs = append(newProvisionedPVCs, claimClone)
newProvisionedPVCs = append(newProvisionedPVCs, &DynamicProvision{PVC: claimClone})
}
podVolumes.StaticBindings = newBindings
@ -442,8 +463,9 @@ func (b *volumeBinder) AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod,
// RevertAssumedPodVolumes will revert assumed PV and PVC cache.
func (b *volumeBinder) RevertAssumedPodVolumes(podVolumes *PodVolumes) {
pvcs := convertDynamicProvisionsToPVCs(podVolumes.DynamicProvisions)
b.revertAssumedPVs(podVolumes.StaticBindings)
b.revertAssumedPVCs(podVolumes.DynamicProvisions)
b.revertAssumedPVCs(pvcs)
}
// BindPodVolumes gets the cached bindings and PVCs to provision in pod's volumes information,
@ -460,7 +482,7 @@ func (b *volumeBinder) BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, p
}()
bindings := podVolumes.StaticBindings
claimsToProvision := podVolumes.DynamicProvisions
claimsToProvision := convertDynamicProvisionsToPVCs(podVolumes.DynamicProvisions)
// Start API operations
err = b.bindAPIUpdate(ctx, assumedPod, bindings, claimsToProvision)
@ -855,7 +877,7 @@ func (b *volumeBinder) findMatchingVolumes(logger klog.Logger, pod *v1.Pod, clai
pvs := unboundVolumesDelayBinding[storageClassName]
// Find a matching PV
pv, err := volume.FindMatchingVolume(pvc, pvs, node, chosenPVs, true, utilfeature.DefaultFeatureGate.Enabled(features.VolumeAttributesClass))
pv, err := volume.FindMatchingVolume(pvc, pvs, node, chosenPVs, true, b.enableVolumeAttributesClass)
if err != nil {
return false, nil, nil, err
}
@ -882,8 +904,8 @@ func (b *volumeBinder) findMatchingVolumes(logger klog.Logger, pod *v1.Pod, clai
// checkVolumeProvisions checks given unbound claims (the claims have gone through func
// findMatchingVolumes, and do not have matching volumes for binding), and return true
// if all of the claims are eligible for dynamic provision.
func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied, sufficientStorage bool, dynamicProvisions []*v1.PersistentVolumeClaim, err error) {
dynamicProvisions = []*v1.PersistentVolumeClaim{}
func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied, sufficientStorage bool, dynamicProvisions []*DynamicProvision, err error) {
dynamicProvisions = []*DynamicProvision{}
// We return early with dynamicProvisions == nil if a check
// fails or we encounter an error.
@ -911,7 +933,7 @@ func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, cl
}
// Check storage capacity.
sufficient, err := b.hasEnoughCapacity(logger, provisioner, claim, class, node)
sufficient, capacity, err := b.hasEnoughCapacity(logger, provisioner, claim, class, node)
if err != nil {
return false, false, nil, err
}
@ -920,8 +942,10 @@ func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, cl
return true, false, nil, nil
}
dynamicProvisions = append(dynamicProvisions, claim)
dynamicProvisions = append(dynamicProvisions, &DynamicProvision{
PVC: claim,
NodeCapacity: capacity,
})
}
logger.V(4).Info("Provisioning for claims of pod that has no matching volumes...", "claimCount", len(claimsToProvision), "pod", klog.KObj(pod), "node", klog.KObj(node))
@ -941,12 +965,12 @@ func (b *volumeBinder) revertAssumedPVCs(claims []*v1.PersistentVolumeClaim) {
}
// hasEnoughCapacity checks whether the provisioner has enough capacity left for a new volume of the given size
// that is available from the node.
func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string, claim *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass, node *v1.Node) (bool, error) {
// that is available from the node. This function returns the node capacity based on the PVC's storage class.
func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string, claim *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass, node *v1.Node) (bool, *storagev1.CSIStorageCapacity, error) {
quantity, ok := claim.Spec.Resources.Requests[v1.ResourceStorage]
if !ok {
// No capacity to check for.
return true, nil
return true, nil, nil
}
// Only enabled for CSI drivers which opt into it.
@ -956,19 +980,19 @@ func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string,
// Either the provisioner is not a CSI driver or the driver does not
// opt into storage capacity scheduling. Either way, skip
// capacity checking.
return true, nil
return true, nil, nil
}
return false, err
return false, nil, err
}
if driver.Spec.StorageCapacity == nil || !*driver.Spec.StorageCapacity {
return true, nil
return true, nil, nil
}
// Look for a matching CSIStorageCapacity object(s).
// TODO (for beta): benchmark this and potentially introduce some kind of lookup structure (https://github.com/kubernetes/enhancements/issues/1698#issuecomment-654356718).
capacities, err := b.csiStorageCapacityLister.List(labels.Everything())
if err != nil {
return false, err
return false, nil, err
}
sizeInBytes := quantity.Value()
@ -977,7 +1001,7 @@ func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string,
capacitySufficient(capacity, sizeInBytes) &&
b.nodeHasAccess(logger, node, capacity) {
// Enough capacity found.
return true, nil
return true, capacity, nil
}
}
@ -985,7 +1009,7 @@ func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string,
// they had to be rejected. Log that above? But that might be a lot of log output...
logger.V(4).Info("Node has no accessible CSIStorageCapacity with enough capacity for PVC",
"node", klog.KObj(node), "PVC", klog.KObj(claim), "size", sizeInBytes, "storageClass", klog.KObj(storageClass))
return false, nil
return false, nil, nil
}
func capacitySufficient(capacity *storagev1.CSIStorageCapacity, sizeInBytes int64) bool {
@ -1033,7 +1057,7 @@ func (a byPVCSize) Less(i, j int) bool {
}
// isCSIMigrationOnForPlugin checks if CSI migration is enabled for a given plugin.
func isCSIMigrationOnForPlugin(pluginName string) bool {
func isCSIMigrationOnForPlugin(pluginName string, enableCSIMigrationPortworx bool) bool {
switch pluginName {
case csiplugins.AWSEBSInTreePluginName:
return true
@ -1044,7 +1068,7 @@ func isCSIMigrationOnForPlugin(pluginName string) bool {
case csiplugins.CinderInTreePluginName:
return true
case csiplugins.PortworxVolumePluginName:
return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx)
return enableCSIMigrationPortworx
}
return false
}
@ -1083,7 +1107,7 @@ func (b *volumeBinder) tryTranslatePVToCSI(logger klog.Logger, pv *v1.Persistent
return nil, fmt.Errorf("could not get plugin name from pv: %v", err)
}
if !isCSIMigrationOnForPlugin(pluginName) {
if !isCSIMigrationOnForPlugin(pluginName, b.enableCSIMigrationPortworx) {
return pv, nil
}

View File

@ -29,6 +29,7 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
corelisters "k8s.io/client-go/listers/core/v1"
storagelisters "k8s.io/client-go/listers/storage/v1"
"k8s.io/component-helpers/storage/ephemeral"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
@ -70,10 +71,11 @@ func (d *stateData) Clone() framework.StateData {
// In the Filter phase, pod binding cache is created for the pod and used in
// Reserve and PreBind phases.
type VolumeBinding struct {
Binder SchedulerVolumeBinder
PVCLister corelisters.PersistentVolumeClaimLister
scorer volumeCapacityScorer
fts feature.Features
Binder SchedulerVolumeBinder
PVCLister corelisters.PersistentVolumeClaimLister
classLister storagelisters.StorageClassLister
scorer volumeCapacityScorer
fts feature.Features
}
var _ framework.PreFilterPlugin = &VolumeBinding{}
@ -451,14 +453,14 @@ func (pl *VolumeBinding) PreScore(ctx context.Context, cs *framework.CycleState,
if err != nil {
return framework.AsStatus(err)
}
if state.hasStaticBindings {
if state.hasStaticBindings || pl.fts.EnableStorageCapacityScoring {
return nil
}
return framework.NewStatus(framework.Skip)
}
// Score invoked at the score extension point.
func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
if pl.scorer == nil {
return 0, nil
}
@ -466,24 +468,49 @@ func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, po
if err != nil {
return 0, framework.AsStatus(err)
}
nodeName := nodeInfo.Node().Name
podVolumes, ok := state.podVolumesByNode[nodeName]
if !ok {
return 0, nil
}
// group by storage class
classResources := make(classResourceMap)
for _, staticBinding := range podVolumes.StaticBindings {
class := staticBinding.StorageClassName()
storageResource := staticBinding.StorageResource()
if _, ok := classResources[class]; !ok {
classResources[class] = &StorageResource{
Requested: 0,
Capacity: 0,
if len(podVolumes.StaticBindings) != 0 || !pl.fts.EnableStorageCapacityScoring {
// group static binding volumes by storage class
for _, staticBinding := range podVolumes.StaticBindings {
class := staticBinding.StorageClassName()
storageResource := staticBinding.StorageResource()
if _, ok := classResources[class]; !ok {
classResources[class] = &StorageResource{
Requested: 0,
Capacity: 0,
}
}
classResources[class].Requested += storageResource.Requested
classResources[class].Capacity += storageResource.Capacity
}
} else {
// group dynamic binding volumes by storage class
for _, provision := range podVolumes.DynamicProvisions {
if provision.NodeCapacity == nil {
continue
}
class := *provision.PVC.Spec.StorageClassName
if _, ok := classResources[class]; !ok {
classResources[class] = &StorageResource{
Requested: 0,
Capacity: 0,
}
}
// The following line cannot be +=. For example, if a Pod requests two 50GB volumes from
// a StorageClass with 100GB of capacity on a node, this part of the code will be executed twice.
// In that case, using += would incorrectly set classResources[class].Capacity to 200GB.
classResources[class].Capacity = provision.NodeCapacity.Capacity.Value()
requestedQty := provision.PVC.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
classResources[class].Requested += requestedQty.Value()
}
classResources[class].Requested += storageResource.Requested
classResources[class].Capacity += storageResource.Capacity
}
return pl.scorer(classResources), nil
}
@ -565,7 +592,7 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
return nil, fmt.Errorf("want args to be of type VolumeBindingArgs, got %T", plArgs)
}
if err := validation.ValidateVolumeBindingArgsWithOptions(nil, args, validation.VolumeBindingArgsValidationOptions{
AllowVolumeCapacityPriority: fts.EnableVolumeCapacityPriority,
AllowStorageCapacityScoring: fts.EnableStorageCapacityScoring,
}); err != nil {
return nil, err
}
@ -579,11 +606,11 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
CSIDriverInformer: fh.SharedInformerFactory().Storage().V1().CSIDrivers(),
CSIStorageCapacityInformer: fh.SharedInformerFactory().Storage().V1().CSIStorageCapacities(),
}
binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second)
binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), fts, podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second)
// build score function
var scorer volumeCapacityScorer
if fts.EnableVolumeCapacityPriority {
if fts.EnableStorageCapacityScoring {
shape := make(helper.FunctionShape, 0, len(args.Shape))
for _, point := range args.Shape {
shape = append(shape, helper.FunctionShapePoint{
@ -594,9 +621,10 @@ func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts fe
scorer = buildScorerFunction(shape)
}
return &VolumeBinding{
Binder: binder,
PVCLister: pvcInformer.Lister(),
scorer: scorer,
fts: fts,
Binder: binder,
PVCLister: pvcInformer.Lister(),
classLister: storageClassInformer.Lister(),
scorer: scorer,
fts: fts,
}, nil
}

View File

@ -40,7 +40,6 @@ import (
apipod "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/util"
)
@ -149,7 +148,7 @@ func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsy
pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()
ev := &Evaluator{
PluginName: names.DefaultPreemption,
PluginName: pluginName,
Handler: fh,
PodLister: podLister,
PdbLister: pdbLister,
@ -172,10 +171,11 @@ func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsy
logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(preemptor), "waitingPod", klog.KObj(victim), "node", c.Name())
} else {
condition := &v1.PodCondition{
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
Reason: v1.PodReasonPreemptionByScheduler,
Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
Type: v1.DisruptionTarget,
ObservedGeneration: apipod.GetPodObservedGenerationIfEnabledOnCondition(&victim.Status, victim.Generation, v1.DisruptionTarget),
Status: v1.ConditionTrue,
Reason: v1.PodReasonPreemptionByScheduler,
Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
}
newStatus := victim.Status.DeepCopy()
updated := apipod.UpdatePodCondition(newStatus, condition)
@ -261,6 +261,7 @@ func (ev *Evaluator) Preempt(ctx context.Context, state *framework.CycleState, p
// Return a FitError only when there are no candidates that fit the pod.
if len(candidates) == 0 {
logger.V(2).Info("No preemption candidate is found; preemption is not helpful for scheduling", "pod", klog.KObj(pod))
fitError := &framework.FitError{
Pod: pod,
NumAllNodes: len(allNodes),

View File

@ -1011,7 +1011,7 @@ func (f *frameworkImpl) RunFilterPluginsWithNominatedPods(ctx context.Context, s
nodeInfoToUse := info
if i == 0 {
var err error
podsAdded, stateToUse, nodeInfoToUse, err = addNominatedPods(ctx, f, pod, state, info)
podsAdded, stateToUse, nodeInfoToUse, err = addGENominatedPods(ctx, f, pod, state, info)
if err != nil {
return framework.AsStatus(err)
}
@ -1028,10 +1028,10 @@ func (f *frameworkImpl) RunFilterPluginsWithNominatedPods(ctx context.Context, s
return status
}
// addNominatedPods adds pods with equal or greater priority which are nominated
// addGENominatedPods adds pods with equal or greater priority which are nominated
// to run on the node. It returns 1) whether any pod was added, 2) augmented cycleState,
// 3) augmented nodeInfo.
func addNominatedPods(ctx context.Context, fh framework.Handle, pod *v1.Pod, state *framework.CycleState, nodeInfo *framework.NodeInfo) (bool, *framework.CycleState, *framework.NodeInfo, error) {
func addGENominatedPods(ctx context.Context, fh framework.Handle, pod *v1.Pod, state *framework.CycleState, nodeInfo *framework.NodeInfo) (bool, *framework.CycleState, *framework.NodeInfo, error) {
if fh == nil {
// This may happen only in tests.
return false, state, nodeInfo, nil
@ -1137,7 +1137,8 @@ func (f *frameworkImpl) RunScorePlugins(ctx context.Context, state *framework.Cy
}
// Run Score method for each node in parallel.
f.Parallelizer().Until(ctx, len(nodes), func(index int) {
nodeName := nodes[index].Node().Name
nodeInfo := nodes[index]
nodeName := nodeInfo.Node().Name
logger := logger
if verboseLogs {
logger = klog.LoggerWithValues(logger, "node", klog.ObjectRef{Name: nodeName})
@ -1148,7 +1149,7 @@ func (f *frameworkImpl) RunScorePlugins(ctx context.Context, state *framework.Cy
logger := klog.LoggerWithName(logger, pl.Name())
ctx = klog.NewContext(ctx, logger)
}
s, status := f.runScorePlugin(ctx, pl, state, pod, nodeName)
s, status := f.runScorePlugin(ctx, pl, state, pod, nodeInfo)
if !status.IsSuccess() {
err := fmt.Errorf("plugin %q failed with: %w", pl.Name(), status.AsError())
errCh.SendErrorWithCancel(err, cancel)
@ -1217,12 +1218,12 @@ func (f *frameworkImpl) RunScorePlugins(ctx context.Context, state *framework.Cy
return allNodePluginScores, nil
}
func (f *frameworkImpl) runScorePlugin(ctx context.Context, pl framework.ScorePlugin, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
func (f *frameworkImpl) runScorePlugin(ctx context.Context, pl framework.ScorePlugin, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
if !state.ShouldRecordPluginMetrics() {
return pl.Score(ctx, state, pod, nodeName)
return pl.Score(ctx, state, pod, nodeInfo)
}
startTime := time.Now()
s, status := pl.Score(ctx, state, pod, nodeName)
s, status := pl.Score(ctx, state, pod, nodeInfo)
f.metricsRecorder.ObservePluginDurationAsync(metrics.Score, pl.Name(), status.Code().String(), metrics.SinceInSeconds(startTime))
return s, status
}

View File

@ -77,7 +77,7 @@ type instrumentedScorePlugin struct {
var _ framework.ScorePlugin = &instrumentedScorePlugin{}
func (p *instrumentedScorePlugin) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
func (p *instrumentedScorePlugin) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) (int64, *framework.Status) {
p.metric.Inc()
return p.ScorePlugin.Score(ctx, state, pod, nodeName)
return p.ScorePlugin.Score(ctx, state, pod, nodeInfo)
}

View File

@ -72,9 +72,9 @@ const (
UpdatePodLabel
// UpdatePodScaleDown is an update for pod's scale down (i.e., any resource request is reduced).
UpdatePodScaleDown
// UpdatePodTolerations is an addition for pod's tolerations.
// UpdatePodToleration is an addition for pod's tolerations.
// (Due to API validation, we can add, but cannot modify or remove tolerations.)
UpdatePodTolerations
UpdatePodToleration
// UpdatePodSchedulingGatesEliminated is an update for pod's scheduling gates, which eliminates all scheduling gates in the Pod.
UpdatePodSchedulingGatesEliminated
// UpdatePodGeneratedResourceClaim is an update of the list of ResourceClaims generated for the pod.
@ -88,7 +88,7 @@ const (
All ActionType = 1<<iota - 1
// Use the general Update type if you don't either know or care the specific sub-Update type to use.
Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition | UpdateNodeAnnotation | UpdatePodLabel | UpdatePodScaleDown | UpdatePodTolerations | UpdatePodSchedulingGatesEliminated | UpdatePodGeneratedResourceClaim | updatePodOther
Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition | UpdateNodeAnnotation | UpdatePodLabel | UpdatePodScaleDown | UpdatePodToleration | UpdatePodSchedulingGatesEliminated | UpdatePodGeneratedResourceClaim | updatePodOther
// none is a special ActionType that is only used internally.
none ActionType = 0
)
@ -97,7 +97,7 @@ var (
// basicActionTypes is a list of basicActionTypes ActionTypes.
basicActionTypes = []ActionType{Add, Delete, Update}
// podActionTypes is a list of ActionTypes that are only applicable for Pod events.
podActionTypes = []ActionType{UpdatePodLabel, UpdatePodScaleDown, UpdatePodTolerations, UpdatePodSchedulingGatesEliminated, UpdatePodGeneratedResourceClaim}
podActionTypes = []ActionType{UpdatePodLabel, UpdatePodScaleDown, UpdatePodToleration, UpdatePodSchedulingGatesEliminated, UpdatePodGeneratedResourceClaim}
// nodeActionTypes is a list of ActionTypes that are only applicable for Node events.
nodeActionTypes = []ActionType{UpdateNodeAllocatable, UpdateNodeLabel, UpdateNodeTaint, UpdateNodeCondition, UpdateNodeAnnotation}
)
@ -122,8 +122,8 @@ func (a ActionType) String() string {
return "UpdatePodLabel"
case UpdatePodScaleDown:
return "UpdatePodScaleDown"
case UpdatePodTolerations:
return "UpdatePodTolerations"
case UpdatePodToleration:
return "UpdatePodToleration"
case UpdatePodSchedulingGatesEliminated:
return "UpdatePodSchedulingGatesEliminated"
case UpdatePodGeneratedResourceClaim:
@ -366,6 +366,11 @@ type QueuedPodInfo struct {
// Number of schedule attempts before successfully scheduled.
// It's used to record the # attempts metric and calculate the backoff time this Pod is obliged to get before retrying.
Attempts int
// BackoffExpiration is the time when the Pod will complete its backoff.
// If the SchedulerPopFromBackoffQ feature is enabled, the value is aligned to the backoff ordering window.
// Then, two Pods with the same BackoffExpiration (time bucket) are ordered by priority and eventually the timestamp,
// to make sure popping from the backoffQ considers priority of pods that are close to the expiration time.
BackoffExpiration time.Time
// The time when the pod is added to the queue for the first time. The pod may be added
// back to the queue multiple times before it's successfully scheduled.
// It shouldn't be updated once initialized. It's used to record the e2e scheduling
@ -397,6 +402,13 @@ func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo {
}
}
// podResource bundles the values produced by calculateResource.
// It is used only internally.
type podResource struct {
	// resource is the Resource built from the pod's requests.
	resource Resource
	// non0CPU (a milli-value) and non0Mem are the amounts that
	// NodeInfo.update applies to NonZeroRequested.MilliCPU and
	// NonZeroRequested.Memory respectively.
	non0CPU, non0Mem int64
}
// PodInfo is a wrapper to a Pod with additional pre-computed information to
// accelerate processing. This information is typically immutable (e.g., pre-processed
// inter-pod affinity selectors).
@ -406,6 +418,15 @@ type PodInfo struct {
RequiredAntiAffinityTerms []AffinityTerm
PreferredAffinityTerms []WeightedAffinityTerm
PreferredAntiAffinityTerms []WeightedAffinityTerm
// cachedResource contains precomputed resources for Pod (podResource).
// The value can change only if InPlacePodVerticalScaling is enabled.
// In that case, the whole PodInfo object is recreated (for assigned pods in cache).
// cachedResource contains a podResource, computed when adding a scheduled pod to NodeInfo.
// When removing a pod from a NodeInfo, i.e. finding victims for preemption or removing a pod from a cluster,
// cachedResource is used instead, which provides a noticeable performance boost.
// Note: cachedResource field shouldn't be accessed directly.
// Use calculateResource method to obtain it instead.
cachedResource *podResource
}
// DeepCopy returns a deep copy of the PodInfo object.
@ -416,6 +437,7 @@ func (pi *PodInfo) DeepCopy() *PodInfo {
RequiredAntiAffinityTerms: pi.RequiredAntiAffinityTerms,
PreferredAffinityTerms: pi.PreferredAffinityTerms,
PreferredAntiAffinityTerms: pi.PreferredAntiAffinityTerms,
cachedResource: pi.cachedResource,
}
}
@ -464,6 +486,7 @@ func (pi *PodInfo) Update(pod *v1.Pod) error {
pi.RequiredAntiAffinityTerms = requiredAntiAffinityTerms
pi.PreferredAffinityTerms = weightedAffinityTerms
pi.PreferredAntiAffinityTerms = weightedAntiAffinityTerms
pi.cachedResource = nil
return utilerrors.NewAggregate(parseErrs)
}
@ -963,7 +986,7 @@ func (n *NodeInfo) AddPodInfo(podInfo *PodInfo) {
if podWithRequiredAntiAffinity(podInfo.Pod) {
n.PodsWithRequiredAntiAffinity = append(n.PodsWithRequiredAntiAffinity, podInfo)
}
n.update(podInfo.Pod, 1)
n.update(podInfo, 1)
}
// AddPod is a wrapper around AddPodInfo.
@ -985,8 +1008,8 @@ func podWithRequiredAntiAffinity(p *v1.Pod) bool {
len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0
}
func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, bool) {
var removed bool
func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, *PodInfo) {
var removedPod *PodInfo
for i := range s {
tmpKey, err := GetPodKey(s[i].Pod)
if err != nil {
@ -994,18 +1017,18 @@ func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, bo
continue
}
if k == tmpKey {
removedPod = s[i]
// delete the element
s[i] = s[len(s)-1]
s = s[:len(s)-1]
removed = true
break
}
}
// resets the slices to nil so that we can do DeepEqual in unit tests.
if len(s) == 0 {
return nil, removed
return nil, removedPod
}
return s, removed
return s, removedPod
}
// RemovePod subtracts pod information from this NodeInfo.
@ -1021,33 +1044,33 @@ func (n *NodeInfo) RemovePod(logger klog.Logger, pod *v1.Pod) error {
n.PodsWithRequiredAntiAffinity, _ = removeFromSlice(logger, n.PodsWithRequiredAntiAffinity, k)
}
var removed bool
if n.Pods, removed = removeFromSlice(logger, n.Pods, k); removed {
n.update(pod, -1)
var removedPod *PodInfo
if n.Pods, removedPod = removeFromSlice(logger, n.Pods, k); removedPod != nil {
n.update(removedPod, -1)
return nil
}
return fmt.Errorf("no corresponding pod %s in pods of node %s", pod.Name, n.node.Name)
}
// update node info based on the pod and sign.
// update node info based on the pod, and sign.
// The sign will be set to `+1` when AddPod and to `-1` when RemovePod.
func (n *NodeInfo) update(pod *v1.Pod, sign int64) {
res, non0CPU, non0Mem := calculateResource(pod)
n.Requested.MilliCPU += sign * res.MilliCPU
n.Requested.Memory += sign * res.Memory
n.Requested.EphemeralStorage += sign * res.EphemeralStorage
if n.Requested.ScalarResources == nil && len(res.ScalarResources) > 0 {
func (n *NodeInfo) update(podInfo *PodInfo, sign int64) {
podResource := podInfo.calculateResource()
n.Requested.MilliCPU += sign * podResource.resource.MilliCPU
n.Requested.Memory += sign * podResource.resource.Memory
n.Requested.EphemeralStorage += sign * podResource.resource.EphemeralStorage
if n.Requested.ScalarResources == nil && len(podResource.resource.ScalarResources) > 0 {
n.Requested.ScalarResources = map[v1.ResourceName]int64{}
}
for rName, rQuant := range res.ScalarResources {
for rName, rQuant := range podResource.resource.ScalarResources {
n.Requested.ScalarResources[rName] += sign * rQuant
}
n.NonZeroRequested.MilliCPU += sign * non0CPU
n.NonZeroRequested.Memory += sign * non0Mem
n.NonZeroRequested.MilliCPU += sign * podResource.non0CPU
n.NonZeroRequested.Memory += sign * podResource.non0Mem
// Consume ports when pod added or release ports when pod removed.
n.updateUsedPorts(pod, sign > 0)
n.updatePVCRefCounts(pod, sign > 0)
n.updateUsedPorts(podInfo.Pod, sign > 0)
n.updatePVCRefCounts(podInfo.Pod, sign > 0)
n.Generation = nextGeneration()
}
@ -1103,20 +1126,25 @@ func getNonMissingContainerRequests(requests v1.ResourceList, podLevelResourcesS
}
func calculateResource(pod *v1.Pod) (Resource, int64, int64) {
requests := resourcehelper.PodRequests(pod, resourcehelper.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
func (pi *PodInfo) calculateResource() podResource {
if pi.cachedResource != nil {
return *pi.cachedResource
}
inPlacePodVerticalScalingEnabled := utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling)
podLevelResourcesEnabled := utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources)
requests := resourcehelper.PodRequests(pi.Pod, resourcehelper.PodResourcesOptions{
UseStatusResources: inPlacePodVerticalScalingEnabled,
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
SkipPodLevelResources: !podLevelResourcesEnabled,
})
isPodLevelResourcesSet := utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources) && resourcehelper.IsPodLevelRequestsSet(pod)
isPodLevelResourcesSet := podLevelResourcesEnabled && resourcehelper.IsPodLevelRequestsSet(pi.Pod)
nonMissingContainerRequests := getNonMissingContainerRequests(requests, isPodLevelResourcesSet)
non0Requests := requests
if len(nonMissingContainerRequests) > 0 {
non0Requests = resourcehelper.PodRequests(pod, resourcehelper.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
non0Requests = resourcehelper.PodRequests(pi.Pod, resourcehelper.PodResourcesOptions{
UseStatusResources: inPlacePodVerticalScalingEnabled,
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
SkipPodLevelResources: !podLevelResourcesEnabled,
NonMissingContainerRequests: nonMissingContainerRequests,
})
}
@ -1125,7 +1153,13 @@ func calculateResource(pod *v1.Pod) (Resource, int64, int64) {
var res Resource
res.Add(requests)
return res, non0CPU.MilliValue(), non0Mem.Value()
podResource := podResource{
resource: res,
non0CPU: non0CPU.MilliValue(),
non0Mem: non0Mem.Value(),
}
pi.cachedResource = &podResource
return podResource
}
// updateUsedPorts updates the UsedPorts of NodeInfo.

View File

@ -102,16 +102,16 @@ var (
InFlightEvents *metrics.GaugeVec
Goroutines *metrics.GaugeVec
// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
// in v1.31. Please use PodSchedulingSLIDuration instead.
PodSchedulingDuration *metrics.HistogramVec
PodSchedulingSLIDuration *metrics.HistogramVec
PodSchedulingAttempts *metrics.Histogram
FrameworkExtensionPointDuration *metrics.HistogramVec
PluginExecutionDuration *metrics.HistogramVec
PermitWaitDuration *metrics.HistogramVec
CacheSize *metrics.GaugeVec
PermitWaitDuration *metrics.HistogramVec
CacheSize *metrics.GaugeVec
// Deprecated: SchedulerCacheSize is deprecated,
// and will be removed at v1.34. Please use CacheSize instead.
SchedulerCacheSize *metrics.GaugeVec
unschedulableReasons *metrics.GaugeVec
PluginEvaluationTotal *metrics.CounterVec
@ -220,20 +220,6 @@ func InitMetrics() {
StabilityLevel: metrics.ALPHA,
}, []string{"operation"})
// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
// in v1.31. Please use PodSchedulingSLIDuration instead.
PodSchedulingDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "pod_scheduling_duration_seconds",
Help: "E2e latency for a pod being scheduled which may include multiple scheduling attempts.",
// Start with 10ms with the last bucket being [~88m, Inf).
Buckets: metrics.ExponentialBuckets(0.01, 2, 20),
StabilityLevel: metrics.STABLE,
DeprecatedVersion: "1.29.0",
},
[]string{"attempts"})
PodSchedulingSLIDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
@ -308,10 +294,19 @@ func InitMetrics() {
},
[]string{"result"})
SchedulerCacheSize = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduler_cache_size",
Help: "Number of nodes, pods, and assumed (bound) pods in the scheduler cache.",
StabilityLevel: metrics.ALPHA,
DeprecatedVersion: "1.33.0",
}, []string{"type"})
CacheSize = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduler_cache_size",
Name: "cache_size",
Help: "Number of nodes, pods, and assumed (bound) pods in the scheduler cache.",
StabilityLevel: metrics.ALPHA,
}, []string{"type"})
@ -359,7 +354,6 @@ func InitMetrics() {
PreemptionVictims,
PreemptionAttempts,
pendingPods,
PodSchedulingDuration,
PodSchedulingSLIDuration,
PodSchedulingAttempts,
FrameworkExtensionPointDuration,
@ -368,6 +362,7 @@ func InitMetrics() {
Goroutines,
PermitWaitDuration,
CacheSize,
SchedulerCacheSize,
unschedulableReasons,
PluginEvaluationTotal,
}

View File

@ -22,7 +22,7 @@ import (
"errors"
"fmt"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp" //nolint:depguard
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/scheme"

View File

@ -292,30 +292,19 @@ func (sched *Scheduler) bindingCycle(
return status
}
// Run "prebind" plugins.
if status := fwk.RunPreBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost); !status.IsSuccess() {
if status.IsRejected() {
fitErr := &framework.FitError{
NumAllNodes: 1,
Pod: assumedPodInfo.Pod,
Diagnosis: framework.Diagnosis{
NodeToStatus: framework.NewDefaultNodeToStatus(),
UnschedulablePlugins: sets.New(status.Plugin()),
},
}
fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, status)
return framework.NewStatus(status.Code()).WithError(fitErr)
}
return status
}
// Any failures after this point cannot lead to the Pod being considered unschedulable.
// We define the Pod as "unschedulable" only when Pods are rejected at specific extension points, and PreBind is the last one in the scheduling/binding cycle.
// We define the Pod as "unschedulable" only when Pods are rejected at specific extension points, and Permit is the last one in the scheduling/binding cycle.
// If a Pod fails on PreBind or Bind, it should be moved to BackoffQ for retry.
//
// We can call Done() here because
// we can free the cluster events stored in the scheduling queue sonner, which is worth for busy clusters memory consumption wise.
// we can free the cluster events stored in the scheduling queue sooner, which is worthwhile for busy clusters in terms of memory consumption.
sched.SchedulingQueue.Done(assumedPod.UID)
// Run "prebind" plugins.
if status := fwk.RunPreBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost); !status.IsSuccess() {
return status
}
// Run "bind" plugins.
if status := sched.bind(ctx, fwk, assumedPod, scheduleResult.SuggestedHost, state); !status.IsSuccess() {
return status
@ -326,7 +315,6 @@ func (sched *Scheduler) bindingCycle(
metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(assumedPodInfo.Attempts))
if assumedPodInfo.InitialAttemptTimestamp != nil {
metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
}
// Run "postbind" plugins.
@ -1098,10 +1086,11 @@ func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framewo
msg := truncateMessage(errMsg)
fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)
if err := updatePod(ctx, sched.client, pod, &v1.PodCondition{
Type: v1.PodScheduled,
Status: v1.ConditionFalse,
Reason: reason,
Message: errMsg,
Type: v1.PodScheduled,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(&pod.Status, pod.Generation, v1.PodScheduled),
Status: v1.ConditionFalse,
Reason: reason,
Message: errMsg,
}, nominatingInfo); err != nil {
logger.Error(err, "Error updating pod", "pod", klog.KObj(pod))
}

View File

@ -33,6 +33,7 @@ import (
clientset "k8s.io/client-go/kubernetes"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker"
"k8s.io/klog/v2"
configv1 "k8s.io/kube-scheduler/config/v1"
"k8s.io/kubernetes/pkg/features"
@ -50,6 +51,7 @@ import (
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/profile"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
"k8s.io/utils/clock"
)
const (
@ -116,6 +118,7 @@ func (sched *Scheduler) applyDefaultHandlers() {
}
type schedulerOptions struct {
clock clock.WithTicker
componentConfigVersion string
kubeConfig *restclient.Config
// Overridden by profile level percentageOfNodesToScore if set in v1.
@ -227,6 +230,13 @@ func WithExtenders(e ...schedulerapi.Extender) Option {
}
}
// WithClock returns an Option that stores the supplied clock in the scheduler
// options; that clock is later handed to the scheduling queue. When this
// option is not used, the default options carry clock.RealClock.
func WithClock(clock clock.WithTicker) Option {
	setClock := func(opts *schedulerOptions) {
		opts.clock = clock
	}
	return setClock
}
// FrameworkCapturer is used for registering a notify function in building framework.
type FrameworkCapturer func(schedulerapi.KubeSchedulerProfile)
@ -238,6 +248,7 @@ func WithBuildFrameworkCapturer(fc FrameworkCapturer) Option {
}
var defaultSchedulerOptions = schedulerOptions{
clock: clock.RealClock{},
percentageOfNodesToScore: schedulerapi.DefaultPercentageOfNodesToScore,
podInitialBackoffSeconds: int64(internalqueue.DefaultPodInitialBackoffDuration.Seconds()),
podMaxBackoffSeconds: int64(internalqueue.DefaultPodMaxBackoffDuration.Seconds()),
@ -297,11 +308,27 @@ func New(ctx context.Context,
waitingPods := frameworkruntime.NewWaitingPodsMap()
var resourceClaimCache *assumecache.AssumeCache
var resourceSliceTracker *resourceslicetracker.Tracker
var draManager framework.SharedDRAManager
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
resourceClaimInformer := informerFactory.Resource().V1beta1().ResourceClaims().Informer()
resourceClaimCache = assumecache.NewAssumeCache(logger, resourceClaimInformer, "ResourceClaim", "", nil)
draManager = dynamicresources.NewDRAManager(ctx, resourceClaimCache, informerFactory)
resourceSliceTrackerOpts := resourceslicetracker.Options{
EnableDeviceTaints: utilfeature.DefaultFeatureGate.Enabled(features.DRADeviceTaints),
SliceInformer: informerFactory.Resource().V1beta1().ResourceSlices(),
KubeClient: client,
}
// If device taints are disabled, the additional informers are not needed and
// the tracker turns into a simple wrapper around the slice informer.
if resourceSliceTrackerOpts.EnableDeviceTaints {
resourceSliceTrackerOpts.TaintInformer = informerFactory.Resource().V1alpha3().DeviceTaintRules()
resourceSliceTrackerOpts.ClassInformer = informerFactory.Resource().V1beta1().DeviceClasses()
}
resourceSliceTracker, err = resourceslicetracker.StartTracker(ctx, resourceSliceTrackerOpts)
if err != nil {
return nil, fmt.Errorf("couldn't start resource slice tracker: %w", err)
}
draManager = dynamicresources.NewDRAManager(ctx, resourceClaimCache, resourceSliceTracker, informerFactory)
}
profiles, err := profile.NewMap(ctx, options.profiles, registry, recorderFactory,
@ -343,6 +370,7 @@ func New(ctx context.Context,
podQueue := internalqueue.NewSchedulingQueue(
profiles[options.profiles[0].SchedulerName].QueueSortFunc(),
informerFactory,
internalqueue.WithClock(options.clock),
internalqueue.WithPodInitialBackoffDuration(time.Duration(options.podInitialBackoffSeconds)*time.Second),
internalqueue.WithPodMaxBackoffDuration(time.Duration(options.podMaxBackoffSeconds)*time.Second),
internalqueue.WithPodLister(podLister),
@ -378,7 +406,7 @@ func New(ctx context.Context,
sched.NextPod = podQueue.Pop
sched.applyDefaultHandlers()
if err = addAllEventHandlers(sched, informerFactory, dynInformerFactory, resourceClaimCache, unionedGVKs(queueingHintsPerProfile)); err != nil {
if err = addAllEventHandlers(sched, informerFactory, dynInformerFactory, resourceClaimCache, resourceSliceTracker, unionedGVKs(queueingHintsPerProfile)); err != nil {
return nil, fmt.Errorf("adding event handlers: %w", err)
}