rebase: update replaced k8s.io modules to v0.33.0

Signed-off-by: Niels de Vos <ndevos@ibm.com>
This commit is contained in:
Niels de Vos
2025-05-07 13:13:33 +02:00
committed by mergify[bot]
parent dd77e72800
commit 107407b44b
1723 changed files with 65035 additions and 175239 deletions

View File

@@ -757,4 +757,12 @@ func (cache *cacheImpl) updateMetrics() {
metrics.CacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
metrics.CacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
metrics.CacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
// We intentionally keep them during the deprecation period and will remove them in v1.34.
//nolint:staticcheck
metrics.SchedulerCacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
//nolint:staticcheck
metrics.SchedulerCacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
//nolint:staticcheck
metrics.SchedulerCacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
}
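The hunk above double-reports the cache sizes: each value is written to the new CacheSize metric and to the deprecated SchedulerCacheSize until the latter's planned removal in v1.34. The mirroring pattern looks roughly like the following client_golang sketch (metric names here are hypothetical, not the scheduler's actual definitions):

package main

import "github.com/prometheus/client_golang/prometheus"

var (
	cacheSize = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "cache_size"}, []string{"type"})
	// Deprecated mirror: kept during the deprecation window, then dropped.
	schedulerCacheSize = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "scheduler_cache_size"}, []string{"type"})
)

// setBoth writes the same value under the new and the deprecated name,
// so existing dashboards keep working while they migrate.
func setBoth(kind string, n int) {
	v := float64(n)
	cacheSize.WithLabelValues(kind).Set(v)
	schedulerCacheSize.WithLabelValues(kind).Set(v)
}

func main() {
	prometheus.MustRegister(cacheSize, schedulerCacheSize)
	setBoth("pods", 42)
}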

View File

@@ -20,6 +20,7 @@ import (
"container/list"
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
@@ -61,14 +62,63 @@ type activeQueuer interface {
// underLock() method should be used to protect these methods.
type unlockedActiveQueuer interface {
unlockedActiveQueueReader
AddOrUpdate(pInfo *framework.QueuedPodInfo)
// add adds a new pod to the activeQ.
// The event should show which event triggered this addition and is used for the metric recording.
// This method should be called in activeQueue.underLock().
add(pInfo *framework.QueuedPodInfo, event string)
}
// unlockedActiveQueueReader defines activeQ read-only methods that are not protected by the lock itself.
// underLock() or underRLock() method should be used to protect these methods.
type unlockedActiveQueueReader interface {
Get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
Has(pInfo *framework.QueuedPodInfo) bool
// get returns the pod matching pInfo inside the activeQ.
// Returns false if the pInfo doesn't exist in the queue.
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
// has returns whether pInfo exists in the queue.
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
has(pInfo *framework.QueuedPodInfo) bool
}
// unlockedActiveQueue defines activeQ methods that are not protected by the lock itself.
// activeQueue.underLock() or activeQueue.underRLock() method should be used to protect these methods.
type unlockedActiveQueue struct {
queue *heap.Heap[*framework.QueuedPodInfo]
}
func newUnlockedActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo]) *unlockedActiveQueue {
return &unlockedActiveQueue{
queue: queue,
}
}
// add adds a new pod to the activeQ.
// The event should show which event triggered this addition and is used for the metric recording.
// This method should be called in activeQueue.underLock().
func (uaq *unlockedActiveQueue) add(pInfo *framework.QueuedPodInfo, event string) {
uaq.queue.AddOrUpdate(pInfo)
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
}
// get returns the pod matching pInfo inside the activeQ.
// Returns false if the pInfo doesn't exist in the queue.
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
func (uaq *unlockedActiveQueue) get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool) {
return uaq.queue.Get(pInfo)
}
// has returns whether pInfo exists in the queue.
// This method should be called in activeQueue.underLock() or activeQueue.underRLock().
func (uaq *unlockedActiveQueue) has(pInfo *framework.QueuedPodInfo) bool {
return uaq.queue.Has(pInfo)
}
// backoffQPopper defines the methods used to pop from the backoffQ when the activeQ is empty.
type backoffQPopper interface {
// popBackoff pops the pInfo from the podBackoffQ.
popBackoff() (*framework.QueuedPodInfo, error)
// lenBackoff returns the length of the podBackoffQ.
lenBackoff() int
}
// activeQueue implements activeQueuer. All of the fields have to be protected using the lock.
@@ -77,15 +127,21 @@ type activeQueue struct {
// It protects activeQ, inFlightPods, inFlightEvents, schedulingCycle and closed fields.
// Caution: DO NOT take "SchedulingQueue.lock" after taking "lock".
// You should always take "SchedulingQueue.lock" first, otherwise the queue could end up in deadlock.
// "lock" should not be taken after taking "nLock".
// Correct locking order is: SchedulingQueue.lock > lock > nominator.nLock.
// "lock" should not be taken after taking "backoffQueue.lock" or "nominator.nLock".
// Correct locking order is: SchedulingQueue.lock > lock > backoffQueue.lock > nominator.nLock.
lock sync.RWMutex
// activeQ is heap structure that scheduler actively looks at to find pods to
// schedule. Head of heap is the highest priority pod.
queue *heap.Heap[*framework.QueuedPodInfo]
// unlockedQueue is a wrapper of queue providing methods that are not locked themselves
// and can be used in the underLock() or underRLock().
unlockedQueue *unlockedActiveQueue
// cond is a condition that is notified when the pod is added to activeQ.
// When SchedulerPopFromBackoffQ feature is enabled,
// condition is also notified when the pod is added to backoffQ.
// It is used with lock.
cond sync.Cond
@@ -125,15 +181,21 @@ type activeQueue struct {
isSchedulingQueueHintEnabled bool
metricsRecorder metrics.MetricAsyncRecorder
// backoffQPopper is used to pop from backoffQ when activeQ is empty.
// It is non-nil only when SchedulerPopFromBackoffQ feature is enabled.
backoffQPopper backoffQPopper
}
func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueueHintEnabled bool, metricRecorder metrics.MetricAsyncRecorder) *activeQueue {
func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueueHintEnabled bool, metricRecorder metrics.MetricAsyncRecorder, backoffQPopper backoffQPopper) *activeQueue {
aq := &activeQueue{
queue: queue,
inFlightPods: make(map[types.UID]*list.Element),
inFlightEvents: list.New(),
isSchedulingQueueHintEnabled: isSchedulingQueueHintEnabled,
metricsRecorder: metricRecorder,
unlockedQueue: newUnlockedActiveQueue(queue),
backoffQPopper: backoffQPopper,
}
aq.cond.L = &aq.lock
@@ -146,7 +208,7 @@ func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueu
func (aq *activeQueue) underLock(fn func(unlockedActiveQ unlockedActiveQueuer)) {
aq.lock.Lock()
defer aq.lock.Unlock()
fn(aq.queue)
fn(aq.unlockedQueue)
}
// underRLock runs the fn function under the lock.RLock.
@@ -155,7 +217,7 @@ func (aq *activeQueue) underLock(fn func(unlockedActiveQ unlockedActiveQueuer))
func (aq *activeQueue) underRLock(fn func(unlockedActiveQ unlockedActiveQueueReader)) {
aq.lock.RLock()
defer aq.lock.RUnlock()
fn(aq.queue)
fn(aq.unlockedQueue)
}
// update updates the pod in activeQ if oldPodInfo is already in the queue.
@@ -191,7 +253,13 @@ func (aq *activeQueue) pop(logger klog.Logger) (*framework.QueuedPodInfo, error)
}
func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo, error) {
var pInfo *framework.QueuedPodInfo
for aq.queue.Len() == 0 {
// backoffQPopper is non-nil only if SchedulerPopFromBackoffQ feature is enabled.
// In case of non-empty backoffQ, try popping from there.
if aq.backoffQPopper != nil && aq.backoffQPopper.lenBackoff() != 0 {
break
}
// When the queue is empty, an invocation of Pop() blocks until a new item is enqueued.
// When Close() is called, closed is set and the condition is broadcast,
// which causes this loop to continue and return from the Pop().
@@ -203,9 +271,18 @@ func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo
}
pInfo, err := aq.queue.Pop()
if err != nil {
return nil, err
if aq.backoffQPopper == nil {
return nil, err
}
// Try to pop from backoffQ when activeQ is empty.
pInfo, err = aq.backoffQPopper.popBackoff()
if err != nil {
return nil, err
}
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", framework.PopFromBackoffQ).Inc()
}
pInfo.Attempts++
pInfo.BackoffExpiration = time.Time{}
// In flight, no concurrent events yet.
if aq.isSchedulingQueueHintEnabled {
// If the pod is already in the map, we shouldn't overwrite the inFlightPods otherwise it'd lead to a memory leak.
@@ -354,6 +431,12 @@ func (aq *activeQueue) done(pod types.UID) {
aq.lock.Lock()
defer aq.lock.Unlock()
aq.unlockedDone(pod)
}
// unlockedDone is used by the activeQueue internally and doesn't take the lock itself.
// It assumes the lock is already taken outside before the method is called.
func (aq *activeQueue) unlockedDone(pod types.UID) {
inFlightPod, ok := aq.inFlightPods[pod]
if !ok {
// This Pod is already done()ed.
@@ -398,15 +481,15 @@ func (aq *activeQueue) done(pod types.UID) {
// close closes the activeQueue.
func (aq *activeQueue) close() {
aq.lock.Lock()
defer aq.lock.Unlock()
// We should call done() for all in-flight pods to clean up the inFlightEvents metrics.
// It's safe even if the binding cycle running asynchronously calls done() afterwards;
// done() will just be a no-op.
for pod := range aq.inFlightPods {
aq.done(pod)
aq.unlockedDone(pod)
}
aq.lock.Lock()
aq.closed = true
aq.lock.Unlock()
}
// broadcast notifies the pop() operation that new pod(s) were added to the activeQueue.
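Together with the condition-variable notes above, the unlockedPop() change means the pop loop now blocks only while both the activeQ and (when SchedulerPopFromBackoffQ is enabled) the backoffQ are empty. A minimal, self-contained sketch of that pattern, with plain slices and strings standing in for the heaps and framework.QueuedPodInfo:

package main

import (
	"fmt"
	"sync"
)

// twoQueues mimics the activeQ-with-backoffQ-fallback shape: pop() serves
// the primary queue first and falls back to the secondary one, and it only
// waits on the condition while both are empty.
type twoQueues struct {
	mu       sync.Mutex
	cond     *sync.Cond
	primary  []string // stands in for activeQ
	fallback []string // stands in for backoffQ
	closed   bool
}

func newTwoQueues() *twoQueues {
	q := &twoQueues{}
	q.cond = sync.NewCond(&q.mu)
	return q
}

func (q *twoQueues) push(toPrimary bool, v string) {
	q.mu.Lock()
	defer q.mu.Unlock()
	if toPrimary {
		q.primary = append(q.primary, v)
	} else {
		q.fallback = append(q.fallback, v)
	}
	q.cond.Broadcast() // like activeQueue.broadcast()
}

func (q *twoQueues) pop() (string, bool) {
	q.mu.Lock()
	defer q.mu.Unlock()
	for len(q.primary) == 0 {
		if len(q.fallback) > 0 {
			break // don't block: the fallback can serve this pop
		}
		if q.closed {
			return "", false
		}
		q.cond.Wait()
	}
	if len(q.primary) > 0 {
		v := q.primary[0]
		q.primary = q.primary[1:]
		return v, true
	}
	v := q.fallback[0]
	q.fallback = q.fallback[1:]
	return v, true
}

func main() {
	q := newTwoQueues()
	q.push(false, "pod-in-backoff")
	v, _ := q.pop() // served from the fallback without blocking
	fmt.Println(v)
}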

View File

@@ -0,0 +1,405 @@
/*
Copyright 2025 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/backend/heap"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/utils/clock"
)
// backoffQOrderingWindowDuration is a duration of an ordering window in the podBackoffQ.
// In each window, represented as a whole second, pods are ordered by priority.
// It is the same as the interval of flushing the pods from the podBackoffQ to the activeQ, so that whole windows are flushed there at once.
// This works only if PopFromBackoffQ feature is enabled.
// See the KEP-5142 (http://kep.k8s.io/5142) for rationale.
const backoffQOrderingWindowDuration = time.Second
// backoffQueuer is a wrapper for backoffQ related operations.
// Its methods that rely on the queues take the lock inside.
type backoffQueuer interface {
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
// If the pod backoff time is in the current ordering window, it should still be backing off.
isPodBackingoff(podInfo *framework.QueuedPodInfo) bool
// popAllBackoffCompleted pops all pods from podBackoffQ and podErrorBackoffQ that completed backoff.
popAllBackoffCompleted(logger klog.Logger) []*framework.QueuedPodInfo
// podInitialBackoffDuration returns initial backoff duration that pod can get.
podInitialBackoffDuration() time.Duration
// podMaxBackoffDuration returns maximum backoff duration that pod can get.
podMaxBackoffDuration() time.Duration
// waitUntilAlignedWithOrderingWindow waits until the time reaches a multiple of backoffQOrderingWindowDuration.
// It then runs the f function at the backoffQOrderingWindowDuration interval using a ticker.
// It's important to align the flushing time, because podBackoffQ's ordering is based on the windows
// and whole windows have to be flushed at one time without a visible latency.
waitUntilAlignedWithOrderingWindow(f func(), stopCh <-chan struct{})
// add adds the pInfo to backoffQueue.
// The event should show which event triggered this addition and is used for the metric recording.
// It also ensures that pInfo is not in both queues.
add(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string)
// update updates the pod in backoffQueue if oldPodInfo is already in the queue.
// It returns new pod info if updated, nil otherwise.
update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo
// delete deletes the pInfo from backoffQueue.
// It returns true if the pod was deleted.
delete(pInfo *framework.QueuedPodInfo) bool
// get returns the pInfo matching given pInfoLookup, if exists.
get(pInfoLookup *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
// has informs whether pInfo exists in the queue.
has(pInfo *framework.QueuedPodInfo) bool
// list returns all pods that are in the queue.
list() []*v1.Pod
// len returns length of the queue.
len() int
}
// backoffQueue implements backoffQueuer and wraps two queues inside,
// providing seamless access as if it were one queue.
type backoffQueue struct {
// lock synchronizes all operations related to backoffQ.
// It protects both podBackoffQ and podErrorBackoffQ.
// Caution: DO NOT take "SchedulingQueue.lock" or "activeQueue.lock" after taking "lock".
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" first, otherwise the queue could end up in deadlock.
// "lock" should not be taken after taking "nominator.nLock".
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock > lock > nominator.nLock.
lock sync.RWMutex
clock clock.WithTicker
// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
// are popped from this heap before the scheduler looks at activeQ
podBackoffQ *heap.Heap[*framework.QueuedPodInfo]
// podErrorBackoffQ is a heap ordered by error backoff expiry. Pods which have completed backoff
// are popped from this heap before the scheduler looks at activeQ
podErrorBackoffQ *heap.Heap[*framework.QueuedPodInfo]
podInitialBackoff time.Duration
podMaxBackoff time.Duration
// activeQLessFn is used as a tie-breaking less function when two backoff times are equal,
// when the SchedulerPopFromBackoffQ feature is enabled.
activeQLessFn framework.LessFunc
// isPopFromBackoffQEnabled indicates whether the feature gate SchedulerPopFromBackoffQ is enabled.
isPopFromBackoffQEnabled bool
}
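The locking-order comments here, in activeQueue, and in nominator all describe one global hierarchy: SchedulingQueue.lock > activeQueue.lock > backoffQueue.lock > nominator.nLock. A toy illustration of why one fixed order prevents deadlock, with hypothetical locks a and b standing in for any two levels of that hierarchy:

package main

import "sync"

var a, b sync.Mutex // e.g., activeQueue.lock and backoffQueue.lock

// follow respects the hierarchy: a is always taken before b.
func follow() {
	a.Lock()
	b.Lock()
	b.Unlock()
	a.Unlock()
}

// invert takes the locks in the opposite order. If follow() and invert()
// ran concurrently, each goroutine could hold one lock while waiting
// forever for the other, which is the deadlock these comments guard against.
func invert() {
	b.Lock()
	a.Lock()
	a.Unlock()
	b.Unlock()
}

func main() {
	follow()
	invert() // safe here only because it runs after follow() has finished
}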
func newBackoffQueue(clock clock.WithTicker, podInitialBackoffDuration time.Duration, podMaxBackoffDuration time.Duration, activeQLessFn framework.LessFunc, popFromBackoffQEnabled bool) *backoffQueue {
bq := &backoffQueue{
clock: clock,
podInitialBackoff: podInitialBackoffDuration,
podMaxBackoff: podMaxBackoffDuration,
isPopFromBackoffQEnabled: popFromBackoffQEnabled,
activeQLessFn: activeQLessFn,
}
podBackoffQLessFn := bq.lessBackoffCompleted
if popFromBackoffQEnabled {
podBackoffQLessFn = bq.lessBackoffCompletedWithPriority
}
bq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, podBackoffQLessFn, metrics.NewBackoffPodsRecorder())
bq.podErrorBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, bq.lessBackoffCompleted, metrics.NewBackoffPodsRecorder())
return bq
}
// podInitialBackoffDuration returns initial backoff duration that pod can get.
func (bq *backoffQueue) podInitialBackoffDuration() time.Duration {
return bq.podInitialBackoff
}
// podMaxBackoffDuration returns maximum backoff duration that pod can get.
func (bq *backoffQueue) podMaxBackoffDuration() time.Duration {
return bq.podMaxBackoff
}
// alignToWindow truncates the provided time to the podBackoffQ ordering window.
// It returns the lowest possible timestamp in the window.
func (bq *backoffQueue) alignToWindow(t time.Time) time.Time {
if !bq.isPopFromBackoffQEnabled {
return t
}
return t.Truncate(backoffQOrderingWindowDuration)
}
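Since time.Time.Truncate rounds down to a multiple of the given duration, two backoff expirations that land in the same one-second window compare as equal after alignToWindow, and the ordering then falls through to the priority tie-breaker defined below. A quick standalone check of that behavior:

package main

import (
	"fmt"
	"time"
)

func main() {
	window := time.Second // same value as backoffQOrderingWindowDuration
	base := time.Date(2025, 5, 7, 13, 0, 0, 0, time.UTC)
	a := base.Add(120 * time.Millisecond).Truncate(window)
	b := base.Add(870 * time.Millisecond).Truncate(window)
	c := base.Add(1050 * time.Millisecond).Truncate(window)
	fmt.Println(a.Equal(b)) // true: both land in the same 1s window
	fmt.Println(a.Equal(c)) // false: c falls into the next window
}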
// waitUntilAlignedWithOrderingWindow waits until the time reaches a multiple of backoffQOrderingWindowDuration.
// It then runs the f function at the backoffQOrderingWindowDuration interval using a ticker.
// It's important to align the flushing time, because podBackoffQ's ordering is based on the windows
// and whole windows have to be flushed at one time without a visible latency.
func (bq *backoffQueue) waitUntilAlignedWithOrderingWindow(f func(), stopCh <-chan struct{}) {
now := bq.clock.Now()
// Wait until the time reaches a multiple of backoffQOrderingWindowDuration.
durationToNextWindow := bq.alignToWindow(now.Add(backoffQOrderingWindowDuration)).Sub(now)
timer := bq.clock.NewTimer(durationToNextWindow)
select {
case <-stopCh:
timer.Stop()
return
case <-timer.C():
}
// Run a ticker to make sure the invocations of f function
// are aligned with the backoffQ's ordering window.
ticker := bq.clock.NewTicker(backoffQOrderingWindowDuration)
for {
select {
case <-stopCh:
return
default:
}
f()
// NOTE: b/c there is no priority selection in golang
// it is possible for this to race, meaning we could
// trigger ticker.C and stopCh, and ticker.C select falls through.
// In order to mitigate we re-check stopCh at the beginning
// of every loop to prevent extra executions of f().
select {
case <-stopCh:
ticker.Stop()
return
case <-ticker.C():
}
}
}
// lessBackoffCompletedWithPriority is a less function of podBackoffQ if PopFromBackoffQ feature is enabled.
// It orders the pods in the same backoff ordering window the same way the activeQ does, to improve the popping order from the backoffQ when the activeQ is empty.
func (bq *backoffQueue) lessBackoffCompletedWithPriority(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
bo1 := bq.getBackoffTime(pInfo1)
bo2 := bq.getBackoffTime(pInfo2)
if !bo1.Equal(bo2) {
return bo1.Before(bo2)
}
// If the backoff time is the same, sort the pod in the same manner as activeQ does.
return bq.activeQLessFn(pInfo1, pInfo2)
}
// lessBackoffCompleted is a less function of podErrorBackoffQ.
func (bq *backoffQueue) lessBackoffCompleted(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
bo1 := bq.getBackoffTime(pInfo1)
bo2 := bq.getBackoffTime(pInfo2)
return bo1.Before(bo2)
}
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
// If the pod backoff time is in the current ordering window, it should still be backing off.
func (bq *backoffQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
boTime := bq.getBackoffTime(podInfo)
// Don't use After, because when the windows are equal we want to return true.
return !boTime.Before(bq.alignToWindow(bq.clock.Now()))
}
// getBackoffTime returns the time that podInfo completes backoff.
// It caches the result in podInfo.BackoffExpiration and returns this value in subsequent calls.
// The cache will be cleared when this pod is popped from the scheduling queue again (i.e., at activeQ's pop),
// because of the fact that the backoff time is calculated based on podInfo.Attempts,
// which doesn't get changed until the pod's scheduling is retried.
func (bq *backoffQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
if podInfo.Attempts == 0 {
// Don't store the backoff expiration if the duration is 0,
// so that isPodBackingoff correctly lets a pod that hasn't been tried at all skip backoff.
return time.Time{}
}
if podInfo.BackoffExpiration.IsZero() {
duration := bq.calculateBackoffDuration(podInfo)
podInfo.BackoffExpiration = bq.alignToWindow(podInfo.Timestamp.Add(duration))
}
return podInfo.BackoffExpiration
}
// calculateBackoffDuration is a helper function for calculating the backoffDuration
// based on the number of attempts the pod has made.
func (bq *backoffQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
if podInfo.Attempts == 0 {
// When the Pod hasn't experienced any scheduling attempts,
// it isn't obliged to get a backoff penalty at all.
return 0
}
duration := bq.podInitialBackoff
for i := 1; i < podInfo.Attempts; i++ {
// Use subtraction instead of addition or multiplication to avoid overflow.
if duration > bq.podMaxBackoff-duration {
return bq.podMaxBackoff
}
duration += duration
}
return duration
}
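To make the doubling concrete, assume (for illustration only) an initial backoff of 1s and a maximum of 10s: attempts 1 through 5 then yield 1s, 2s, 4s, 8s, 10s, and the duration > max-duration guard caps the result without ever overflowing time.Duration. The same loop, extracted into a runnable form:

package main

import (
	"fmt"
	"time"
)

// backoffFor mirrors calculateBackoffDuration: exponential doubling capped
// at max, using subtraction instead of addition to avoid overflow.
func backoffFor(attempts int, initial, max time.Duration) time.Duration {
	if attempts == 0 {
		return 0
	}
	d := initial
	for i := 1; i < attempts; i++ {
		if d > max-d { // doubling would exceed (or overflow past) max
			return max
		}
		d += d
	}
	return d
}

func main() {
	for attempts := 1; attempts <= 5; attempts++ {
		// Prints 1s, 2s, 4s, 8s, 10s with the assumed 1s/10s settings.
		fmt.Println(attempts, backoffFor(attempts, time.Second, 10*time.Second))
	}
}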
func (bq *backoffQueue) popAllBackoffCompletedWithQueue(logger klog.Logger, queue *heap.Heap[*framework.QueuedPodInfo]) []*framework.QueuedPodInfo {
var poppedPods []*framework.QueuedPodInfo
for {
pInfo, ok := queue.Peek()
if !ok || pInfo == nil {
break
}
pod := pInfo.Pod
if bq.isPodBackingoff(pInfo) {
break
}
_, err := queue.Pop()
if err != nil {
logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
break
}
poppedPods = append(poppedPods, pInfo)
}
return poppedPods
}
// popAllBackoffCompleted pops all pods from podBackoffQ and podErrorBackoffQ that completed backoff.
func (bq *backoffQueue) popAllBackoffCompleted(logger klog.Logger) []*framework.QueuedPodInfo {
bq.lock.Lock()
defer bq.lock.Unlock()
// Pop the completed pods from both queues.
return append(bq.popAllBackoffCompletedWithQueue(logger, bq.podBackoffQ), bq.popAllBackoffCompletedWithQueue(logger, bq.podErrorBackoffQ)...)
}
// add adds the pInfo to backoffQueue.
// The event should show which event triggered this addition and is used for the metric recording.
// It also ensures that pInfo is not in both queues.
func (bq *backoffQueue) add(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) {
bq.lock.Lock()
defer bq.lock.Unlock()
// If the pod has both empty unschedulable plugins and pending plugins,
// it means that it failed because of an error and should be moved to the podErrorBackoffQ.
if pInfo.UnschedulablePlugins.Len() == 0 && pInfo.PendingPlugins.Len() == 0 {
bq.podErrorBackoffQ.AddOrUpdate(pInfo)
// Ensure the pod is not in the podBackoffQ and report the error if it happens.
err := bq.podBackoffQ.Delete(pInfo)
if err == nil {
logger.Error(nil, "BackoffQueue add() was called with a pod that was already in the podBackoffQ", "pod", klog.KObj(pInfo.Pod))
return
}
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
return
}
bq.podBackoffQ.AddOrUpdate(pInfo)
// Ensure the pod is not in the podErrorBackoffQ and report the error if it happens.
err := bq.podErrorBackoffQ.Delete(pInfo)
if err == nil {
logger.Error(nil, "BackoffQueue add() was called with a pod that was already in the podErrorBackoffQ", "pod", klog.KObj(pInfo.Pod))
return
}
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
}
// update updates the pod in backoffQueue if oldPodInfo is already in the queue.
// It returns new pod info if updated, nil otherwise.
func (bq *backoffQueue) update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo {
bq.lock.Lock()
defer bq.lock.Unlock()
// If the pod is in the backoff queue, update it there.
if pInfo, exists := bq.podBackoffQ.Get(oldPodInfo); exists {
_ = pInfo.Update(newPod)
bq.podBackoffQ.AddOrUpdate(pInfo)
return pInfo
}
// If the pod is in the error backoff queue, update it there.
if pInfo, exists := bq.podErrorBackoffQ.Get(oldPodInfo); exists {
_ = pInfo.Update(newPod)
bq.podErrorBackoffQ.AddOrUpdate(pInfo)
return pInfo
}
return nil
}
// delete deletes the pInfo from backoffQueue.
// It returns true if the pod was deleted.
func (bq *backoffQueue) delete(pInfo *framework.QueuedPodInfo) bool {
bq.lock.Lock()
defer bq.lock.Unlock()
if bq.podBackoffQ.Delete(pInfo) == nil {
return true
}
return bq.podErrorBackoffQ.Delete(pInfo) == nil
}
// popBackoff pops the pInfo from the podBackoffQ.
// It returns an error if the queue is empty.
// This doesn't pop the pods from the podErrorBackoffQ.
func (bq *backoffQueue) popBackoff() (*framework.QueuedPodInfo, error) {
bq.lock.Lock()
defer bq.lock.Unlock()
return bq.podBackoffQ.Pop()
}
// get returns the pInfo matching given pInfoLookup, if exists.
func (bq *backoffQueue) get(pInfoLookup *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool) {
bq.lock.RLock()
defer bq.lock.RUnlock()
pInfo, exists := bq.podBackoffQ.Get(pInfoLookup)
if exists {
return pInfo, true
}
return bq.podErrorBackoffQ.Get(pInfoLookup)
}
// has informs whether pInfo exists in the queue.
func (bq *backoffQueue) has(pInfo *framework.QueuedPodInfo) bool {
bq.lock.RLock()
defer bq.lock.RUnlock()
return bq.podBackoffQ.Has(pInfo) || bq.podErrorBackoffQ.Has(pInfo)
}
// list returns all pods that are in the queue.
func (bq *backoffQueue) list() []*v1.Pod {
bq.lock.RLock()
defer bq.lock.RUnlock()
var result []*v1.Pod
for _, pInfo := range bq.podBackoffQ.List() {
result = append(result, pInfo.Pod)
}
for _, pInfo := range bq.podErrorBackoffQ.List() {
result = append(result, pInfo.Pod)
}
return result
}
// len returns length of the queue.
func (bq *backoffQueue) len() int {
bq.lock.RLock()
defer bq.lock.RUnlock()
return bq.podBackoffQ.Len() + bq.podErrorBackoffQ.Len()
}
// lenBackoff returns length of the podBackoffQ.
func (bq *backoffQueue) lenBackoff() int {
bq.lock.RLock()
defer bq.lock.RUnlock()
return bq.podBackoffQ.Len()
}

View File

@@ -35,10 +35,10 @@ import (
type nominator struct {
// nLock synchronizes all operations related to nominator.
// It should not be used anywhere else.
// Caution: DO NOT take ("SchedulingQueue.lock" or "activeQueue.lock") after taking "nLock".
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" first,
// Caution: DO NOT take ("SchedulingQueue.lock" or "activeQueue.lock" or "backoffQueue.lock") after taking "nLock".
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" and "backoffQueue.lock" first,
// otherwise the nominator could end up in deadlock.
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock > nLock.
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock = backoffQueue.lock > nLock.
nLock sync.RWMutex
// podLister is used to verify if the given pod is alive.

View File

@@ -132,6 +132,9 @@ type SchedulingQueue interface {
PendingPods() ([]*v1.Pod, string)
InFlightPods() []*v1.Pod
PodsInActiveQ() []*v1.Pod
// PodsInBackoffQ returns all the Pods in the backoffQ.
PodsInBackoffQ() []*v1.Pod
UnschedulablePods() []*v1.Pod
}
// NewSchedulingQueue initializes a priority queue as a new scheduling queue.
@@ -155,24 +158,18 @@ type PriorityQueue struct {
*nominator
stop chan struct{}
clock clock.Clock
clock clock.WithTicker
// lock takes precedence and should be taken first,
// before any other locks in the queue (activeQueue.lock or nominator.nLock).
// Correct locking order is: lock > activeQueue.lock > nominator.nLock.
// before any other locks in the queue (activeQueue.lock or backoffQueue.lock or nominator.nLock).
// Correct locking order is: lock > activeQueue.lock > backoffQueue.lock > nominator.nLock.
lock sync.RWMutex
// pod initial backoff duration.
podInitialBackoffDuration time.Duration
// pod maximum backoff duration.
podMaxBackoffDuration time.Duration
// the maximum time a pod can stay in the unschedulablePods.
podMaxInUnschedulablePodsDuration time.Duration
activeQ activeQueuer
// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
// are popped from this heap before the scheduler looks at activeQ
podBackoffQ *heap.Heap[*framework.QueuedPodInfo]
activeQ activeQueuer
backoffQ backoffQueuer
// unschedulablePods holds pods that have been tried and determined unschedulable.
unschedulablePods *UnschedulablePods
// moveRequestCycle caches the sequence number of scheduling cycle when we
@@ -195,6 +192,8 @@ type PriorityQueue struct {
// isSchedulingQueueHintEnabled indicates whether the feature gate for the scheduling queue is enabled.
isSchedulingQueueHintEnabled bool
// isPopFromBackoffQEnabled indicates whether the feature gate SchedulerPopFromBackoffQ is enabled.
isPopFromBackoffQEnabled bool
}
// QueueingHintFunction is the wrapper of QueueingHintFn that has PluginName.
@@ -213,7 +212,7 @@ type clusterEvent struct {
}
type priorityQueueOptions struct {
clock clock.Clock
clock clock.WithTicker
podInitialBackoffDuration time.Duration
podMaxBackoffDuration time.Duration
podMaxInUnschedulablePodsDuration time.Duration
@@ -228,7 +227,7 @@ type priorityQueueOptions struct {
type Option func(*priorityQueueOptions)
// WithClock sets clock for PriorityQueue, the default clock is clock.RealClock.
func WithClock(clock clock.Clock) Option {
func WithClock(clock clock.WithTicker) Option {
return func(o *priorityQueueOptions) {
o.clock = clock
}
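Narrowing WithClock from clock.Clock to clock.WithTicker follows from waitUntilAlignedWithOrderingWindow, which needs NewTimer and NewTicker on the injected clock. Tests can keep injecting a clock because FakeClock from k8s.io/utils/clock/testing implements WithTicker; roughly (the helper below is a sketch, not part of this patch):

import (
	"time"

	testingclock "k8s.io/utils/clock/testing"
)

// testQueueOptions shows that a fake clock still satisfies the narrowed
// WithClock signature: *testingclock.FakeClock provides NewTimer and
// NewTicker and therefore implements clock.WithTicker.
func testQueueOptions() []Option {
	fakeClock := testingclock.NewFakeClock(time.Now())
	return []Option{WithClock(fakeClock)}
}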
@@ -331,14 +330,14 @@ func NewPriorityQueue(
}
isSchedulingQueueHintEnabled := utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints)
isPopFromBackoffQEnabled := utilfeature.DefaultFeatureGate.Enabled(features.SchedulerPopFromBackoffQ)
backoffQ := newBackoffQueue(options.clock, options.podInitialBackoffDuration, options.podMaxBackoffDuration, lessFn, isPopFromBackoffQEnabled)
pq := &PriorityQueue{
clock: options.clock,
stop: make(chan struct{}),
podInitialBackoffDuration: options.podInitialBackoffDuration,
podMaxBackoffDuration: options.podMaxBackoffDuration,
podMaxInUnschedulablePodsDuration: options.podMaxInUnschedulablePodsDuration,
activeQ: newActiveQueue(heap.NewWithRecorder(podInfoKeyFunc, heap.LessFunc[*framework.QueuedPodInfo](lessFn), metrics.NewActivePodsRecorder()), isSchedulingQueueHintEnabled, options.metricsRecorder),
backoffQ: backoffQ,
unschedulablePods: newUnschedulablePods(metrics.NewUnschedulablePodsRecorder(), metrics.NewGatedPodsRecorder()),
preEnqueuePluginMap: options.preEnqueuePluginMap,
queueingHintMap: options.queueingHintMap,
@@ -346,19 +345,24 @@
pluginMetricsSamplePercent: options.pluginMetricsSamplePercent,
moveRequestCycle: -1,
isSchedulingQueueHintEnabled: isSchedulingQueueHintEnabled,
isPopFromBackoffQEnabled: isPopFromBackoffQEnabled,
}
pq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, pq.podsCompareBackoffCompleted, metrics.NewBackoffPodsRecorder())
var backoffQPopper backoffQPopper
if isPopFromBackoffQEnabled {
backoffQPopper = backoffQ
}
pq.activeQ = newActiveQueue(heap.NewWithRecorder(podInfoKeyFunc, heap.LessFunc[*framework.QueuedPodInfo](lessFn), metrics.NewActivePodsRecorder()), isSchedulingQueueHintEnabled, options.metricsRecorder, backoffQPopper)
pq.nsLister = informerFactory.Core().V1().Namespaces().Lister()
pq.nominator = newPodNominator(options.podLister)
return pq
}
// Run starts the goroutine to pump from podBackoffQ to activeQ
// Run starts the goroutine to pump from backoffQ to activeQ
func (p *PriorityQueue) Run(logger klog.Logger) {
go wait.Until(func() {
go p.backoffQ.waitUntilAlignedWithOrderingWindow(func() {
p.flushBackoffQCompleted(logger)
}, 1.0*time.Second, p.stop)
}, p.stop)
go wait.Until(func() {
p.flushUnschedulablePodsLeftover(logger)
}, 30*time.Second, p.stop)
@@ -553,25 +557,33 @@ func (p *PriorityQueue) runPreEnqueuePlugin(ctx context.Context, pl framework.Pr
return s
}
// moveToActiveQ tries to add pod to active queue and remove it from unschedulable and backoff queues.
// It returns 2 parameters:
// 1. a boolean flag to indicate whether the pod is added successfully.
// 2. an error for the caller to act on.
// moveToActiveQ tries to add the pod to the active queue.
// If the pod doesn't pass PreEnqueue plugins, it gets added to unschedulablePods instead.
// It returns a boolean flag to indicate whether the pod is added successfully.
func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) bool {
gatedBefore := pInfo.Gated
pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
// If SchedulerPopFromBackoffQ feature gate is enabled,
// PreEnqueue plugins were called when the pod was added to the backoffQ.
// There is no need to repeat them here when the pod is moved directly from the backoffQ.
if !p.isPopFromBackoffQEnabled || event != framework.BackoffComplete {
pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
}
added := false
p.activeQ.underLock(func(unlockedActiveQ unlockedActiveQueuer) {
if pInfo.Gated {
// Add the Pod to unschedulablePods if it's not passing PreEnqueuePlugins.
if unlockedActiveQ.Has(pInfo) {
if unlockedActiveQ.has(pInfo) {
return
}
if p.podBackoffQ.Has(pInfo) {
if p.backoffQ.has(pInfo) {
return
}
p.unschedulablePods.addOrUpdate(pInfo)
if p.unschedulablePods.get(pInfo.Pod) != nil {
return
}
p.unschedulablePods.addOrUpdate(pInfo, event)
logger.V(5).Info("Pod moved to an internal scheduling queue, because the pod is gated", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", unschedulablePods)
return
}
if pInfo.InitialAttemptTimestamp == nil {
@@ -579,13 +591,12 @@ func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.Queue
pInfo.InitialAttemptTimestamp = &now
}
unlockedActiveQ.AddOrUpdate(pInfo)
unlockedActiveQ.add(pInfo, event)
added = true
p.unschedulablePods.delete(pInfo.Pod, gatedBefore)
_ = p.podBackoffQ.Delete(pInfo) // Don't need to react when pInfo is not found.
p.backoffQ.delete(pInfo)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", activeQ)
metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
if event == framework.EventUnscheduledPodAdd.Label() || event == framework.EventUnscheduledPodUpdate.Label() {
p.AddNominatedPod(logger, pInfo.PodInfo, nil)
}
@@ -593,6 +604,28 @@ func (p *PriorityQueue) moveToActiveQ(logger klog.Logger, pInfo *framework.Queue
return added
}
// moveToBackoffQ tries to add the pod to the backoff queue.
// If SchedulerPopFromBackoffQ feature gate is enabled and the pod doesn't pass PreEnqueue plugins, it gets added to unschedulablePods instead.
// It returns a boolean flag to indicate whether the pod is added successfully.
func (p *PriorityQueue) moveToBackoffQ(logger klog.Logger, pInfo *framework.QueuedPodInfo, event string) bool {
// If SchedulerPopFromBackoffQ feature gate is enabled,
// PreEnqueue plugins are called when pods are inserted into the backoffQ,
// so that they don't have to be called again when pods are popped out.
if p.isPopFromBackoffQEnabled {
pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
if pInfo.Gated {
if p.unschedulablePods.get(pInfo.Pod) == nil {
p.unschedulablePods.addOrUpdate(pInfo, event)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", unschedulablePods)
}
return false
}
}
p.backoffQ.add(logger, pInfo, event)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event, "queue", backoffQ)
return true
}
// Add adds a pod to the active queue. It should be called only when a new pod
// is added so there is no chance the pod is already in active/unschedulable/backoff queues
func (p *PriorityQueue) Add(logger klog.Logger, pod *v1.Pod) {
@@ -641,10 +674,16 @@ func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
// If the pod doesn't belong to unschedulablePods or backoffQ, don't activate it.
// The pod can be already in activeQ.
var exists bool
pInfo, exists = p.podBackoffQ.Get(newQueuedPodInfoForLookup(pod))
pInfo, exists = p.backoffQ.get(newQueuedPodInfoForLookup(pod))
if !exists {
return false
}
// Delete pod from the backoffQ now to make sure it won't be popped from the backoffQ
// just before moving it to the activeQ
if deleted := p.backoffQ.delete(pInfo); !deleted {
// Pod was popped from the backoffQ in the meantime. Don't activate it.
return false
}
}
if pInfo == nil {
@@ -656,13 +695,6 @@ func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
return p.moveToActiveQ(logger, pInfo, framework.ForceActivate)
}
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
func (p *PriorityQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
boTime := p.getBackoffTime(podInfo)
return boTime.After(p.clock.Now())
}
// SchedulingCycle returns current scheduling cycle.
func (p *PriorityQueue) SchedulingCycle() int64 {
return p.activeQ.schedulingCycle()
@@ -712,7 +744,7 @@ func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger
// addUnschedulableIfNotPresentWithoutQueueingHint inserts a pod that cannot be scheduled into
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
// request, then the pod is put in `podBackoffQ`.
// request, then the pod is put in `backoffQ`.
// TODO: This function is called only when p.isSchedulingQueueHintEnabled is false,
// and this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
@@ -736,13 +768,14 @@ func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger,
// - No unschedulable plugins are associated with this Pod,
// meaning something unusual (a temporary failure on kube-apiserver, etc.) happened and this Pod gets moved back to the queue.
// In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
p.podBackoffQ.AddOrUpdate(pInfo)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", backoffQ)
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", framework.ScheduleAttemptFailure).Inc()
if added := p.moveToBackoffQ(logger, pInfo, framework.ScheduleAttemptFailure); added {
if p.isPopFromBackoffQEnabled {
p.activeQ.broadcast()
}
}
} else {
p.unschedulablePods.addOrUpdate(pInfo)
p.unschedulablePods.addOrUpdate(pInfo, framework.ScheduleAttemptFailure)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", unschedulablePods)
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", framework.ScheduleAttemptFailure).Inc()
}
return nil
@@ -751,7 +784,7 @@ func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger,
// AddUnschedulableIfNotPresent inserts a pod that cannot be scheduled into
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
// request, then the pod is put in `podBackoffQ`.
// request, then the pod is put in `backoffQ`.
func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
p.lock.Lock()
defer p.lock.Unlock()
@@ -767,7 +800,7 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *
if p.activeQ.has(pInfo) {
return fmt.Errorf("Pod %v is already present in the active queue", klog.KObj(pod))
}
if p.podBackoffQ.Has(pInfo) {
if p.backoffQ.has(pInfo) {
return fmt.Errorf("Pod %v is already present in the backoff queue", klog.KObj(pod))
}
@@ -792,7 +825,7 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *
// In this case, we try to requeue this Pod to activeQ/backoffQ.
queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, framework.ScheduleAttemptFailure)
logger.V(3).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", framework.ScheduleAttemptFailure, "queue", queue, "schedulingCycle", podSchedulingCycle, "hint", schedulingHint, "unschedulable plugins", rejectorPlugins)
if queue == activeQ {
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
// When the Pod is moved to activeQ, need to let p.cond know so that the Pod will be pop()ed out.
p.activeQ.broadcast()
}
@@ -805,25 +838,12 @@ func (p *PriorityQueue) flushBackoffQCompleted(logger klog.Logger) {
p.lock.Lock()
defer p.lock.Unlock()
activated := false
for {
pInfo, ok := p.podBackoffQ.Peek()
if !ok || pInfo == nil {
break
}
pod := pInfo.Pod
if p.isPodBackingoff(pInfo) {
break
}
_, err := p.podBackoffQ.Pop()
if err != nil {
logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
break
}
podsCompletedBackoff := p.backoffQ.popAllBackoffCompleted(logger)
for _, pInfo := range podsCompletedBackoff {
if added := p.moveToActiveQ(logger, pInfo, framework.BackoffComplete); added {
activated = true
}
}
if activated {
p.activeQ.broadcast()
}
@@ -928,10 +948,8 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
}
// If the pod is in the backoff queue, update it there.
if pInfo, exists := p.podBackoffQ.Get(oldPodInfo); exists {
_ = pInfo.Update(newPod)
if pInfo := p.backoffQ.update(newPod, oldPodInfo); pInfo != nil {
p.UpdateNominatedPod(logger, oldPod, pInfo.PodInfo)
p.podBackoffQ.AddOrUpdate(pInfo)
return
}
}
@@ -953,7 +971,7 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
logger.V(5).Info("Pod moved to an internal scheduling queue because the Pod is updated", "pod", klog.KObj(newPod), "event", evt.Label(), "queue", queue)
p.unschedulablePods.delete(pInfo.Pod, gated)
}
if queue == activeQ {
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
p.activeQ.broadcast()
break
}
@@ -961,21 +979,26 @@ func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) {
return
}
if isPodUpdated(oldPod, newPod) {
if p.isPodBackingoff(pInfo) {
p.podBackoffQ.AddOrUpdate(pInfo)
p.unschedulablePods.delete(pInfo.Pod, gated)
logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", framework.EventUnscheduledPodUpdate.Label(), "queue", backoffQ)
// Pod might have completed its backoff time while being in unschedulablePods,
// so we should check isPodBackingoff before moving the pod to backoffQ.
if p.backoffQ.isPodBackingoff(pInfo) {
if added := p.moveToBackoffQ(logger, pInfo, framework.EventUnscheduledPodUpdate.Label()); added {
p.unschedulablePods.delete(pInfo.Pod, gated)
if p.isPopFromBackoffQEnabled {
p.activeQ.broadcast()
}
}
return
}
if added := p.moveToActiveQ(logger, pInfo, framework.BackoffComplete); added {
if added := p.moveToActiveQ(logger, pInfo, framework.EventUnscheduledPodUpdate.Label()); added {
p.activeQ.broadcast()
}
return
}
// Pod update didn't make it schedulable, keep it in the unschedulable queue.
p.unschedulablePods.addOrUpdate(pInfo)
p.unschedulablePods.addOrUpdate(pInfo, framework.EventUnscheduledPodUpdate.Label())
return
}
// If pod is not in any of the queues, we put it in the active queue.
@@ -992,12 +1015,14 @@ func (p *PriorityQueue) Delete(pod *v1.Pod) {
defer p.lock.Unlock()
p.DeleteNominatedPodIfExists(pod)
pInfo := newQueuedPodInfoForLookup(pod)
if err := p.activeQ.delete(pInfo); err != nil {
// The item was probably not found in the activeQ.
p.podBackoffQ.Delete(pInfo)
if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
p.unschedulablePods.delete(pod, pInfo.Gated)
}
if err := p.activeQ.delete(pInfo); err == nil {
return
}
if deleted := p.backoffQ.delete(pInfo); deleted {
return
}
if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
p.unschedulablePods.delete(pod, pInfo.Gated)
}
}
@@ -1065,28 +1090,24 @@ func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(logger klog.Logger, event
// NOTE: this function assumes lock has been acquired in caller
func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, strategy queueingStrategy, event string) string {
if strategy == queueSkip {
p.unschedulablePods.addOrUpdate(pInfo)
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
p.unschedulablePods.addOrUpdate(pInfo, event)
return unschedulablePods
}
if strategy == queueAfterBackoff && p.isPodBackingoff(pInfo) {
p.podBackoffQ.AddOrUpdate(pInfo)
metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
return backoffQ
// Pod might have completed its backoff time while being in unschedulablePods,
// so we should check isPodBackingoff before moving the pod to backoffQ.
if strategy == queueAfterBackoff && p.backoffQ.isPodBackingoff(pInfo) {
if added := p.moveToBackoffQ(logger, pInfo, event); added {
return backoffQ
}
return unschedulablePods
}
// Reach here if schedulingHint is QueueImmediately, or schedulingHint is Queue but the pod is not backing off.
if added := p.moveToActiveQ(logger, pInfo, event); added {
return activeQ
}
if pInfo.Gated {
// In case the pod is gated, the Pod is pushed back to unschedulable Pods pool in moveToActiveQ.
return unschedulablePods
}
p.unschedulablePods.addOrUpdate(pInfo)
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", framework.ScheduleAttemptFailure).Inc()
// Pod is gated. We don't have to push it back to unschedulable queue, because moveToActiveQ should already have done that.
return unschedulablePods
}
@@ -1128,7 +1149,7 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn
p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated)
queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, event.Label())
logger.V(4).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event.Label(), "queue", queue, "hint", schedulingHint)
if queue == activeQ {
if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) {
activated = true
}
}
@@ -1180,6 +1201,20 @@ func (p *PriorityQueue) PodsInActiveQ() []*v1.Pod {
return p.activeQ.list()
}
// PodsInBackoffQ returns all the Pods in the backoffQ.
func (p *PriorityQueue) PodsInBackoffQ() []*v1.Pod {
return p.backoffQ.list()
}
// UnschedulablePods returns all the pods in unschedulable state.
func (p *PriorityQueue) UnschedulablePods() []*v1.Pod {
var result []*v1.Pod
for _, pInfo := range p.unschedulablePods.podInfoMap {
result = append(result, pInfo.Pod)
}
return result
}
var pendingPodsSummary = "activeQ:%v; backoffQ:%v; unschedulablePods:%v"
// GetPod searches for a pod in the activeQ, backoffQ, and unschedulablePods.
@@ -1197,7 +1232,7 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
},
},
}
if pInfo, ok = p.podBackoffQ.Get(pInfoLookup); ok {
if pInfo, ok = p.backoffQ.get(pInfoLookup); ok {
return pInfo, true
}
if pInfo = p.unschedulablePods.get(pInfoLookup.Pod); pInfo != nil {
@@ -1205,7 +1240,7 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
}
p.activeQ.underRLock(func(unlockedActiveQ unlockedActiveQueueReader) {
pInfo, ok = unlockedActiveQ.Get(pInfoLookup)
pInfo, ok = unlockedActiveQ.get(pInfoLookup)
})
return
}
@@ -1216,15 +1251,15 @@ func (p *PriorityQueue) GetPod(name, namespace string) (pInfo *framework.QueuedP
func (p *PriorityQueue) PendingPods() ([]*v1.Pod, string) {
p.lock.RLock()
defer p.lock.RUnlock()
result := p.activeQ.list()
result := p.PodsInActiveQ()
activeQLen := len(result)
for _, pInfo := range p.podBackoffQ.List() {
result = append(result, pInfo.Pod)
}
backoffQPods := p.PodsInBackoffQ()
backoffQLen := len(backoffQPods)
result = append(result, backoffQPods...)
for _, pInfo := range p.unschedulablePods.podInfoMap {
result = append(result, pInfo.Pod)
}
return result, fmt.Sprintf(pendingPodsSummary, activeQLen, p.podBackoffQ.Len(), len(p.unschedulablePods.podInfoMap))
return result, fmt.Sprintf(pendingPodsSummary, activeQLen, backoffQLen, len(p.unschedulablePods.podInfoMap))
}
// Note: this function assumes the caller locks both p.lock.RLock and p.activeQ.getLock().RLock.
@@ -1232,7 +1267,7 @@ func (p *PriorityQueue) nominatedPodToInfo(np podRef, unlockedActiveQ unlockedAc
pod := np.toPod()
pInfoLookup := newQueuedPodInfoForLookup(pod)
queuedPodInfo, exists := unlockedActiveQ.Get(pInfoLookup)
queuedPodInfo, exists := unlockedActiveQ.get(pInfoLookup)
if exists {
return queuedPodInfo.PodInfo
}
@@ -1242,7 +1277,7 @@ func (p *PriorityQueue) nominatedPodToInfo(np podRef, unlockedActiveQ unlockedAc
return queuedPodInfo.PodInfo
}
queuedPodInfo, exists = p.podBackoffQ.Get(pInfoLookup)
queuedPodInfo, exists = p.backoffQ.get(pInfoLookup)
if exists {
return queuedPodInfo.PodInfo
}
@@ -1276,12 +1311,6 @@ func (p *PriorityQueue) NominatedPodsForNode(nodeName string) []*framework.PodIn
return pods
}
func (p *PriorityQueue) podsCompareBackoffCompleted(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
bo1 := p.getBackoffTime(pInfo1)
bo2 := p.getBackoffTime(pInfo2)
return bo1.Before(bo2)
}
// newQueuedPodInfo builds a QueuedPodInfo object.
func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
now := p.clock.Now()
@@ -1296,33 +1325,6 @@ func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framew
}
}
// getBackoffTime returns the time that podInfo completes backoff
func (p *PriorityQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
duration := p.calculateBackoffDuration(podInfo)
backoffTime := podInfo.Timestamp.Add(duration)
return backoffTime
}
// calculateBackoffDuration is a helper function for calculating the backoffDuration
// based on the number of attempts the pod has made.
func (p *PriorityQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
if podInfo.Attempts == 0 {
// When the Pod hasn't experienced any scheduling attempts,
// they aren't obliged to get a backoff penalty at all.
return 0
}
duration := p.podInitialBackoffDuration
for i := 1; i < podInfo.Attempts; i++ {
// Use subtraction instead of addition or multiplication to avoid overflow.
if duration > p.podMaxBackoffDuration-duration {
return p.podMaxBackoffDuration
}
duration += duration
}
return duration
}
// UnschedulablePods holds pods that cannot be scheduled. This data structure
// is used to implement unschedulablePods.
type UnschedulablePods struct {
@@ -1335,7 +1337,8 @@ type UnschedulablePods struct {
}
// addOrUpdate adds a pod to the unschedulable podInfoMap.
func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
// The event should show which event triggered the addition and is used for the metric recording.
func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo, event string) {
podID := u.keyFunc(pInfo.Pod)
if _, exists := u.podInfoMap[podID]; !exists {
if pInfo.Gated && u.gatedRecorder != nil {
@@ -1343,6 +1346,7 @@ func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
} else if !pInfo.Gated && u.unschedulableRecorder != nil {
u.unschedulableRecorder.Inc()
}
metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
}
u.podInfoMap[podID] = pInfo
}