mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-01-26 06:39:30 +00:00
498 lines
19 KiB
Go
498 lines
19 KiB
Go
/*
|
|
Copyright 2014 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package scheduler
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
|
|
"k8s.io/api/core/v1"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/util/sets"
|
|
"k8s.io/apimachinery/pkg/util/wait"
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
|
clientset "k8s.io/client-go/kubernetes"
|
|
corelisters "k8s.io/client-go/listers/core/v1"
|
|
"k8s.io/client-go/tools/record"
|
|
"k8s.io/kubernetes/pkg/features"
|
|
"k8s.io/kubernetes/pkg/scheduler/algorithm"
|
|
"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
|
|
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
|
|
"k8s.io/kubernetes/pkg/scheduler/core"
|
|
"k8s.io/kubernetes/pkg/scheduler/metrics"
|
|
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
|
|
"k8s.io/kubernetes/pkg/scheduler/util"
|
|
"k8s.io/kubernetes/pkg/scheduler/volumebinder"
|
|
|
|
"github.com/golang/glog"
|
|
)
|
|
|
|
// Binder knows how to write a binding.
|
|
type Binder interface {
|
|
Bind(binding *v1.Binding) error
|
|
}
|
|
|
|
// PodConditionUpdater updates the condition of a pod based on the passed
|
|
// PodCondition
|
|
type PodConditionUpdater interface {
|
|
Update(pod *v1.Pod, podCondition *v1.PodCondition) error
|
|
}
|
|
|
|
// PodPreemptor has methods needed to delete a pod and to update
|
|
// annotations of the preemptor pod.
|
|
type PodPreemptor interface {
|
|
GetUpdatedPod(pod *v1.Pod) (*v1.Pod, error)
|
|
DeletePod(pod *v1.Pod) error
|
|
SetNominatedNodeName(pod *v1.Pod, nominatedNode string) error
|
|
RemoveNominatedNodeName(pod *v1.Pod) error
|
|
}
|
|
|
|
// Scheduler watches for new unscheduled pods. It attempts to find
|
|
// nodes that they fit on and writes bindings back to the api server.
|
|
type Scheduler struct {
|
|
config *Config
|
|
}
|
|
|
|
// StopEverything closes the scheduler config's StopEverything channel, to shut
|
|
// down the Scheduler.
|
|
func (sched *Scheduler) StopEverything() {
|
|
close(sched.config.StopEverything)
|
|
}
|
|
|
|
// Configurator defines I/O, caching, and other functionality needed to
|
|
// construct a new scheduler. An implementation of this can be seen in
|
|
// factory.go.
|
|
type Configurator interface {
|
|
GetPriorityFunctionConfigs(priorityKeys sets.String) ([]algorithm.PriorityConfig, error)
|
|
GetPriorityMetadataProducer() (algorithm.PriorityMetadataProducer, error)
|
|
GetPredicateMetadataProducer() (algorithm.PredicateMetadataProducer, error)
|
|
GetPredicates(predicateKeys sets.String) (map[string]algorithm.FitPredicate, error)
|
|
GetHardPodAffinitySymmetricWeight() int32
|
|
GetSchedulerName() string
|
|
MakeDefaultErrorFunc(backoff *util.PodBackoff, podQueue core.SchedulingQueue) func(pod *v1.Pod, err error)
|
|
|
|
// Needs to be exposed for things like integration tests where we want to make fake nodes.
|
|
GetNodeLister() corelisters.NodeLister
|
|
GetClient() clientset.Interface
|
|
GetScheduledPodLister() corelisters.PodLister
|
|
|
|
Create() (*Config, error)
|
|
CreateFromProvider(providerName string) (*Config, error)
|
|
CreateFromConfig(policy schedulerapi.Policy) (*Config, error)
|
|
CreateFromKeys(predicateKeys, priorityKeys sets.String, extenders []algorithm.SchedulerExtender) (*Config, error)
|
|
}
|
|
|
|
// Config is an implementation of the Scheduler's configured input data.
|
|
// TODO over time we should make this struct a hidden implementation detail of the scheduler.
|
|
type Config struct {
|
|
// It is expected that changes made via SchedulerCache will be observed
|
|
// by NodeLister and Algorithm.
|
|
SchedulerCache schedulercache.Cache
|
|
// Ecache is used for optimistically invalid affected cache items after
|
|
// successfully binding a pod
|
|
Ecache *core.EquivalenceCache
|
|
NodeLister algorithm.NodeLister
|
|
Algorithm algorithm.ScheduleAlgorithm
|
|
GetBinder func(pod *v1.Pod) Binder
|
|
// PodConditionUpdater is used only in case of scheduling errors. If we succeed
|
|
// with scheduling, PodScheduled condition will be updated in apiserver in /bind
|
|
// handler so that binding and setting PodCondition it is atomic.
|
|
PodConditionUpdater PodConditionUpdater
|
|
// PodPreemptor is used to evict pods and update pod annotations.
|
|
PodPreemptor PodPreemptor
|
|
|
|
// NextPod should be a function that blocks until the next pod
|
|
// is available. We don't use a channel for this, because scheduling
|
|
// a pod may take some amount of time and we don't want pods to get
|
|
// stale while they sit in a channel.
|
|
NextPod func() *v1.Pod
|
|
|
|
// WaitForCacheSync waits for scheduler cache to populate.
|
|
// It returns true if it was successful, false if the controller should shutdown.
|
|
WaitForCacheSync func() bool
|
|
|
|
// Error is called if there is an error. It is passed the pod in
|
|
// question, and the error
|
|
Error func(*v1.Pod, error)
|
|
|
|
// Recorder is the EventRecorder to use
|
|
Recorder record.EventRecorder
|
|
|
|
// Close this to shut down the scheduler.
|
|
StopEverything chan struct{}
|
|
|
|
// VolumeBinder handles PVC/PV binding for the pod.
|
|
VolumeBinder *volumebinder.VolumeBinder
|
|
}
|
|
|
|
// NewFromConfigurator returns a new scheduler that is created entirely by the Configurator. Assumes Create() is implemented.
|
|
// Supports intermediate Config mutation for now if you provide modifier functions which will run after Config is created.
|
|
func NewFromConfigurator(c Configurator, modifiers ...func(c *Config)) (*Scheduler, error) {
|
|
cfg, err := c.Create()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// Mutate it if any functions were provided, changes might be required for certain types of tests (i.e. change the recorder).
|
|
for _, modifier := range modifiers {
|
|
modifier(cfg)
|
|
}
|
|
// From this point on the config is immutable to the outside.
|
|
s := &Scheduler{
|
|
config: cfg,
|
|
}
|
|
metrics.Register()
|
|
return s, nil
|
|
}
|
|
|
|
// NewFromConfig returns a new scheduler using the provided Config.
|
|
func NewFromConfig(config *Config) *Scheduler {
|
|
metrics.Register()
|
|
return &Scheduler{
|
|
config: config,
|
|
}
|
|
}
|
|
|
|
// Run begins watching and scheduling. It waits for cache to be synced, then starts a goroutine and returns immediately.
|
|
func (sched *Scheduler) Run() {
|
|
if !sched.config.WaitForCacheSync() {
|
|
return
|
|
}
|
|
|
|
if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
|
|
go sched.config.VolumeBinder.Run(sched.bindVolumesWorker, sched.config.StopEverything)
|
|
}
|
|
|
|
go wait.Until(sched.scheduleOne, 0, sched.config.StopEverything)
|
|
}
|
|
|
|
// Config return scheduler's config pointer. It is exposed for testing purposes.
|
|
func (sched *Scheduler) Config() *Config {
|
|
return sched.config
|
|
}
|
|
|
|
// schedule implements the scheduling algorithm and returns the suggested host.
|
|
func (sched *Scheduler) schedule(pod *v1.Pod) (string, error) {
|
|
host, err := sched.config.Algorithm.Schedule(pod, sched.config.NodeLister)
|
|
if err != nil {
|
|
glog.V(1).Infof("Failed to schedule pod: %v/%v", pod.Namespace, pod.Name)
|
|
pod = pod.DeepCopy()
|
|
sched.config.Error(pod, err)
|
|
sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "%v", err)
|
|
sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
|
|
Type: v1.PodScheduled,
|
|
Status: v1.ConditionFalse,
|
|
Reason: v1.PodReasonUnschedulable,
|
|
Message: err.Error(),
|
|
})
|
|
return "", err
|
|
}
|
|
return host, err
|
|
}
|
|
|
|
// preempt tries to create room for a pod that has failed to schedule, by preempting lower priority pods if possible.
|
|
// If it succeeds, it adds the name of the node where preemption has happened to the pod annotations.
|
|
// It returns the node name and an error if any.
|
|
func (sched *Scheduler) preempt(preemptor *v1.Pod, scheduleErr error) (string, error) {
|
|
if !util.PodPriorityEnabled() {
|
|
glog.V(3).Infof("Pod priority feature is not enabled. No preemption is performed.")
|
|
return "", nil
|
|
}
|
|
preemptor, err := sched.config.PodPreemptor.GetUpdatedPod(preemptor)
|
|
if err != nil {
|
|
glog.Errorf("Error getting the updated preemptor pod object: %v", err)
|
|
return "", err
|
|
}
|
|
|
|
node, victims, nominatedPodsToClear, err := sched.config.Algorithm.Preempt(preemptor, sched.config.NodeLister, scheduleErr)
|
|
metrics.PreemptionVictims.Set(float64(len(victims)))
|
|
if err != nil {
|
|
glog.Errorf("Error preempting victims to make room for %v/%v.", preemptor.Namespace, preemptor.Name)
|
|
return "", err
|
|
}
|
|
var nodeName = ""
|
|
if node != nil {
|
|
nodeName = node.Name
|
|
err = sched.config.PodPreemptor.SetNominatedNodeName(preemptor, nodeName)
|
|
if err != nil {
|
|
glog.Errorf("Error in preemption process. Cannot update pod %v annotations: %v", preemptor.Name, err)
|
|
return "", err
|
|
}
|
|
for _, victim := range victims {
|
|
if err := sched.config.PodPreemptor.DeletePod(victim); err != nil {
|
|
glog.Errorf("Error preempting pod %v/%v: %v", victim.Namespace, victim.Name, err)
|
|
return "", err
|
|
}
|
|
sched.config.Recorder.Eventf(victim, v1.EventTypeNormal, "Preempted", "by %v/%v on node %v", preemptor.Namespace, preemptor.Name, nodeName)
|
|
}
|
|
}
|
|
// Clearing nominated pods should happen outside of "if node != nil". Node could
|
|
// be nil when a pod with nominated node name is eligible to preempt again,
|
|
// but preemption logic does not find any node for it. In that case Preempt()
|
|
// function of generic_scheduler.go returns the pod itself for removal of the annotation.
|
|
for _, p := range nominatedPodsToClear {
|
|
rErr := sched.config.PodPreemptor.RemoveNominatedNodeName(p)
|
|
if rErr != nil {
|
|
glog.Errorf("Cannot remove nominated node annotation of pod: %v", rErr)
|
|
// We do not return as this error is not critical.
|
|
}
|
|
}
|
|
return nodeName, err
|
|
}
|
|
|
|
// assumeAndBindVolumes will update the volume cache and then asynchronously bind volumes if required.
|
|
//
|
|
// If volume binding is required, then the bind volumes routine will update the pod to send it back through
|
|
// the scheduler.
|
|
//
|
|
// Otherwise, return nil error and continue to assume the pod.
|
|
//
|
|
// This function modifies assumed if volume binding is required.
|
|
func (sched *Scheduler) assumeAndBindVolumes(assumed *v1.Pod, host string) error {
|
|
if utilfeature.DefaultFeatureGate.Enabled(features.VolumeScheduling) {
|
|
allBound, bindingRequired, err := sched.config.VolumeBinder.Binder.AssumePodVolumes(assumed, host)
|
|
if err != nil {
|
|
sched.config.Error(assumed, err)
|
|
sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePodVolumes failed: %v", err)
|
|
sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
|
|
Type: v1.PodScheduled,
|
|
Status: v1.ConditionFalse,
|
|
Reason: "SchedulerError",
|
|
Message: err.Error(),
|
|
})
|
|
return err
|
|
}
|
|
if !allBound {
|
|
err = fmt.Errorf("Volume binding started, waiting for completion")
|
|
if bindingRequired {
|
|
if sched.config.Ecache != nil {
|
|
invalidPredicates := sets.NewString(predicates.CheckVolumeBindingPred)
|
|
sched.config.Ecache.InvalidateCachedPredicateItemOfAllNodes(invalidPredicates)
|
|
}
|
|
|
|
// bindVolumesWorker() will update the Pod object to put it back in the scheduler queue
|
|
sched.config.VolumeBinder.BindQueue.Add(assumed)
|
|
} else {
|
|
// We are just waiting for PV controller to finish binding, put it back in the
|
|
// scheduler queue
|
|
sched.config.Error(assumed, err)
|
|
sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "FailedScheduling", "%v", err)
|
|
sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
|
|
Type: v1.PodScheduled,
|
|
Status: v1.ConditionFalse,
|
|
Reason: "VolumeBindingWaiting",
|
|
})
|
|
}
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// bindVolumesWorker() processes pods queued in assumeAndBindVolumes() and tries to
|
|
// make the API update for volume binding.
|
|
// This function runs forever until the volume BindQueue is closed.
|
|
func (sched *Scheduler) bindVolumesWorker() {
|
|
workFunc := func() bool {
|
|
keyObj, quit := sched.config.VolumeBinder.BindQueue.Get()
|
|
if quit {
|
|
return true
|
|
}
|
|
defer sched.config.VolumeBinder.BindQueue.Done(keyObj)
|
|
|
|
assumed, ok := keyObj.(*v1.Pod)
|
|
if !ok {
|
|
glog.V(4).Infof("Object is not a *v1.Pod")
|
|
return false
|
|
}
|
|
|
|
// TODO: add metrics
|
|
var reason string
|
|
var eventType string
|
|
|
|
glog.V(5).Infof("Trying to bind volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name)
|
|
|
|
// The Pod is always sent back to the scheduler afterwards.
|
|
err := sched.config.VolumeBinder.Binder.BindPodVolumes(assumed)
|
|
if err != nil {
|
|
glog.V(1).Infof("Failed to bind volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name, err)
|
|
reason = "VolumeBindingFailed"
|
|
eventType = v1.EventTypeWarning
|
|
} else {
|
|
glog.V(4).Infof("Successfully bound volumes for pod \"%v/%v\"", assumed.Namespace, assumed.Name)
|
|
reason = "VolumeBindingWaiting"
|
|
eventType = v1.EventTypeNormal
|
|
err = fmt.Errorf("Volume binding started, waiting for completion")
|
|
}
|
|
|
|
// Always fail scheduling regardless of binding success.
|
|
// The Pod needs to be sent back through the scheduler to:
|
|
// * Retry volume binding if it fails.
|
|
// * Retry volume binding if dynamic provisioning fails.
|
|
// * Bind the Pod to the Node once all volumes are bound.
|
|
sched.config.Error(assumed, err)
|
|
sched.config.Recorder.Eventf(assumed, eventType, "FailedScheduling", "%v", err)
|
|
sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
|
|
Type: v1.PodScheduled,
|
|
Status: v1.ConditionFalse,
|
|
Reason: reason,
|
|
})
|
|
return false
|
|
}
|
|
|
|
for {
|
|
if quit := workFunc(); quit {
|
|
glog.V(4).Infof("bindVolumesWorker shutting down")
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
|
|
// assume modifies `assumed`.
|
|
func (sched *Scheduler) assume(assumed *v1.Pod, host string) error {
|
|
// Optimistically assume that the binding will succeed and send it to apiserver
|
|
// in the background.
|
|
// If the binding fails, scheduler will release resources allocated to assumed pod
|
|
// immediately.
|
|
assumed.Spec.NodeName = host
|
|
if err := sched.config.SchedulerCache.AssumePod(assumed); err != nil {
|
|
glog.Errorf("scheduler cache AssumePod failed: %v", err)
|
|
|
|
// This is most probably result of a BUG in retrying logic.
|
|
// We report an error here so that pod scheduling can be retried.
|
|
// This relies on the fact that Error will check if the pod has been bound
|
|
// to a node and if so will not add it back to the unscheduled pods queue
|
|
// (otherwise this would cause an infinite loop).
|
|
sched.config.Error(assumed, err)
|
|
sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "AssumePod failed: %v", err)
|
|
sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
|
|
Type: v1.PodScheduled,
|
|
Status: v1.ConditionFalse,
|
|
Reason: "SchedulerError",
|
|
Message: err.Error(),
|
|
})
|
|
return err
|
|
}
|
|
|
|
// Optimistically assume that the binding will succeed, so we need to invalidate affected
|
|
// predicates in equivalence cache.
|
|
// If the binding fails, these invalidated item will not break anything.
|
|
if sched.config.Ecache != nil {
|
|
sched.config.Ecache.InvalidateCachedPredicateItemForPodAdd(assumed, host)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// bind binds a pod to a given node defined in a binding object. We expect this to run asynchronously, so we
|
|
// handle binding metrics internally.
|
|
func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
|
|
bindingStart := time.Now()
|
|
// If binding succeeded then PodScheduled condition will be updated in apiserver so that
|
|
// it's atomic with setting host.
|
|
err := sched.config.GetBinder(assumed).Bind(b)
|
|
if err := sched.config.SchedulerCache.FinishBinding(assumed); err != nil {
|
|
glog.Errorf("scheduler cache FinishBinding failed: %v", err)
|
|
}
|
|
if err != nil {
|
|
glog.V(1).Infof("Failed to bind pod: %v/%v", assumed.Namespace, assumed.Name)
|
|
if err := sched.config.SchedulerCache.ForgetPod(assumed); err != nil {
|
|
glog.Errorf("scheduler cache ForgetPod failed: %v", err)
|
|
}
|
|
sched.config.Error(assumed, err)
|
|
sched.config.Recorder.Eventf(assumed, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", err)
|
|
sched.config.PodConditionUpdater.Update(assumed, &v1.PodCondition{
|
|
Type: v1.PodScheduled,
|
|
Status: v1.ConditionFalse,
|
|
Reason: "BindingRejected",
|
|
})
|
|
return err
|
|
}
|
|
|
|
metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
|
|
sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v to %v", assumed.Name, b.Target.Name)
|
|
return nil
|
|
}
|
|
|
|
// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
|
|
func (sched *Scheduler) scheduleOne() {
|
|
pod := sched.config.NextPod()
|
|
if pod.DeletionTimestamp != nil {
|
|
sched.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
|
|
glog.V(3).Infof("Skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
|
|
return
|
|
}
|
|
|
|
glog.V(3).Infof("Attempting to schedule pod: %v/%v", pod.Namespace, pod.Name)
|
|
|
|
// Synchronously attempt to find a fit for the pod.
|
|
start := time.Now()
|
|
suggestedHost, err := sched.schedule(pod)
|
|
if err != nil {
|
|
// schedule() may have failed because the pod would not fit on any host, so we try to
|
|
// preempt, with the expectation that the next time the pod is tried for scheduling it
|
|
// will fit due to the preemption. It is also possible that a different pod will schedule
|
|
// into the resources that were preempted, but this is harmless.
|
|
if fitError, ok := err.(*core.FitError); ok {
|
|
preemptionStartTime := time.Now()
|
|
sched.preempt(pod, fitError)
|
|
metrics.PreemptionAttempts.Inc()
|
|
metrics.SchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime))
|
|
}
|
|
return
|
|
}
|
|
metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start))
|
|
// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
|
|
// This allows us to keep scheduling without waiting on binding to occur.
|
|
assumedPod := pod.DeepCopy()
|
|
|
|
// Assume volumes first before assuming the pod.
|
|
//
|
|
// If no volumes need binding, then nil is returned, and continue to assume the pod.
|
|
//
|
|
// Otherwise, error is returned and volume binding is started asynchronously for all of the pod's volumes.
|
|
// scheduleOne() returns immediately on error, so that it doesn't continue to assume the pod.
|
|
//
|
|
// After the asynchronous volume binding updates are made, it will send the pod back through the scheduler for
|
|
// subsequent passes until all volumes are fully bound.
|
|
//
|
|
// This function modifies 'assumedPod' if volume binding is required.
|
|
err = sched.assumeAndBindVolumes(assumedPod, suggestedHost)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
// assume modifies `assumedPod` by setting NodeName=suggestedHost
|
|
err = sched.assume(assumedPod, suggestedHost)
|
|
if err != nil {
|
|
return
|
|
}
|
|
// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
|
|
go func() {
|
|
err := sched.bind(assumedPod, &v1.Binding{
|
|
ObjectMeta: metav1.ObjectMeta{Namespace: assumedPod.Namespace, Name: assumedPod.Name, UID: assumedPod.UID},
|
|
Target: v1.ObjectReference{
|
|
Kind: "Node",
|
|
Name: suggestedHost,
|
|
},
|
|
})
|
|
metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
|
|
if err != nil {
|
|
glog.Errorf("Internal error binding pod: (%v)", err)
|
|
}
|
|
}()
|
|
}
|