rebase: update kubernetes to v1.20.0

Updated the Kubernetes packages to the latest release (v1.20.0).

Signed-off-by: Madhu Rajanna <madhupr007@gmail.com>
Madhu Rajanna
2020-12-17 17:58:29 +05:30
committed by mergify[bot]
parent 4abe128bd8
commit 83559144b1
1624 changed files with 247222 additions and 160270 deletions

View File

@@ -1,16 +0,0 @@
{
"Rules": [
{
"SelectorRegexp": "k8s[.]io/kubernetes/pkg/client/unversioned$",
"ForbiddenPrefixes": [
"k8s.io/kubernetes/pkg/client/unversioned"
]
},
{
"SelectorRegexp": "k8s[.]io/kubernetes/pkg/client/unversioned/testclient$",
"ForbiddenPrefixes": [
"k8s.io/kubernetes/pkg/client/unversioned/testclient"
]
}
]
}
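
The file deleted above is an import-restrictions manifest: imports that match a rule's SelectorRegexp must not begin with any of its ForbiddenPrefixes. A self-contained Go sketch of that rule shape follows; it is illustrative only (the Rule type and forbidden helper are hypothetical, not the real verify-imports tooling, whose exact semantics may differ).

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// Rule mirrors one entry of the deleted .import-restrictions file.
type Rule struct {
	SelectorRegexp    string
	ForbiddenPrefixes []string
}

// forbidden returns the imports that the rule rejects.
func forbidden(rule Rule, imports []string) ([]string, error) {
	sel, err := regexp.Compile(rule.SelectorRegexp)
	if err != nil {
		return nil, err
	}
	var bad []string
	for _, imp := range imports {
		if !sel.MatchString(imp) {
			continue
		}
		for _, prefix := range rule.ForbiddenPrefixes {
			if strings.HasPrefix(imp, prefix) {
				bad = append(bad, imp)
				break
			}
		}
	}
	return bad, nil
}

func main() {
	rule := Rule{
		SelectorRegexp:    "k8s[.]io/kubernetes/pkg/client/unversioned$",
		ForbiddenPrefixes: []string{"k8s.io/kubernetes/pkg/client/unversioned"},
	}
	bad, _ := forbidden(rule, []string{"k8s.io/kubernetes/pkg/client/unversioned"})
	fmt.Println(bad) // [k8s.io/kubernetes/pkg/client/unversioned]
}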

View File

@@ -1,9 +1,53 @@
package(default_visibility = ["//visibility:public"])
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
"go_test",
)
go_library(
name = "go_default_library",
srcs = [
"client_builder_dynamic.go",
"controller_ref_manager.go",
"controller_utils.go",
"doc.go",
"lookup_cache.go",
],
importpath = "k8s.io/kubernetes/pkg/controller",
visibility = ["//visibility:public"],
deps = [
"//pkg/api/v1/pod:go_default_library",
"//pkg/apis/core/install:go_default_library",
"//pkg/apis/core/validation:go_default_library",
"//pkg/util/hash:go_default_library",
"//pkg/util/taints:go_default_library",
"//staging/src/k8s.io/api/apps/v1:go_default_library",
"//staging/src/k8s.io/api/authentication/v1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/meta:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/runtime/schema:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/rand:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/strategicpatch:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/authentication/serviceaccount:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/rest:go_default_library",
"//staging/src/k8s.io/client-go/tools/cache:go_default_library",
"//staging/src/k8s.io/client-go/tools/record:go_default_library",
"//staging/src/k8s.io/client-go/transport:go_default_library",
"//staging/src/k8s.io/client-go/util/retry:go_default_library",
"//staging/src/k8s.io/controller-manager/pkg/clientbuilder:go_default_library",
"//vendor/github.com/golang/groupcache/lru:go_default_library",
"//vendor/golang.org/x/oauth2:go_default_library",
"//vendor/k8s.io/klog/v2:go_default_library",
"//vendor/k8s.io/utils/integer:go_default_library",
"//vendor/k8s.io/utils/pointer:go_default_library",
],
)
go_test(
@@ -20,6 +64,7 @@ go_test(
"//staging/src/k8s.io/api/apps/v1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/equality:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",
@@ -39,65 +84,6 @@ go_test(
],
)
go_library(
name = "go_default_library",
srcs = [
"client_builder.go",
"client_builder_dynamic.go",
"controller_ref_manager.go",
"controller_utils.go",
"doc.go",
"informer_factory.go",
"lookup_cache.go",
],
importpath = "k8s.io/kubernetes/pkg/controller",
deps = [
"//pkg/api/legacyscheme:go_default_library",
"//pkg/api/v1/pod:go_default_library",
"//pkg/apis/core:go_default_library",
"//pkg/apis/core/install:go_default_library",
"//pkg/apis/core/validation:go_default_library",
"//pkg/serviceaccount:go_default_library",
"//pkg/util/hash:go_default_library",
"//pkg/util/taints:go_default_library",
"//staging/src/k8s.io/api/apps/v1:go_default_library",
"//staging/src/k8s.io/api/authentication/v1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/meta:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/runtime/schema:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/rand:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/strategicpatch:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/watch:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/authentication/serviceaccount:go_default_library",
"//staging/src/k8s.io/client-go/informers:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/typed/authentication/v1:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/metadata/metadatainformer:go_default_library",
"//staging/src/k8s.io/client-go/rest:go_default_library",
"//staging/src/k8s.io/client-go/tools/cache:go_default_library",
"//staging/src/k8s.io/client-go/tools/record:go_default_library",
"//staging/src/k8s.io/client-go/tools/watch:go_default_library",
"//staging/src/k8s.io/client-go/transport:go_default_library",
"//staging/src/k8s.io/client-go/util/retry:go_default_library",
"//vendor/github.com/golang/groupcache/lru:go_default_library",
"//vendor/golang.org/x/oauth2:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
"//vendor/k8s.io/utils/integer:go_default_library",
"//vendor/k8s.io/utils/pointer:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
@@ -112,7 +98,6 @@ filegroup(
"//pkg/controller/apis/config:all-srcs",
"//pkg/controller/bootstrap:all-srcs",
"//pkg/controller/certificates:all-srcs",
"//pkg/controller/cloud:all-srcs",
"//pkg/controller/clusterroleaggregation:all-srcs",
"//pkg/controller/cronjob:all-srcs",
"//pkg/controller/daemon:all-srcs",
@@ -120,6 +105,7 @@ filegroup(
"//pkg/controller/disruption:all-srcs",
"//pkg/controller/endpoint:all-srcs",
"//pkg/controller/endpointslice:all-srcs",
"//pkg/controller/endpointslicemirroring:all-srcs",
"//pkg/controller/garbagecollector:all-srcs",
"//pkg/controller/history:all-srcs",
"//pkg/controller/job:all-srcs",
@@ -131,16 +117,17 @@ filegroup(
"//pkg/controller/replicaset:all-srcs",
"//pkg/controller/replication:all-srcs",
"//pkg/controller/resourcequota:all-srcs",
"//pkg/controller/route:all-srcs",
"//pkg/controller/service:all-srcs",
"//pkg/controller/serviceaccount:all-srcs",
"//pkg/controller/statefulset:all-srcs",
"//pkg/controller/storageversiongc:all-srcs",
"//pkg/controller/testutil:all-srcs",
"//pkg/controller/ttl:all-srcs",
"//pkg/controller/ttlafterfinished:all-srcs",
"//pkg/controller/util/endpoint:all-srcs",
"//pkg/controller/util/node:all-srcs",
"//pkg/controller/volume/attachdetach:all-srcs",
"//pkg/controller/volume/common:all-srcs",
"//pkg/controller/volume/ephemeral:all-srcs",
"//pkg/controller/volume/events:all-srcs",
"//pkg/controller/volume/expand:all-srcs",
"//pkg/controller/volume/persistentvolume:all-srcs",
@@ -150,4 +137,5 @@ filegroup(
"//pkg/controller/volume/scheduling:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@@ -5,8 +5,11 @@ approvers:
- derekwaynecarr
- mikedanese
- janetkuo
- cheftako
- sig-apps-approvers
reviewers:
- deads2k
- cheftako
- sig-apps-reviewers
labels:
- sig/apps

View File

@@ -1,241 +0,0 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"context"
"fmt"
"time"
v1authenticationapi "k8s.io/api/authentication/v1"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
apiserverserviceaccount "k8s.io/apiserver/pkg/authentication/serviceaccount"
clientset "k8s.io/client-go/kubernetes"
v1authentication "k8s.io/client-go/kubernetes/typed/authentication/v1"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
watchtools "k8s.io/client-go/tools/watch"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/api/legacyscheme"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/serviceaccount"
)
// ControllerClientBuilder allows you to get clients and configs for controllers
// Please note a copy also exists in staging/src/k8s.io/cloud-provider/cloud.go
// TODO: Extract this into a separate controller utilities repo (issues/68947)
type ControllerClientBuilder interface {
Config(name string) (*restclient.Config, error)
ConfigOrDie(name string) *restclient.Config
Client(name string) (clientset.Interface, error)
ClientOrDie(name string) clientset.Interface
}
// SimpleControllerClientBuilder returns a fixed client with different user agents
type SimpleControllerClientBuilder struct {
// ClientConfig is a skeleton config to clone and use as the basis for each controller client
ClientConfig *restclient.Config
}
func (b SimpleControllerClientBuilder) Config(name string) (*restclient.Config, error) {
clientConfig := *b.ClientConfig
return restclient.AddUserAgent(&clientConfig, name), nil
}
func (b SimpleControllerClientBuilder) ConfigOrDie(name string) *restclient.Config {
clientConfig, err := b.Config(name)
if err != nil {
klog.Fatal(err)
}
return clientConfig
}
func (b SimpleControllerClientBuilder) Client(name string) (clientset.Interface, error) {
clientConfig, err := b.Config(name)
if err != nil {
return nil, err
}
return clientset.NewForConfig(clientConfig)
}
func (b SimpleControllerClientBuilder) ClientOrDie(name string) clientset.Interface {
client, err := b.Client(name)
if err != nil {
klog.Fatal(err)
}
return client
}
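A short usage sketch for SimpleControllerClientBuilder as defined above; it is assumed, not taken from this repository, and the kubeconfig path and controller name are placeholders. Note that after this rebase the equivalent builders live in k8s.io/controller-manager/pkg/clientbuilder, which the client_builder_dynamic.go hunk further down starts importing.

package main

import (
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/kubernetes/pkg/controller"
)

func main() {
	// Base rest.Config shared by all controllers; the path is a placeholder.
	cfg, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}
	builder := controller.SimpleControllerClientBuilder{ClientConfig: cfg}
	// Config() clones the base config and stamps a per-controller user agent;
	// ClientOrDie wraps the result in a typed clientset.
	client := builder.ClientOrDie("replicaset-controller")
	_ = client
}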
// SAControllerClientBuilder is a ControllerClientBuilder that returns clients identifying as
// service accounts
type SAControllerClientBuilder struct {
// ClientConfig is a skeleton config to clone and use as the basis for each controller client
ClientConfig *restclient.Config
// CoreClient is used to provision service accounts if needed and watch for their associated tokens
// to construct a controller client
CoreClient v1core.CoreV1Interface
// AuthenticationClient is used to check API tokens to make sure they are valid before
// building a controller client from them
AuthenticationClient v1authentication.AuthenticationV1Interface
// Namespace is the namespace used to host the service accounts that will back the
// controllers. It must be a highly privileged namespace which normal users cannot inspect.
Namespace string
}
// config returns a complete clientConfig for constructing clients. This is separate in anticipation of composition
// which means that not all clientsets are known here
func (b SAControllerClientBuilder) Config(name string) (*restclient.Config, error) {
sa, err := getOrCreateServiceAccount(b.CoreClient, b.Namespace, name)
if err != nil {
return nil, err
}
var clientConfig *restclient.Config
fieldSelector := fields.SelectorFromSet(map[string]string{
api.SecretTypeField: string(v1.SecretTypeServiceAccountToken),
}).String()
lw := &cache.ListWatch{
ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
options.FieldSelector = fieldSelector
return b.CoreClient.Secrets(b.Namespace).List(context.TODO(), options)
},
WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
options.FieldSelector = fieldSelector
return b.CoreClient.Secrets(b.Namespace).Watch(context.TODO(), options)
},
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
_, err = watchtools.UntilWithSync(ctx, lw, &v1.Secret{}, nil,
func(event watch.Event) (bool, error) {
switch event.Type {
case watch.Deleted:
return false, nil
case watch.Error:
return false, fmt.Errorf("error watching")
case watch.Added, watch.Modified:
secret, ok := event.Object.(*v1.Secret)
if !ok {
return false, fmt.Errorf("unexpected object type: %T", event.Object)
}
if !serviceaccount.IsServiceAccountToken(secret, sa) {
return false, nil
}
if len(secret.Data[v1.ServiceAccountTokenKey]) == 0 {
return false, nil
}
validConfig, valid, err := b.getAuthenticatedConfig(sa, string(secret.Data[v1.ServiceAccountTokenKey]))
if err != nil {
klog.Warningf("error validating API token for %s/%s in secret %s: %v", sa.Namespace, sa.Name, secret.Name, err)
// continue watching for good tokens
return false, nil
}
if !valid {
klog.Warningf("secret %s contained an invalid API token for %s/%s", secret.Name, sa.Namespace, sa.Name)
// try to delete the secret containing the invalid token
if err := b.CoreClient.Secrets(secret.Namespace).Delete(context.TODO(), secret.Name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
klog.Warningf("error deleting secret %s containing invalid API token for %s/%s: %v", secret.Name, sa.Namespace, sa.Name, err)
}
// continue watching for good tokens
return false, nil
}
clientConfig = validConfig
return true, nil
default:
return false, fmt.Errorf("unexpected event type: %v", event.Type)
}
})
if err != nil {
return nil, fmt.Errorf("unable to get token for service account: %v", err)
}
return clientConfig, nil
}
func (b SAControllerClientBuilder) getAuthenticatedConfig(sa *v1.ServiceAccount, token string) (*restclient.Config, bool, error) {
username := apiserverserviceaccount.MakeUsername(sa.Namespace, sa.Name)
clientConfig := restclient.AnonymousClientConfig(b.ClientConfig)
clientConfig.BearerToken = token
restclient.AddUserAgent(clientConfig, username)
// Try token review first
tokenReview := &v1authenticationapi.TokenReview{Spec: v1authenticationapi.TokenReviewSpec{Token: token}}
if tokenResult, err := b.AuthenticationClient.TokenReviews().Create(context.TODO(), tokenReview, metav1.CreateOptions{}); err == nil {
if !tokenResult.Status.Authenticated {
klog.Warningf("Token for %s/%s did not authenticate correctly", sa.Namespace, sa.Name)
return nil, false, nil
}
if tokenResult.Status.User.Username != username {
klog.Warningf("Token for %s/%s authenticated as unexpected username: %s", sa.Namespace, sa.Name, tokenResult.Status.User.Username)
return nil, false, nil
}
klog.V(4).Infof("Verified credential for %s/%s", sa.Namespace, sa.Name)
return clientConfig, true, nil
}
// If we couldn't run the token review, the API might be disabled or we might not have permission.
// Try to make a request to /apis with the token. If we get a 401 we should consider the token invalid.
clientConfigCopy := *clientConfig
clientConfigCopy.NegotiatedSerializer = legacyscheme.Codecs
client, err := restclient.UnversionedRESTClientFor(&clientConfigCopy)
if err != nil {
return nil, false, err
}
err = client.Get().AbsPath("/apis").Do(context.TODO()).Error()
if apierrors.IsUnauthorized(err) {
klog.Warningf("Token for %s/%s did not authenticate correctly: %v", sa.Namespace, sa.Name, err)
return nil, false, nil
}
return clientConfig, true, nil
}
func (b SAControllerClientBuilder) ConfigOrDie(name string) *restclient.Config {
clientConfig, err := b.Config(name)
if err != nil {
klog.Fatal(err)
}
return clientConfig
}
func (b SAControllerClientBuilder) Client(name string) (clientset.Interface, error) {
clientConfig, err := b.Config(name)
if err != nil {
return nil, err
}
return clientset.NewForConfig(clientConfig)
}
func (b SAControllerClientBuilder) ClientOrDie(name string) clientset.Interface {
client, err := b.Client(name)
if err != nil {
klog.Fatal(err)
}
return client
}
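For comparison, a construction sketch for the SAControllerClientBuilder removed above (again hypothetical; the kubeconfig path, namespace, and controller name are placeholders): it needs a core client to provision service accounts, an authentication client to validate their tokens, and a privileged namespace to host the accounts.

package main

import (
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/kubernetes/pkg/controller"
)

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}
	rootClient := kubernetes.NewForConfigOrDie(cfg)

	builder := controller.SAControllerClientBuilder{
		ClientConfig:         cfg,
		CoreClient:           rootClient.CoreV1(),
		AuthenticationClient: rootClient.AuthenticationV1(),
		Namespace:            "kube-system", // must be a namespace normal users cannot inspect
	}
	// The returned client authenticates with a dedicated service account token.
	saClient := builder.ClientOrDie("deployment-controller")
	_ = saClient
}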

View File

@@ -24,7 +24,6 @@ import (
"time"
"golang.org/x/oauth2"
v1authenticationapi "k8s.io/api/authentication/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/clock"
@@ -34,7 +33,8 @@ import (
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/transport"
"k8s.io/klog"
"k8s.io/controller-manager/pkg/clientbuilder"
"k8s.io/klog/v2"
utilpointer "k8s.io/utils/pointer"
)
@@ -73,7 +73,7 @@ type DynamicControllerClientBuilder struct {
clock clock.Clock
}
func NewDynamicClientBuilder(clientConfig *restclient.Config, coreClient v1core.CoreV1Interface, ns string) ControllerClientBuilder {
func NewDynamicClientBuilder(clientConfig *restclient.Config, coreClient v1core.CoreV1Interface, ns string) clientbuilder.ControllerClientBuilder {
builder := &DynamicControllerClientBuilder{
ClientConfig: clientConfig,
CoreClient: coreClient,
@@ -87,7 +87,7 @@ func NewDynamicClientBuilder(clientConfig *restclient.Config, coreClient v1core.
}
// this function is only for test purposes, don't call it
func NewTestDynamicClientBuilder(clientConfig *restclient.Config, coreClient v1core.CoreV1Interface, ns string, expirationSeconds int64, leewayPercent int) ControllerClientBuilder {
func NewTestDynamicClientBuilder(clientConfig *restclient.Config, coreClient v1core.CoreV1Interface, ns string, expirationSeconds int64, leewayPercent int) clientbuilder.ControllerClientBuilder {
builder := &DynamicControllerClientBuilder{
ClientConfig: clientConfig,
CoreClient: coreClient,

View File

@@ -29,7 +29,7 @@ import (
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog"
"k8s.io/klog/v2"
)
type BaseControllerRefManager struct {

View File

@@ -51,7 +51,7 @@ import (
taintutils "k8s.io/kubernetes/pkg/util/taints"
"k8s.io/utils/integer"
"k8s.io/klog"
"k8s.io/klog/v2"
)
const (
@@ -601,8 +601,12 @@ func (r RealPodControl) DeletePod(namespace string, podID string, object runtime
if err != nil {
return fmt.Errorf("object does not have ObjectMeta, %v", err)
}
klog.V(2).Infof("Controller %v deleting pod %v/%v", accessor.GetName(), namespace, podID)
if err := r.KubeClient.CoreV1().Pods(namespace).Delete(context.TODO(), podID, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
klog.V(2).InfoS("Deleting pod", "controller", accessor.GetName(), "pod", klog.KRef(namespace, podID))
if err := r.KubeClient.CoreV1().Pods(namespace).Delete(context.TODO(), podID, metav1.DeleteOptions{}); err != nil {
if apierrors.IsNotFound(err) {
klog.V(4).Infof("pod %v/%v has already been deleted.", namespace, podID)
return err
}
r.Recorder.Eventf(object, v1.EventTypeWarning, FailedDeletePodReason, "Error deleting: %v", err)
return fmt.Errorf("unable to delete pods: %v", err)
}
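The hunk above replaces a printf-style klog call with the structured klog/v2 API that this rebase vendors. A minimal standalone sketch of those calls (message text and object names are illustrative):

package main

import (
	"errors"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	defer klog.Flush()

	// Structured key/value pairs replace the format string; KRef renders "namespace/name".
	klog.V(2).InfoS("Deleting pod", "controller", "rs-controller", "pod", klog.KRef("default", "web-0"))

	// Errors are passed explicitly instead of being formatted into the message.
	klog.ErrorS(errors.New("pod not found"), "Failed to delete pod", "pod", klog.KRef("default", "web-0"))
}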

View File

@@ -25,7 +25,7 @@ go_library(
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/typed/apps/v1:go_default_library",
"//staging/src/k8s.io/client-go/listers/apps/v1:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
"//vendor/k8s.io/klog/v2:go_default_library",
"//vendor/k8s.io/utils/integer:go_default_library",
],
)

View File

@@ -37,7 +37,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
appsclient "k8s.io/client-go/kubernetes/typed/apps/v1"
appslisters "k8s.io/client-go/listers/apps/v1"
"k8s.io/klog"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/controller"
labelsutil "k8s.io/kubernetes/pkg/util/labels"
"k8s.io/utils/integer"
@@ -817,7 +817,7 @@ func NewRSNewReplicas(deployment *apps.Deployment, allRSs []*apps.ReplicaSet, ne
switch deployment.Spec.Strategy.Type {
case apps.RollingUpdateDeploymentStrategyType:
// Check if we can scale up.
maxSurge, err := intstrutil.GetValueFromIntOrPercent(deployment.Spec.Strategy.RollingUpdate.MaxSurge, int(*(deployment.Spec.Replicas)), true)
maxSurge, err := intstrutil.GetScaledValueFromIntOrPercent(deployment.Spec.Strategy.RollingUpdate.MaxSurge, int(*(deployment.Spec.Replicas)), true)
if err != nil {
return 0, err
}
@@ -881,11 +881,11 @@ func WaitForObservedDeployment(getDeploymentFunc func() (*apps.Deployment, error
// 2 desired, max unavailable 0%, surge 1% - should scale new(+1), then old(-1), then new(+1), then old(-1)
// 1 desired, max unavailable 0%, surge 1% - should scale new(+1), then old(-1)
func ResolveFenceposts(maxSurge, maxUnavailable *intstrutil.IntOrString, desired int32) (int32, int32, error) {
surge, err := intstrutil.GetValueFromIntOrPercent(intstrutil.ValueOrDefault(maxSurge, intstrutil.FromInt(0)), int(desired), true)
surge, err := intstrutil.GetScaledValueFromIntOrPercent(intstrutil.ValueOrDefault(maxSurge, intstrutil.FromInt(0)), int(desired), true)
if err != nil {
return 0, 0, err
}
unavailable, err := intstrutil.GetValueFromIntOrPercent(intstrutil.ValueOrDefault(maxUnavailable, intstrutil.FromInt(0)), int(desired), false)
unavailable, err := intstrutil.GetScaledValueFromIntOrPercent(intstrutil.ValueOrDefault(maxUnavailable, intstrutil.FromInt(0)), int(desired), false)
if err != nil {
return 0, 0, err
}
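GetScaledValueFromIntOrPercent is the renamed intstr helper these hunks switch to; as before, surge is rounded up and unavailability is rounded down. A small worked sketch with illustrative values:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
	maxSurge := intstr.FromString("25%")
	maxUnavailable := intstr.FromInt(0)
	desired := 10

	// Round up: 25% of 10 replicas -> up to 3 extra pods may be created.
	surge, err := intstr.GetScaledValueFromIntOrPercent(&maxSurge, desired, true)
	if err != nil {
		panic(err)
	}
	// Round down: at most 0 pods may be unavailable.
	unavailable, err := intstr.GetScaledValueFromIntOrPercent(&maxUnavailable, desired, false)
	if err != nil {
		panic(err)
	}
	fmt.Println(surge, unavailable) // 3 0
}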

View File

@@ -1,56 +0,0 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/informers"
"k8s.io/client-go/metadata/metadatainformer"
)
// InformerFactory creates informers for each group version resource.
type InformerFactory interface {
ForResource(resource schema.GroupVersionResource) (informers.GenericInformer, error)
Start(stopCh <-chan struct{})
}
type informerFactory struct {
typedInformerFactory informers.SharedInformerFactory
metadataInformerFactory metadatainformer.SharedInformerFactory
}
func (i *informerFactory) ForResource(resource schema.GroupVersionResource) (informers.GenericInformer, error) {
informer, err := i.typedInformerFactory.ForResource(resource)
if err != nil {
return i.metadataInformerFactory.ForResource(resource), nil
}
return informer, nil
}
func (i *informerFactory) Start(stopCh <-chan struct{}) {
i.typedInformerFactory.Start(stopCh)
i.metadataInformerFactory.Start(stopCh)
}
// NewInformerFactory creates a new InformerFactory which works with both typed
// resources and metadata-only resources
func NewInformerFactory(typedInformerFactory informers.SharedInformerFactory, metadataInformerFactory metadatainformer.SharedInformerFactory) InformerFactory {
return &informerFactory{
typedInformerFactory: typedInformerFactory,
metadataInformerFactory: metadataInformerFactory,
}
}
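A usage sketch for the InformerFactory defined in the file removed above; the controller.NewInformerFactory path is the pre-rebase location shown here, and the resync period and resource are placeholders. ForResource serves typed informers when the typed factory knows the resource and falls back to metadata-only informers otherwise.

package example

import (
	"time"

	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/metadata"
	"k8s.io/client-go/metadata/metadatainformer"
	"k8s.io/client-go/rest"
	"k8s.io/kubernetes/pkg/controller"
)

func examplePodInformer(cfg *rest.Config, stopCh <-chan struct{}) error {
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return err
	}
	metaClient, err := metadata.NewForConfig(cfg)
	if err != nil {
		return err
	}
	typed := informers.NewSharedInformerFactory(client, 30*time.Second)
	meta := metadatainformer.NewSharedInformerFactory(metaClient, 30*time.Second)

	factory := controller.NewInformerFactory(typed, meta)
	gi, err := factory.ForResource(schema.GroupVersionResource{Version: "v1", Resource: "pods"})
	if err != nil {
		return err
	}
	_ = gi.Informer() // register event handlers here before starting
	factory.Start(stopCh) // starts both underlying factories
	return nil
}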

View File

@@ -1,100 +0,0 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = [
"metrics.go",
"node_lifecycle_controller.go",
],
importpath = "k8s.io/kubernetes/pkg/controller/nodelifecycle",
visibility = ["//visibility:public"],
deps = [
"//pkg/controller:go_default_library",
"//pkg/controller/nodelifecycle/scheduler:go_default_library",
"//pkg/controller/util/node:go_default_library",
"//pkg/features:go_default_library",
"//pkg/kubelet/apis:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/taints:go_default_library",
"//staging/src/k8s.io/api/coordination/v1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/equality:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/runtime:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//staging/src/k8s.io/client-go/informers/apps/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/coordination/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/scheme:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/listers/apps/v1:go_default_library",
"//staging/src/k8s.io/client-go/listers/coordination/v1:go_default_library",
"//staging/src/k8s.io/client-go/listers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/tools/cache:go_default_library",
"//staging/src/k8s.io/client-go/tools/record:go_default_library",
"//staging/src/k8s.io/client-go/util/flowcontrol:go_default_library",
"//staging/src/k8s.io/client-go/util/workqueue:go_default_library",
"//staging/src/k8s.io/component-base/metrics:go_default_library",
"//staging/src/k8s.io/component-base/metrics/legacyregistry:go_default_library",
"//staging/src/k8s.io/component-base/metrics/prometheus/ratelimiter:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
)
go_test(
name = "go_default_test",
srcs = ["node_lifecycle_controller_test.go"],
embed = [":go_default_library"],
deps = [
"//pkg/controller:go_default_library",
"//pkg/controller/nodelifecycle/scheduler:go_default_library",
"//pkg/controller/testutil:go_default_library",
"//pkg/controller/util/node:go_default_library",
"//pkg/features:go_default_library",
"//pkg/kubelet/apis:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/taints:go_default_library",
"//staging/src/k8s.io/api/apps/v1:go_default_library",
"//staging/src/k8s.io/api/coordination/v1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/equality:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//staging/src/k8s.io/client-go/informers:go_default_library",
"//staging/src/k8s.io/client-go/informers/apps/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/coordination/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library",
"//staging/src/k8s.io/client-go/testing:go_default_library",
"//staging/src/k8s.io/component-base/featuregate/testing:go_default_library",
"//vendor/k8s.io/utils/pointer:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/controller/nodelifecycle/config:all-srcs",
"//pkg/controller/nodelifecycle/scheduler:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@@ -1,12 +0,0 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- gmarek
- bowei
- k82cn
reviewers:
- gmarek
- smarterclayton
- ingvagabund
- aveshagarwal
- k82cn

View File

@@ -1,83 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodelifecycle
import (
"sync"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
const (
nodeControllerSubsystem = "node_collector"
zoneHealthStatisticKey = "zone_health"
zoneSizeKey = "zone_size"
zoneNoUnhealthyNodesKey = "unhealthy_nodes_in_zone"
evictionsNumberKey = "evictions_number"
)
var (
zoneHealth = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: nodeControllerSubsystem,
Name: zoneHealthStatisticKey,
Help: "Gauge measuring percentage of healthy nodes per zone.",
StabilityLevel: metrics.ALPHA,
},
[]string{"zone"},
)
zoneSize = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: nodeControllerSubsystem,
Name: zoneSizeKey,
Help: "Gauge measuring number of registered Nodes per zones.",
StabilityLevel: metrics.ALPHA,
},
[]string{"zone"},
)
unhealthyNodes = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: nodeControllerSubsystem,
Name: zoneNoUnhealthyNodesKey,
Help: "Gauge measuring number of not Ready Nodes per zones.",
StabilityLevel: metrics.ALPHA,
},
[]string{"zone"},
)
evictionsNumber = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: nodeControllerSubsystem,
Name: evictionsNumberKey,
Help: "Number of Node evictions that happened since current instance of NodeController started.",
StabilityLevel: metrics.ALPHA,
},
[]string{"zone"},
)
)
var registerMetrics sync.Once
// Register the metrics that are to be monitored.
func Register() {
registerMetrics.Do(func() {
legacyregistry.MustRegister(zoneHealth)
legacyregistry.MustRegister(zoneSize)
legacyregistry.MustRegister(unhealthyNodes)
legacyregistry.MustRegister(evictionsNumber)
})
}
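A same-package sketch (the recordZoneStats helper is hypothetical) of how these node_collector metrics are meant to be used: Register is guarded by sync.Once, so it is cheap to call before every update, and each vector is keyed by zone.

package nodelifecycle

func recordZoneStats(zone string, healthyPercent, registered, notReady float64, evicted bool) {
	Register() // idempotent thanks to the sync.Once above

	zoneHealth.WithLabelValues(zone).Set(healthyPercent)
	zoneSize.WithLabelValues(zone).Set(registered)
	unhealthyNodes.WithLabelValues(zone).Set(notReady)
	if evicted {
		evictionsNumber.WithLabelValues(zone).Inc()
	}
}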

File diff suppressed because it is too large

View File

@@ -1,64 +0,0 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = [
"rate_limited_queue.go",
"taint_manager.go",
"timed_workers.go",
],
importpath = "k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler",
visibility = ["//visibility:public"],
deps = [
"//pkg/apis/core/helper:go_default_library",
"//pkg/apis/core/v1/helper:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/runtime:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/scheme:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/tools/record:go_default_library",
"//staging/src/k8s.io/client-go/util/flowcontrol:go_default_library",
"//staging/src/k8s.io/client-go/util/workqueue:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
)
go_test(
name = "go_default_test",
srcs = [
"rate_limited_queue_test.go",
"taint_manager_test.go",
"timed_workers_test.go",
],
embed = [":go_default_library"],
deps = [
"//pkg/controller/testutil:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library",
"//staging/src/k8s.io/client-go/testing:go_default_library",
"//staging/src/k8s.io/client-go/util/flowcontrol:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@@ -1,309 +0,0 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"container/heap"
"sync"
"time"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/util/flowcontrol"
"k8s.io/klog"
)
const (
// NodeHealthUpdateRetry controls the number of retries of writing
// node health update.
NodeHealthUpdateRetry = 5
// NodeEvictionPeriod controls how often NodeController will try to
// evict Pods from non-responsive Nodes.
NodeEvictionPeriod = 100 * time.Millisecond
// EvictionRateLimiterBurst is the burst value for all eviction rate
// limiters
EvictionRateLimiterBurst = 1
)
// TimedValue is a value that should be processed at a designated time.
type TimedValue struct {
Value string
// UID could be anything that helps identify the value
UID interface{}
AddedAt time.Time
ProcessAt time.Time
}
// now is used to test time
var now = time.Now
// TimedQueue is a priority heap where the lowest ProcessAt is at the front of the queue
type TimedQueue []*TimedValue
// Len is the length of the queue.
func (h TimedQueue) Len() int { return len(h) }
// Less returns true if queue[i] < queue[j].
func (h TimedQueue) Less(i, j int) bool { return h[i].ProcessAt.Before(h[j].ProcessAt) }
// Swap swaps index i and j.
func (h TimedQueue) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
// Push a new TimedValue on to the queue.
func (h *TimedQueue) Push(x interface{}) {
*h = append(*h, x.(*TimedValue))
}
// Pop the lowest ProcessAt item.
func (h *TimedQueue) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
// UniqueQueue is a FIFO queue which additionally guarantees that any
// element can be added only once until it is removed.
type UniqueQueue struct {
lock sync.Mutex
queue TimedQueue
set sets.String
}
// Add a new value to the queue if it wasn't added before, or was
// explicitly removed by the Remove call. Returns true if new value
// was added.
func (q *UniqueQueue) Add(value TimedValue) bool {
q.lock.Lock()
defer q.lock.Unlock()
if q.set.Has(value.Value) {
return false
}
heap.Push(&q.queue, &value)
q.set.Insert(value.Value)
return true
}
// Replace replaces an existing value in the queue if it already
// exists, otherwise it does nothing. Returns true if the item was
// found.
func (q *UniqueQueue) Replace(value TimedValue) bool {
q.lock.Lock()
defer q.lock.Unlock()
for i := range q.queue {
if q.queue[i].Value != value.Value {
continue
}
heap.Remove(&q.queue, i)
heap.Push(&q.queue, &value)
return true
}
return false
}
// RemoveFromQueue the value from the queue, but keeps it in the set,
// so it won't be added second time. Returns true if something was
// removed.
func (q *UniqueQueue) RemoveFromQueue(value string) bool {
q.lock.Lock()
defer q.lock.Unlock()
if !q.set.Has(value) {
return false
}
for i, val := range q.queue {
if val.Value == value {
heap.Remove(&q.queue, i)
return true
}
}
return false
}
// Remove the value from the queue, so Get() call won't return it, and
// allow subsequent addition of the given value. If the value is not
// present does nothing and returns false.
func (q *UniqueQueue) Remove(value string) bool {
q.lock.Lock()
defer q.lock.Unlock()
if !q.set.Has(value) {
return false
}
q.set.Delete(value)
for i, val := range q.queue {
if val.Value == value {
heap.Remove(&q.queue, i)
return true
}
}
return true
}
// Get returns the oldest added value that wasn't returned yet.
func (q *UniqueQueue) Get() (TimedValue, bool) {
q.lock.Lock()
defer q.lock.Unlock()
if len(q.queue) == 0 {
return TimedValue{}, false
}
result := heap.Pop(&q.queue).(*TimedValue)
q.set.Delete(result.Value)
return *result, true
}
// Head returns the oldest added value that wasn't returned yet
// without removing it.
func (q *UniqueQueue) Head() (TimedValue, bool) {
q.lock.Lock()
defer q.lock.Unlock()
if len(q.queue) == 0 {
return TimedValue{}, false
}
result := q.queue[0]
return *result, true
}
// Clear removes all items from the queue and duplication preventing
// set.
func (q *UniqueQueue) Clear() {
q.lock.Lock()
defer q.lock.Unlock()
if q.queue.Len() > 0 {
q.queue = make(TimedQueue, 0)
}
if len(q.set) > 0 {
q.set = sets.NewString()
}
}
// RateLimitedTimedQueue is a unique item priority queue ordered by
// the expected next time of execution. It is also rate limited.
type RateLimitedTimedQueue struct {
queue UniqueQueue
limiterLock sync.Mutex
limiter flowcontrol.RateLimiter
}
// NewRateLimitedTimedQueue creates new queue which will use given
// RateLimiter to oversee execution.
func NewRateLimitedTimedQueue(limiter flowcontrol.RateLimiter) *RateLimitedTimedQueue {
return &RateLimitedTimedQueue{
queue: UniqueQueue{
queue: TimedQueue{},
set: sets.NewString(),
},
limiter: limiter,
}
}
// ActionFunc takes a timed value and returns false if the item must
// be retried, with an optional time.Duration if some minimum wait
// interval should be used.
type ActionFunc func(TimedValue) (bool, time.Duration)
// Try processes the queue. Ends prematurely if RateLimiter forbids an
// action and leak is true. Otherwise, requeues the item to be
// processed. Each value is processed once if fn returns true,
// otherwise it is added back to the queue. The returned remaining is
// used to identify the minimum time to execute the next item in the
// queue. The same value is processed only once unless Remove is
// explicitly called on it (it's done by the cancelPodEviction
// function in NodeController when Node becomes Ready again) TODO:
// figure out a good way to do garbage collection for all Nodes that
// were removed from the cluster.
func (q *RateLimitedTimedQueue) Try(fn ActionFunc) {
val, ok := q.queue.Head()
q.limiterLock.Lock()
defer q.limiterLock.Unlock()
for ok {
// rate limit the queue checking
if !q.limiter.TryAccept() {
klog.V(10).Infof("Try rate limited for value: %v", val)
// Try again later
break
}
now := now()
if now.Before(val.ProcessAt) {
break
}
if ok, wait := fn(val); !ok {
val.ProcessAt = now.Add(wait + 1)
q.queue.Replace(val)
} else {
q.queue.RemoveFromQueue(val.Value)
}
val, ok = q.queue.Head()
}
}
// Add value to the queue to be processed. Won't add the same
// value (comparison by value) a second time if it was already added
// and not removed.
func (q *RateLimitedTimedQueue) Add(value string, uid interface{}) bool {
now := now()
return q.queue.Add(TimedValue{
Value: value,
UID: uid,
AddedAt: now,
ProcessAt: now,
})
}
// Remove Node from the Evictor. The Node won't be processed until
// added again.
func (q *RateLimitedTimedQueue) Remove(value string) bool {
return q.queue.Remove(value)
}
// Clear removes all items from the queue
func (q *RateLimitedTimedQueue) Clear() {
q.queue.Clear()
}
// SwapLimiter safely swaps current limiter for this queue with the
// passed one if capacities or qps's differ.
func (q *RateLimitedTimedQueue) SwapLimiter(newQPS float32) {
q.limiterLock.Lock()
defer q.limiterLock.Unlock()
if q.limiter.QPS() == newQPS {
return
}
var newLimiter flowcontrol.RateLimiter
if newQPS <= 0 {
newLimiter = flowcontrol.NewFakeNeverRateLimiter()
} else {
newLimiter = flowcontrol.NewTokenBucketRateLimiter(newQPS, EvictionRateLimiterBurst)
// If we're currently waiting on limiter, we drain the new one - this is a good approach when Burst value is 1
// TODO: figure out if we need to support higher Burst values and decide on the drain logic, should we keep:
// - saturation (percentage of used tokens)
// - number of used tokens
// - number of available tokens
// - something else
if q.limiter.TryAccept() == false {
newLimiter.TryAccept()
}
}
q.limiter.Stop()
q.limiter = newLimiter
}
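A same-package usage sketch (the function and QPS value are hypothetical) for the queue above: nodes are added with their enqueue time and drained by Try, which honors both each item's ProcessAt time and the flowcontrol rate limiter.

package scheduler

import (
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func exampleEvictionLoop(stopCh <-chan struct{}) {
	q := NewRateLimitedTimedQueue(flowcontrol.NewTokenBucketRateLimiter(0.1, EvictionRateLimiterBurst))
	q.Add("node-1", "uid-1") // eligible for processing immediately, subject to rate limiting

	for {
		select {
		case <-stopCh:
			return
		default:
		}
		q.Try(func(v TimedValue) (bool, time.Duration) {
			// Evict pods from the node named v.Value here; returning false with a
			// wait duration re-queues the item to be retried later.
			return true, 0
		})
		time.Sleep(NodeEvictionPeriod)
	}
}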

View File

@@ -1,496 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"context"
"fmt"
"hash/fnv"
"io"
"math"
"sync"
"time"
"k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
"k8s.io/kubernetes/pkg/apis/core/helper"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/klog"
)
const (
// TODO (k82cn): Figure out a reasonable number of workers/channels and propagate
// the number of workers up making it a parameter of Run() function.
// NodeUpdateChannelSize defines the size of channel for node update events.
NodeUpdateChannelSize = 10
// UpdateWorkerSize defines the size of workers for node update or/and pod update.
UpdateWorkerSize = 8
podUpdateChannelSize = 1
retries = 5
)
type nodeUpdateItem struct {
nodeName string
}
type podUpdateItem struct {
podName string
podNamespace string
nodeName string
}
func hash(val string, max int) int {
hasher := fnv.New32a()
io.WriteString(hasher, val)
return int(hasher.Sum32() % uint32(max))
}
// GetPodFunc returns the pod for the specified name/namespace, or a NotFound error if missing.
type GetPodFunc func(name, namespace string) (*v1.Pod, error)
// GetNodeFunc returns the node for the specified name, or a NotFound error if missing.
type GetNodeFunc func(name string) (*v1.Node, error)
// GetPodsByNodeNameFunc returns the list of pods assigned to the specified node.
type GetPodsByNodeNameFunc func(nodeName string) ([]*v1.Pod, error)
// NoExecuteTaintManager listens to Taint/Toleration changes and is responsible for removing Pods
// from Nodes tainted with NoExecute Taints.
type NoExecuteTaintManager struct {
client clientset.Interface
recorder record.EventRecorder
getPod GetPodFunc
getNode GetNodeFunc
getPodsAssignedToNode GetPodsByNodeNameFunc
taintEvictionQueue *TimedWorkerQueue
// keeps a map from nodeName to all noExecute taints on that Node
taintedNodesLock sync.Mutex
taintedNodes map[string][]v1.Taint
nodeUpdateChannels []chan nodeUpdateItem
podUpdateChannels []chan podUpdateItem
nodeUpdateQueue workqueue.Interface
podUpdateQueue workqueue.Interface
}
func deletePodHandler(c clientset.Interface, emitEventFunc func(types.NamespacedName)) func(args *WorkArgs) error {
return func(args *WorkArgs) error {
ns := args.NamespacedName.Namespace
name := args.NamespacedName.Name
klog.V(0).Infof("NoExecuteTaintManager is deleting Pod: %v", args.NamespacedName.String())
if emitEventFunc != nil {
emitEventFunc(args.NamespacedName)
}
var err error
for i := 0; i < retries; i++ {
err = c.CoreV1().Pods(ns).Delete(context.TODO(), name, metav1.DeleteOptions{})
if err == nil {
break
}
time.Sleep(10 * time.Millisecond)
}
return err
}
}
func getNoExecuteTaints(taints []v1.Taint) []v1.Taint {
result := []v1.Taint{}
for i := range taints {
if taints[i].Effect == v1.TaintEffectNoExecute {
result = append(result, taints[i])
}
}
return result
}
// getMinTolerationTime returns minimal toleration time from the given slice, or -1 if it's infinite.
func getMinTolerationTime(tolerations []v1.Toleration) time.Duration {
minTolerationTime := int64(math.MaxInt64)
if len(tolerations) == 0 {
return 0
}
for i := range tolerations {
if tolerations[i].TolerationSeconds != nil {
tolerationSeconds := *(tolerations[i].TolerationSeconds)
if tolerationSeconds <= 0 {
return 0
} else if tolerationSeconds < minTolerationTime {
minTolerationTime = tolerationSeconds
}
}
}
if minTolerationTime == int64(math.MaxInt64) {
return -1
}
return time.Duration(minTolerationTime) * time.Second
}
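A quick same-package illustration (the exampleMinToleration helper is hypothetical) of getMinTolerationTime's contract: zero means evict immediately, a positive duration delays eviction, and a negative result means the pod tolerates the taint forever.

package scheduler

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/utils/pointer"
)

func exampleMinToleration() {
	// No tolerations: evict immediately.
	fmt.Println(getMinTolerationTime(nil)) // 0s

	// One toleration of 300 seconds: eviction is delayed by five minutes.
	fmt.Println(getMinTolerationTime([]v1.Toleration{{TolerationSeconds: pointer.Int64Ptr(300)}})) // 5m0s

	// Nil TolerationSeconds: tolerated forever, reported as a negative duration.
	fmt.Println(getMinTolerationTime([]v1.Toleration{{}})) // -1ns
}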
// NewNoExecuteTaintManager creates a new NoExecuteTaintManager that will use passed clientset to
// communicate with the API server.
func NewNoExecuteTaintManager(c clientset.Interface, getPod GetPodFunc, getNode GetNodeFunc, getPodsAssignedToNode GetPodsByNodeNameFunc) *NoExecuteTaintManager {
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "taint-controller"})
eventBroadcaster.StartLogging(klog.Infof)
if c != nil {
klog.V(0).Infof("Sending events to api server.")
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: c.CoreV1().Events("")})
} else {
klog.Fatalf("kubeClient is nil when starting NodeController")
}
tm := &NoExecuteTaintManager{
client: c,
recorder: recorder,
getPod: getPod,
getNode: getNode,
getPodsAssignedToNode: getPodsAssignedToNode,
taintedNodes: make(map[string][]v1.Taint),
nodeUpdateQueue: workqueue.NewNamed("noexec_taint_node"),
podUpdateQueue: workqueue.NewNamed("noexec_taint_pod"),
}
tm.taintEvictionQueue = CreateWorkerQueue(deletePodHandler(c, tm.emitPodDeletionEvent))
return tm
}
// Run starts NoExecuteTaintManager, which will run in a loop until `stopCh` is closed.
func (tc *NoExecuteTaintManager) Run(stopCh <-chan struct{}) {
klog.V(0).Infof("Starting NoExecuteTaintManager")
for i := 0; i < UpdateWorkerSize; i++ {
tc.nodeUpdateChannels = append(tc.nodeUpdateChannels, make(chan nodeUpdateItem, NodeUpdateChannelSize))
tc.podUpdateChannels = append(tc.podUpdateChannels, make(chan podUpdateItem, podUpdateChannelSize))
}
// Functions that are responsible for taking work items out of the workqueues and putting them
// into channels.
go func(stopCh <-chan struct{}) {
for {
item, shutdown := tc.nodeUpdateQueue.Get()
if shutdown {
break
}
nodeUpdate := item.(nodeUpdateItem)
hash := hash(nodeUpdate.nodeName, UpdateWorkerSize)
select {
case <-stopCh:
tc.nodeUpdateQueue.Done(item)
return
case tc.nodeUpdateChannels[hash] <- nodeUpdate:
// tc.nodeUpdateQueue.Done is called by the nodeUpdateChannels worker
}
}
}(stopCh)
go func(stopCh <-chan struct{}) {
for {
item, shutdown := tc.podUpdateQueue.Get()
if shutdown {
break
}
// The fact that pods are processed by the same worker as nodes is used to avoid races
// between node worker setting tc.taintedNodes and pod worker reading this to decide
// whether to delete pod.
// It's possible that even without this assumption this code is still correct.
podUpdate := item.(podUpdateItem)
hash := hash(podUpdate.nodeName, UpdateWorkerSize)
select {
case <-stopCh:
tc.podUpdateQueue.Done(item)
return
case tc.podUpdateChannels[hash] <- podUpdate:
// tc.podUpdateQueue.Done is called by the podUpdateChannels worker
}
}
}(stopCh)
wg := sync.WaitGroup{}
wg.Add(UpdateWorkerSize)
for i := 0; i < UpdateWorkerSize; i++ {
go tc.worker(i, wg.Done, stopCh)
}
wg.Wait()
}
func (tc *NoExecuteTaintManager) worker(worker int, done func(), stopCh <-chan struct{}) {
defer done()
// When processing events we want to prioritize Node updates over Pod updates,
// as NodeUpdates that interest NoExecuteTaintManager should be handled as soon as possible -
// we don't want user (or system) to wait until PodUpdate queue is drained before it can
// start evicting Pods from tainted Nodes.
for {
select {
case <-stopCh:
return
case nodeUpdate := <-tc.nodeUpdateChannels[worker]:
tc.handleNodeUpdate(nodeUpdate)
tc.nodeUpdateQueue.Done(nodeUpdate)
case podUpdate := <-tc.podUpdateChannels[worker]:
// If we found a Pod update we need to empty Node queue first.
priority:
for {
select {
case nodeUpdate := <-tc.nodeUpdateChannels[worker]:
tc.handleNodeUpdate(nodeUpdate)
tc.nodeUpdateQueue.Done(nodeUpdate)
default:
break priority
}
}
// After Node queue is emptied we process podUpdate.
tc.handlePodUpdate(podUpdate)
tc.podUpdateQueue.Done(podUpdate)
}
}
}
// PodUpdated is used to notify NoExecuteTaintManager about Pod changes.
func (tc *NoExecuteTaintManager) PodUpdated(oldPod *v1.Pod, newPod *v1.Pod) {
podName := ""
podNamespace := ""
nodeName := ""
oldTolerations := []v1.Toleration{}
if oldPod != nil {
podName = oldPod.Name
podNamespace = oldPod.Namespace
nodeName = oldPod.Spec.NodeName
oldTolerations = oldPod.Spec.Tolerations
}
newTolerations := []v1.Toleration{}
if newPod != nil {
podName = newPod.Name
podNamespace = newPod.Namespace
nodeName = newPod.Spec.NodeName
newTolerations = newPod.Spec.Tolerations
}
if oldPod != nil && newPod != nil && helper.Semantic.DeepEqual(oldTolerations, newTolerations) && oldPod.Spec.NodeName == newPod.Spec.NodeName {
return
}
updateItem := podUpdateItem{
podName: podName,
podNamespace: podNamespace,
nodeName: nodeName,
}
tc.podUpdateQueue.Add(updateItem)
}
// NodeUpdated is used to notify NoExecuteTaintManager about Node changes.
func (tc *NoExecuteTaintManager) NodeUpdated(oldNode *v1.Node, newNode *v1.Node) {
nodeName := ""
oldTaints := []v1.Taint{}
if oldNode != nil {
nodeName = oldNode.Name
oldTaints = getNoExecuteTaints(oldNode.Spec.Taints)
}
newTaints := []v1.Taint{}
if newNode != nil {
nodeName = newNode.Name
newTaints = getNoExecuteTaints(newNode.Spec.Taints)
}
if oldNode != nil && newNode != nil && helper.Semantic.DeepEqual(oldTaints, newTaints) {
return
}
updateItem := nodeUpdateItem{
nodeName: nodeName,
}
tc.nodeUpdateQueue.Add(updateItem)
}
func (tc *NoExecuteTaintManager) cancelWorkWithEvent(nsName types.NamespacedName) {
if tc.taintEvictionQueue.CancelWork(nsName.String()) {
tc.emitCancelPodDeletionEvent(nsName)
}
}
func (tc *NoExecuteTaintManager) processPodOnNode(
podNamespacedName types.NamespacedName,
nodeName string,
tolerations []v1.Toleration,
taints []v1.Taint,
now time.Time,
) {
if len(taints) == 0 {
tc.cancelWorkWithEvent(podNamespacedName)
}
allTolerated, usedTolerations := v1helper.GetMatchingTolerations(taints, tolerations)
if !allTolerated {
klog.V(2).Infof("Not all taints are tolerated after update for Pod %v on %v", podNamespacedName.String(), nodeName)
// We're canceling scheduled work (if any), as we're going to delete the Pod right away.
tc.cancelWorkWithEvent(podNamespacedName)
tc.taintEvictionQueue.AddWork(NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), time.Now(), time.Now())
return
}
minTolerationTime := getMinTolerationTime(usedTolerations)
// getMinTolerationTime returns negative value to denote infinite toleration.
if minTolerationTime < 0 {
klog.V(4).Infof("New tolerations for %v tolerate forever. Scheduled deletion won't be cancelled if already scheduled.", podNamespacedName.String())
return
}
startTime := now
triggerTime := startTime.Add(minTolerationTime)
scheduledEviction := tc.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String())
if scheduledEviction != nil {
startTime = scheduledEviction.CreatedAt
if startTime.Add(minTolerationTime).Before(triggerTime) {
return
}
tc.cancelWorkWithEvent(podNamespacedName)
}
tc.taintEvictionQueue.AddWork(NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), startTime, triggerTime)
}
func (tc *NoExecuteTaintManager) handlePodUpdate(podUpdate podUpdateItem) {
pod, err := tc.getPod(podUpdate.podName, podUpdate.podNamespace)
if err != nil {
if apierrors.IsNotFound(err) {
// Delete
podNamespacedName := types.NamespacedName{Namespace: podUpdate.podNamespace, Name: podUpdate.podName}
klog.V(4).Infof("Noticed pod deletion: %#v", podNamespacedName)
tc.cancelWorkWithEvent(podNamespacedName)
return
}
utilruntime.HandleError(fmt.Errorf("could not get pod %s/%s: %v", podUpdate.podName, podUpdate.podNamespace, err))
return
}
// We key the workqueue and shard workers by nodeName. If we don't match the current state we should not be the one processing the current object.
if pod.Spec.NodeName != podUpdate.nodeName {
return
}
// Create or Update
podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}
klog.V(4).Infof("Noticed pod update: %#v", podNamespacedName)
nodeName := pod.Spec.NodeName
if nodeName == "" {
return
}
taints, ok := func() ([]v1.Taint, bool) {
tc.taintedNodesLock.Lock()
defer tc.taintedNodesLock.Unlock()
taints, ok := tc.taintedNodes[nodeName]
return taints, ok
}()
// It's possible that Node was deleted, or Taints were removed before, which triggered
// eviction cancelling if it was needed.
if !ok {
return
}
tc.processPodOnNode(podNamespacedName, nodeName, pod.Spec.Tolerations, taints, time.Now())
}
func (tc *NoExecuteTaintManager) handleNodeUpdate(nodeUpdate nodeUpdateItem) {
node, err := tc.getNode(nodeUpdate.nodeName)
if err != nil {
if apierrors.IsNotFound(err) {
// Delete
klog.V(4).Infof("Noticed node deletion: %#v", nodeUpdate.nodeName)
tc.taintedNodesLock.Lock()
defer tc.taintedNodesLock.Unlock()
delete(tc.taintedNodes, nodeUpdate.nodeName)
return
}
utilruntime.HandleError(fmt.Errorf("cannot get node %s: %v", nodeUpdate.nodeName, err))
return
}
// Create or Update
klog.V(4).Infof("Noticed node update: %#v", nodeUpdate)
taints := getNoExecuteTaints(node.Spec.Taints)
func() {
tc.taintedNodesLock.Lock()
defer tc.taintedNodesLock.Unlock()
klog.V(4).Infof("Updating known taints on node %v: %v", node.Name, taints)
if len(taints) == 0 {
delete(tc.taintedNodes, node.Name)
} else {
tc.taintedNodes[node.Name] = taints
}
}()
// This is critical that we update tc.taintedNodes before we call getPodsAssignedToNode:
// getPodsAssignedToNode can be delayed as long as all future updates to pods will call
// tc.PodUpdated which will use tc.taintedNodes to potentially delete delayed pods.
pods, err := tc.getPodsAssignedToNode(node.Name)
if err != nil {
klog.Errorf(err.Error())
return
}
if len(pods) == 0 {
return
}
// Short circuit, to make this controller a bit faster.
if len(taints) == 0 {
klog.V(4).Infof("All taints were removed from the Node %v. Cancelling all evictions...", node.Name)
for i := range pods {
tc.cancelWorkWithEvent(types.NamespacedName{Namespace: pods[i].Namespace, Name: pods[i].Name})
}
return
}
now := time.Now()
for _, pod := range pods {
podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}
tc.processPodOnNode(podNamespacedName, node.Name, pod.Spec.Tolerations, taints, now)
}
}
func (tc *NoExecuteTaintManager) emitPodDeletionEvent(nsName types.NamespacedName) {
if tc.recorder == nil {
return
}
ref := &v1.ObjectReference{
Kind: "Pod",
Name: nsName.Name,
Namespace: nsName.Namespace,
}
tc.recorder.Eventf(ref, v1.EventTypeNormal, "TaintManagerEviction", "Marking for deletion Pod %s", nsName.String())
}
func (tc *NoExecuteTaintManager) emitCancelPodDeletionEvent(nsName types.NamespacedName) {
if tc.recorder == nil {
return
}
ref := &v1.ObjectReference{
Kind: "Pod",
Name: nsName.Name,
Namespace: nsName.Namespace,
}
tc.recorder.Eventf(ref, v1.EventTypeNormal, "TaintManagerEviction", "Cancelling deletion of Pod %s", nsName.String())
}

View File

@@ -1,145 +0,0 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"sync"
"time"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog"
)
// WorkArgs keeps arguments that will be passed to the function executed by the worker.
type WorkArgs struct {
NamespacedName types.NamespacedName
}
// KeyFromWorkArgs creates a key for the given `WorkArgs`
func (w *WorkArgs) KeyFromWorkArgs() string {
return w.NamespacedName.String()
}
// NewWorkArgs is a helper function to create new `WorkArgs`
func NewWorkArgs(name, namespace string) *WorkArgs {
return &WorkArgs{types.NamespacedName{Namespace: namespace, Name: name}}
}
// TimedWorker is responsible for executing a function no earlier than at FireAt time.
type TimedWorker struct {
WorkItem *WorkArgs
CreatedAt time.Time
FireAt time.Time
Timer *time.Timer
}
// CreateWorker creates a TimedWorker that will execute `f` not earlier than `fireAt`.
func CreateWorker(args *WorkArgs, createdAt time.Time, fireAt time.Time, f func(args *WorkArgs) error) *TimedWorker {
delay := fireAt.Sub(createdAt)
if delay <= 0 {
go f(args)
return nil
}
timer := time.AfterFunc(delay, func() { f(args) })
return &TimedWorker{
WorkItem: args,
CreatedAt: createdAt,
FireAt: fireAt,
Timer: timer,
}
}
// Cancel cancels the execution of a function by the `TimedWorker`
func (w *TimedWorker) Cancel() {
if w != nil {
w.Timer.Stop()
}
}
// TimedWorkerQueue keeps a set of TimedWorkers that are still waiting for execution.
type TimedWorkerQueue struct {
sync.Mutex
// map of workers keyed by string returned by 'KeyFromWorkArgs' from the given worker.
workers map[string]*TimedWorker
workFunc func(args *WorkArgs) error
}
// CreateWorkerQueue creates a new TimedWorkerQueue for workers that will execute
// given function `f`.
func CreateWorkerQueue(f func(args *WorkArgs) error) *TimedWorkerQueue {
return &TimedWorkerQueue{
workers: make(map[string]*TimedWorker),
workFunc: f,
}
}
func (q *TimedWorkerQueue) getWrappedWorkerFunc(key string) func(args *WorkArgs) error {
return func(args *WorkArgs) error {
err := q.workFunc(args)
q.Lock()
defer q.Unlock()
if err == nil {
// To avoid duplicated calls we keep the key in the queue, to prevent
// subsequent additions.
q.workers[key] = nil
} else {
delete(q.workers, key)
}
return err
}
}
// AddWork adds work to the WorkerQueue that will be executed no earlier than `fireAt`.
func (q *TimedWorkerQueue) AddWork(args *WorkArgs, createdAt time.Time, fireAt time.Time) {
key := args.KeyFromWorkArgs()
klog.V(4).Infof("Adding TimedWorkerQueue item %v at %v to be fired at %v", key, createdAt, fireAt)
q.Lock()
defer q.Unlock()
if _, exists := q.workers[key]; exists {
klog.Warningf("Trying to add already existing work for %+v. Skipping.", args)
return
}
worker := CreateWorker(args, createdAt, fireAt, q.getWrappedWorkerFunc(key))
q.workers[key] = worker
}
// CancelWork removes scheduled function execution from the queue. Returns true if work was cancelled.
func (q *TimedWorkerQueue) CancelWork(key string) bool {
q.Lock()
defer q.Unlock()
worker, found := q.workers[key]
result := false
if found {
klog.V(4).Infof("Cancelling TimedWorkerQueue item %v at %v", key, time.Now())
if worker != nil {
result = true
worker.Cancel()
}
delete(q.workers, key)
}
return result
}
// GetWorkerUnsafe returns a TimedWorker corresponding to the given key.
// Unsafe method - workers have attached goroutines which can fire after this function is called.
func (q *TimedWorkerQueue) GetWorkerUnsafe(key string) *TimedWorker {
q.Lock()
defer q.Unlock()
return q.workers[key]
}
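A brief, hypothetical usage sketch of the queue above (not part of the vendored file); it assumes the fragment is compiled alongside the scheduler package so CreateWorkerQueue, NewWorkArgs, AddWork and CancelWork are in scope.
import (
	"fmt"
	"time"
)

// exampleTimedWorkerQueue schedules delayed work keyed by namespace/name and
// then cancels it before the timer fires.
func exampleTimedWorkerQueue() {
	q := CreateWorkerQueue(func(args *WorkArgs) error {
		fmt.Printf("evicting %v\n", args.NamespacedName)
		return nil
	})
	args := NewWorkArgs("mypod", "default")
	// Fire roughly five seconds from now.
	q.AddWork(args, time.Now(), time.Now().Add(5*time.Second))
	// Changing our mind before the timer fires cancels the pending work.
	cancelled := q.CancelWork(args.KeyFromWorkArgs())
	fmt.Println("cancelled:", cancelled) // true
}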

View File

@ -1,39 +0,0 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["controller_utils.go"],
importpath = "k8s.io/kubernetes/pkg/controller/util/node",
visibility = ["//visibility:public"],
deps = [
"//pkg/api/v1/pod:go_default_library",
"//pkg/controller:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/util/node:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/runtime:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/listers/apps/v1:go_default_library",
"//staging/src/k8s.io/client-go/tools/cache:go_default_library",
"//staging/src/k8s.io/client-go/tools/record:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -1,299 +0,0 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node
import (
"context"
"fmt"
"strings"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
appsv1listers "k8s.io/client-go/listers/apps/v1"
utilpod "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/kubelet/util/format"
nodepkg "k8s.io/kubernetes/pkg/util/node"
"k8s.io/klog"
)
// DeletePods deletes from the master all pods running on the given node,
// and returns true if any pods were deleted, or were found pending
// deletion.
func DeletePods(kubeClient clientset.Interface, pods []*v1.Pod, recorder record.EventRecorder, nodeName, nodeUID string, daemonStore appsv1listers.DaemonSetLister) (bool, error) {
remaining := false
var updateErrList []error
if len(pods) > 0 {
RecordNodeEvent(recorder, nodeName, nodeUID, v1.EventTypeNormal, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeName))
}
for i := range pods {
// Defensive check, also needed for tests.
if pods[i].Spec.NodeName != nodeName {
continue
}
		// Pod will be modified, so making a copy is required.
pod := pods[i].DeepCopy()
// Set reason and message in the pod object.
if _, err := SetPodTerminationReason(kubeClient, pod, nodeName); err != nil {
if apierrors.IsConflict(err) {
updateErrList = append(updateErrList,
fmt.Errorf("update status failed for pod %q: %v", format.Pod(pod), err))
continue
}
}
		// If the pod has already been marked for deletion, we still report that there are remaining pods.
if pod.DeletionGracePeriodSeconds != nil {
remaining = true
continue
}
// if the pod is managed by a daemonset, ignore it
if _, err := daemonStore.GetPodDaemonSets(pod); err == nil {
// No error means at least one daemonset was found
continue
}
klog.V(2).Infof("Starting deletion of pod %v/%v", pod.Namespace, pod.Name)
recorder.Eventf(pod, v1.EventTypeNormal, "NodeControllerEviction", "Marking for deletion Pod %s from Node %s", pod.Name, nodeName)
if err := kubeClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{}); err != nil {
if apierrors.IsNotFound(err) {
// NotFound error means that pod was already deleted.
// There is nothing left to do with this pod.
continue
}
return false, err
}
remaining = true
}
if len(updateErrList) > 0 {
return false, utilerrors.NewAggregate(updateErrList)
}
return remaining, nil
}
// SetPodTerminationReason attempts to set a reason and message in the
// pod status, updates it in the apiserver, and returns an error if it
// encounters one.
func SetPodTerminationReason(kubeClient clientset.Interface, pod *v1.Pod, nodeName string) (*v1.Pod, error) {
if pod.Status.Reason == nodepkg.NodeUnreachablePodReason {
return pod, nil
}
pod.Status.Reason = nodepkg.NodeUnreachablePodReason
pod.Status.Message = fmt.Sprintf(nodepkg.NodeUnreachablePodMessage, nodeName, pod.Name)
var updatedPod *v1.Pod
var err error
if updatedPod, err = kubeClient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}); err != nil {
return nil, err
}
return updatedPod, nil
}
// MarkPodsNotReady updates the ready status of the given pods running on the
// given node from the master, and returns an aggregate error if any update fails.
func MarkPodsNotReady(kubeClient clientset.Interface, pods []*v1.Pod, nodeName string) error {
klog.V(2).Infof("Update ready status of pods on node [%v]", nodeName)
errMsg := []string{}
for i := range pods {
// Defensive check, also needed for tests.
if pods[i].Spec.NodeName != nodeName {
continue
}
		// Pod will be modified, so making a copy is required.
pod := pods[i].DeepCopy()
for _, cond := range pod.Status.Conditions {
if cond.Type == v1.PodReady {
cond.Status = v1.ConditionFalse
if !utilpod.UpdatePodCondition(&pod.Status, &cond) {
break
}
klog.V(2).Infof("Updating ready status of pod %v to false", pod.Name)
_, err := kubeClient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
// NotFound error means that pod was already deleted.
// There is nothing left to do with this pod.
continue
}
klog.Warningf("Failed to update status for pod %q: %v", format.Pod(pod), err)
errMsg = append(errMsg, fmt.Sprintf("%v", err))
}
break
}
}
}
if len(errMsg) == 0 {
return nil
}
return fmt.Errorf("%v", strings.Join(errMsg, "; "))
}
// RecordNodeEvent records an event related to a node.
func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
ref := &v1.ObjectReference{
APIVersion: "v1",
Kind: "Node",
Name: nodeName,
UID: types.UID(nodeUID),
Namespace: "",
}
klog.V(2).Infof("Recording %s event message for node %s", event, nodeName)
recorder.Eventf(ref, eventtype, reason, "Node %s event: %s", nodeName, event)
}
// RecordNodeStatusChange records an event related to a node status change. (Common to lifecycle and ipam)
func RecordNodeStatusChange(recorder record.EventRecorder, node *v1.Node, newStatus string) {
ref := &v1.ObjectReference{
APIVersion: "v1",
Kind: "Node",
Name: node.Name,
UID: node.UID,
Namespace: "",
}
klog.V(2).Infof("Recording status change %s event message for node %s", newStatus, node.Name)
// TODO: This requires a transaction, either both node status is updated
// and event is recorded or neither should happen, see issue #6055.
recorder.Eventf(ref, v1.EventTypeNormal, newStatus, "Node %s status is now: %s", node.Name, newStatus)
}
// SwapNodeControllerTaint adds the given taints to and removes the given taints
// from the node; it returns true in case of success and false otherwise.
func SwapNodeControllerTaint(kubeClient clientset.Interface, taintsToAdd, taintsToRemove []*v1.Taint, node *v1.Node) bool {
for _, taintToAdd := range taintsToAdd {
now := metav1.Now()
taintToAdd.TimeAdded = &now
}
err := controller.AddOrUpdateTaintOnNode(kubeClient, node.Name, taintsToAdd...)
if err != nil {
utilruntime.HandleError(
fmt.Errorf(
"unable to taint %+v unresponsive Node %q: %v",
taintsToAdd,
node.Name,
err))
return false
}
klog.V(4).Infof("Added %+v Taint to Node %v", taintsToAdd, node.Name)
err = controller.RemoveTaintOffNode(kubeClient, node.Name, node, taintsToRemove...)
if err != nil {
utilruntime.HandleError(
fmt.Errorf(
"unable to remove %+v unneeded taint from unresponsive Node %q: %v",
taintsToRemove,
node.Name,
err))
return false
}
klog.V(4).Infof("Made sure that Node %+v has no %v Taint", node.Name, taintsToRemove)
return true
}
// AddOrUpdateLabelsOnNode updates the labels on the node and returns true on
// success and false on failure.
func AddOrUpdateLabelsOnNode(kubeClient clientset.Interface, labelsToUpdate map[string]string, node *v1.Node) bool {
err := controller.AddOrUpdateLabelsOnNode(kubeClient, node.Name, labelsToUpdate)
if err != nil {
utilruntime.HandleError(
fmt.Errorf(
"unable to update labels %+v for Node %q: %v",
labelsToUpdate,
node.Name,
err))
return false
}
klog.V(4).Infof("Updated labels %+v to Node %v", labelsToUpdate, node.Name)
return true
}
// CreateAddNodeHandler creates an add node handler.
func CreateAddNodeHandler(f func(node *v1.Node) error) func(obj interface{}) {
return func(originalObj interface{}) {
node := originalObj.(*v1.Node).DeepCopy()
if err := f(node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error while processing Node Add: %v", err))
}
}
}
// CreateUpdateNodeHandler creates a node update handler. (Common to lifecycle and ipam)
func CreateUpdateNodeHandler(f func(oldNode, newNode *v1.Node) error) func(oldObj, newObj interface{}) {
return func(origOldObj, origNewObj interface{}) {
node := origNewObj.(*v1.Node).DeepCopy()
prevNode := origOldObj.(*v1.Node).DeepCopy()
if err := f(prevNode, node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error while processing Node Add/Delete: %v", err))
}
}
}
// CreateDeleteNodeHandler creates a delete node handler. (Common to lifecycle and ipam)
func CreateDeleteNodeHandler(f func(node *v1.Node) error) func(obj interface{}) {
return func(originalObj interface{}) {
originalNode, isNode := originalObj.(*v1.Node)
// We can get DeletedFinalStateUnknown instead of *v1.Node here and
// we need to handle that correctly. #34692
if !isNode {
deletedState, ok := originalObj.(cache.DeletedFinalStateUnknown)
if !ok {
klog.Errorf("Received unexpected object: %v", originalObj)
return
}
originalNode, ok = deletedState.Obj.(*v1.Node)
if !ok {
klog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
return
}
}
node := originalNode.DeepCopy()
if err := f(node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error while processing Node Add/Delete: %v", err))
}
}
}
// GetNodeCondition extracts the provided condition from the given status.
// It returns the index of the located condition and a pointer to it, or -1 and nil if the condition is not present.
func GetNodeCondition(status *v1.NodeStatus, conditionType v1.NodeConditionType) (int, *v1.NodeCondition) {
if status == nil {
return -1, nil
}
for i := range status.Conditions {
if status.Conditions[i].Type == conditionType {
return i, &status.Conditions[i]
}
}
return -1, nil
}
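As an illustrative aside (not part of the vendored file), a minimal sketch of looking up a node's Ready condition with GetNodeCondition above; exampleNodeReady is a hypothetical helper assumed to compile within the same package.
import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// exampleNodeReady reports whether the node's Ready condition is True,
// using the GetNodeCondition helper defined above.
func exampleNodeReady(node *v1.Node) bool {
	idx, cond := GetNodeCondition(&node.Status, v1.NodeReady)
	if cond == nil {
		fmt.Println("node has no Ready condition")
		return false
	}
	fmt.Printf("Ready condition at index %d: status=%s reason=%q\n", idx, cond.Status, cond.Reason)
	return cond.Status == v1.ConditionTrue
}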

View File

@ -1,37 +0,0 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["util.go"],
importpath = "k8s.io/kubernetes/pkg/controller/volume/persistentvolume/util",
visibility = ["//visibility:public"],
deps = [
"//pkg/apis/core/v1/helper:go_default_library",
"//pkg/features:go_default_library",
"//pkg/volume/util:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/api/storage/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/scheme:go_default_library",
"//staging/src/k8s.io/client-go/listers/storage/v1:go_default_library",
"//staging/src/k8s.io/client-go/tools/reference:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -1,360 +0,0 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"fmt"
v1 "k8s.io/api/core/v1"
storage "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/kubernetes/scheme"
storagelisters "k8s.io/client-go/listers/storage/v1"
"k8s.io/client-go/tools/reference"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
volumeutil "k8s.io/kubernetes/pkg/volume/util"
)
const (
// AnnBindCompleted Annotation applies to PVCs. It indicates that the lifecycle
// of the PVC has passed through the initial setup. This information changes how
// we interpret some observations of the state of the objects. Value of this
// Annotation does not matter.
AnnBindCompleted = "pv.kubernetes.io/bind-completed"
// AnnBoundByController annotation applies to PVs and PVCs. It indicates that
// the binding (PV->PVC or PVC->PV) was installed by the controller. The
// absence of this annotation means the binding was done by the user (i.e.
// pre-bound). Value of this annotation does not matter.
// External PV binders must bind PV the same way as PV controller, otherwise PV
// controller may not handle it correctly.
AnnBoundByController = "pv.kubernetes.io/bound-by-controller"
// AnnSelectedNode annotation is added to a PVC that has been triggered by scheduler to
// be dynamically provisioned. Its value is the name of the selected node.
AnnSelectedNode = "volume.kubernetes.io/selected-node"
// NotSupportedProvisioner is a special provisioner name which can be set
// in storage class to indicate dynamic provisioning is not supported by
// the storage.
NotSupportedProvisioner = "kubernetes.io/no-provisioner"
// AnnDynamicallyProvisioned annotation is added to a PV that has been dynamically provisioned by
// Kubernetes. Its value is name of volume plugin that created the volume.
// It serves both user (to show where a PV comes from) and Kubernetes (to
// recognize dynamically provisioned PVs in its decisions).
AnnDynamicallyProvisioned = "pv.kubernetes.io/provisioned-by"
// AnnMigratedTo annotation is added to a PVC and PV that is supposed to be
	// dynamically provisioned/deleted by its corresponding CSI driver
// through the CSIMigration feature flags. When this annotation is set the
// Kubernetes components will "stand-down" and the external-provisioner will
// act on the objects
AnnMigratedTo = "pv.kubernetes.io/migrated-to"
// AnnStorageProvisioner annotation is added to a PVC that is supposed to be dynamically
// provisioned. Its value is name of volume plugin that is supposed to provision
// a volume for this PVC.
AnnStorageProvisioner = "volume.beta.kubernetes.io/storage-provisioner"
)
// IsDelayBindingProvisioning checks if the claim is to be provisioned with delayed binding, i.e. whether the selected-node annotation is set.
func IsDelayBindingProvisioning(claim *v1.PersistentVolumeClaim) bool {
	// When the VolumeScheduling feature is enabled, the scheduler
	// signals the PV controller to start dynamic
	// provisioning by setting the "AnnSelectedNode" annotation
	// in the PVC
_, ok := claim.Annotations[AnnSelectedNode]
return ok
}
// IsDelayBindingMode checks if claim is in delay binding mode.
func IsDelayBindingMode(claim *v1.PersistentVolumeClaim, classLister storagelisters.StorageClassLister) (bool, error) {
className := v1helper.GetPersistentVolumeClaimClass(claim)
if className == "" {
return false, nil
}
class, err := classLister.Get(className)
if err != nil {
if apierrors.IsNotFound(err) {
return false, nil
}
return false, err
}
if class.VolumeBindingMode == nil {
return false, fmt.Errorf("VolumeBindingMode not set for StorageClass %q", className)
}
return *class.VolumeBindingMode == storage.VolumeBindingWaitForFirstConsumer, nil
}
// GetBindVolumeToClaim returns a new volume which is bound to the given claim. In
// addition, it returns a bool which indicates whether any modification was made to
// the original volume.
func GetBindVolumeToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, bool, error) {
dirty := false
// Check if the volume was already bound (either by user or by controller)
shouldSetBoundByController := false
if !IsVolumeBoundToClaim(volume, claim) {
shouldSetBoundByController = true
}
// The volume from method args can be pointing to watcher cache. We must not
// modify these, therefore create a copy.
volumeClone := volume.DeepCopy()
// Bind the volume to the claim if it is not bound yet
if volume.Spec.ClaimRef == nil ||
volume.Spec.ClaimRef.Name != claim.Name ||
volume.Spec.ClaimRef.Namespace != claim.Namespace ||
volume.Spec.ClaimRef.UID != claim.UID {
claimRef, err := reference.GetReference(scheme.Scheme, claim)
if err != nil {
return nil, false, fmt.Errorf("Unexpected error getting claim reference: %v", err)
}
volumeClone.Spec.ClaimRef = claimRef
dirty = true
}
// Set AnnBoundByController if it is not set yet
if shouldSetBoundByController && !metav1.HasAnnotation(volumeClone.ObjectMeta, AnnBoundByController) {
metav1.SetMetaDataAnnotation(&volumeClone.ObjectMeta, AnnBoundByController, "yes")
dirty = true
}
return volumeClone, dirty, nil
}
// IsVolumeBoundToClaim returns true if the given volume is pre-bound or bound
// to the specific claim. Both claim.Name and claim.Namespace must be equal.
// If claim.UID is present in volume.Spec.ClaimRef, it must be equal too.
func IsVolumeBoundToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) bool {
if volume.Spec.ClaimRef == nil {
return false
}
if claim.Name != volume.Spec.ClaimRef.Name || claim.Namespace != volume.Spec.ClaimRef.Namespace {
return false
}
if volume.Spec.ClaimRef.UID != "" && claim.UID != volume.Spec.ClaimRef.UID {
return false
}
return true
}
// FindMatchingVolume goes through the list of volumes to find the best matching volume
// for the claim.
//
// This function is used by both the PV controller and scheduler.
//
// delayBinding is true only in the PV controller path. When set, prebound PVs are still returned
// as a match for the claim, but unbound PVs are skipped.
//
// node is set only in the scheduler path. When set, the PV node affinity is checked against
// the node's labels.
//
// excludedVolumes is only used in the scheduler path, and is needed for evaluating multiple
// unbound PVCs for a single Pod at one time. As each PVC finds a matching PV, the chosen
// PV needs to be excluded from future matching.
func FindMatchingVolume(
claim *v1.PersistentVolumeClaim,
volumes []*v1.PersistentVolume,
node *v1.Node,
excludedVolumes map[string]*v1.PersistentVolume,
delayBinding bool) (*v1.PersistentVolume, error) {
var smallestVolume *v1.PersistentVolume
var smallestVolumeQty resource.Quantity
requestedQty := claim.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
requestedClass := v1helper.GetPersistentVolumeClaimClass(claim)
var selector labels.Selector
if claim.Spec.Selector != nil {
internalSelector, err := metav1.LabelSelectorAsSelector(claim.Spec.Selector)
if err != nil {
// should be unreachable code due to validation
return nil, fmt.Errorf("error creating internal label selector for claim: %v: %v", claimToClaimKey(claim), err)
}
selector = internalSelector
}
// Go through all available volumes with two goals:
// - find a volume that is either pre-bound by user or dynamically
// provisioned for this claim. Because of this we need to loop through
// all volumes.
// - find the smallest matching one if there is no volume pre-bound to
// the claim.
for _, volume := range volumes {
if _, ok := excludedVolumes[volume.Name]; ok {
// Skip volumes in the excluded list
continue
}
if volume.Spec.ClaimRef != nil && !IsVolumeBoundToClaim(volume, claim) {
continue
}
volumeQty := volume.Spec.Capacity[v1.ResourceStorage]
if volumeQty.Cmp(requestedQty) < 0 {
continue
}
// filter out mismatching volumeModes
if CheckVolumeModeMismatches(&claim.Spec, &volume.Spec) {
continue
}
// check if PV's DeletionTimeStamp is set, if so, skip this volume.
if utilfeature.DefaultFeatureGate.Enabled(features.StorageObjectInUseProtection) {
if volume.ObjectMeta.DeletionTimestamp != nil {
continue
}
}
nodeAffinityValid := true
if node != nil {
// Scheduler path, check that the PV NodeAffinity
// is satisfied by the node
// volumeutil.CheckNodeAffinity is the most expensive call in this loop.
// We should check cheaper conditions first or consider optimizing this function.
err := volumeutil.CheckNodeAffinity(volume, node.Labels)
if err != nil {
nodeAffinityValid = false
}
}
if IsVolumeBoundToClaim(volume, claim) {
// If PV node affinity is invalid, return no match.
// This means the prebound PV (and therefore PVC)
// is not suitable for this node.
if !nodeAffinityValid {
return nil, nil
}
return volume, nil
}
if node == nil && delayBinding {
// PV controller does not bind this claim.
// Scheduler will handle binding unbound volumes
// Scheduler path will have node != nil
continue
}
// filter out:
// - volumes in non-available phase
// - volumes whose labels don't match the claim's selector, if specified
// - volumes in Class that is not requested
// - volumes whose NodeAffinity does not match the node
if volume.Status.Phase != v1.VolumeAvailable {
			// We ignore volumes in a non-available phase, because volumes that
			// satisfy the matching criteria will be updated to available; binding
			// them now has a high chance of encountering unnecessary failures
			// due to API conflicts.
continue
} else if selector != nil && !selector.Matches(labels.Set(volume.Labels)) {
continue
}
if v1helper.GetPersistentVolumeClass(volume) != requestedClass {
continue
}
if !nodeAffinityValid {
continue
}
if node != nil {
// Scheduler path
// Check that the access modes match
if !CheckAccessModes(claim, volume) {
continue
}
}
if smallestVolume == nil || smallestVolumeQty.Cmp(volumeQty) > 0 {
smallestVolume = volume
smallestVolumeQty = volumeQty
}
}
if smallestVolume != nil {
// Found a matching volume
return smallestVolume, nil
}
return nil, nil
}
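An illustrative sketch (not part of the vendored file) of calling FindMatchingVolume from the PV-controller path described in the comment above, i.e. with no node and delayed binding disabled; the claim shown is hypothetical and the helper is assumed to compile in the same package.
import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// exampleFindMatchingVolume asks for the smallest available PV that can hold 5Gi.
func exampleFindMatchingVolume(volumes []*v1.PersistentVolume) {
	claim := &v1.PersistentVolumeClaim{
		ObjectMeta: metav1.ObjectMeta{Name: "data", Namespace: "default"},
		Spec: v1.PersistentVolumeClaimSpec{
			AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce},
			Resources: v1.ResourceRequirements{
				Requests: v1.ResourceList{v1.ResourceStorage: resource.MustParse("5Gi")},
			},
		},
	}
	// PV-controller path: no node, nothing excluded, delayBinding disabled.
	pv, err := FindMatchingVolume(claim, volumes, nil, map[string]*v1.PersistentVolume{}, false)
	if err != nil {
		fmt.Println("matching failed:", err)
		return
	}
	if pv == nil {
		fmt.Println("no available PV satisfies the claim")
		return
	}
	fmt.Println("smallest matching PV:", pv.Name)
}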
// CheckVolumeModeMismatches is a convenience method that checks whether the volumeModes of
// the PersistentVolume and the PersistentVolumeClaim mismatch.
func CheckVolumeModeMismatches(pvcSpec *v1.PersistentVolumeClaimSpec, pvSpec *v1.PersistentVolumeSpec) bool {
// In HA upgrades, we cannot guarantee that the apiserver is on a version >= controller-manager.
// So we default a nil volumeMode to filesystem
requestedVolumeMode := v1.PersistentVolumeFilesystem
if pvcSpec.VolumeMode != nil {
requestedVolumeMode = *pvcSpec.VolumeMode
}
pvVolumeMode := v1.PersistentVolumeFilesystem
if pvSpec.VolumeMode != nil {
pvVolumeMode = *pvSpec.VolumeMode
}
return requestedVolumeMode != pvVolumeMode
}
// CheckAccessModes returns true if PV satisfies all the PVC's requested AccessModes
func CheckAccessModes(claim *v1.PersistentVolumeClaim, volume *v1.PersistentVolume) bool {
pvModesMap := map[v1.PersistentVolumeAccessMode]bool{}
for _, mode := range volume.Spec.AccessModes {
pvModesMap[mode] = true
}
for _, mode := range claim.Spec.AccessModes {
_, ok := pvModesMap[mode]
if !ok {
return false
}
}
return true
}
func claimToClaimKey(claim *v1.PersistentVolumeClaim) string {
return fmt.Sprintf("%s/%s", claim.Namespace, claim.Name)
}
// GetVolumeNodeAffinity returns a VolumeNodeAffinity for given key and value.
func GetVolumeNodeAffinity(key string, value string) *v1.VolumeNodeAffinity {
return &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: key,
Operator: v1.NodeSelectorOpIn,
Values: []string{value},
},
},
},
},
},
}
}

View File

@ -1,87 +0,0 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = [
"scheduler_assume_cache.go",
"scheduler_binder.go",
"scheduler_binder_cache.go",
"scheduler_binder_fake.go",
],
importpath = "k8s.io/kubernetes/pkg/controller/volume/scheduling",
visibility = ["//visibility:public"],
deps = [
"//pkg/apis/core/v1/helper:go_default_library",
"//pkg/controller/volume/persistentvolume/util:go_default_library",
"//pkg/controller/volume/scheduling/metrics:go_default_library",
"//pkg/features:go_default_library",
"//pkg/volume/util:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/api/storage/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/meta:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/storage/etcd3:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/storage/v1:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/listers/storage/v1:go_default_library",
"//staging/src/k8s.io/client-go/tools/cache:go_default_library",
"//staging/src/k8s.io/csi-translation-lib:go_default_library",
"//staging/src/k8s.io/csi-translation-lib/plugins:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
)
go_test(
name = "go_default_test",
srcs = [
"scheduler_assume_cache_test.go",
"scheduler_binder_cache_test.go",
"scheduler_binder_test.go",
],
embed = [":go_default_library"],
deps = [
"//pkg/controller:go_default_library",
"//pkg/controller/volume/persistentvolume/testing:go_default_library",
"//pkg/controller/volume/persistentvolume/util:go_default_library",
"//pkg/features:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/api/storage/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/watch:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//staging/src/k8s.io/client-go/informers:go_default_library",
"//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/storage/v1:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library",
"//staging/src/k8s.io/client-go/testing:go_default_library",
"//staging/src/k8s.io/component-base/featuregate/testing:go_default_library",
"//vendor/k8s.io/klog:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/controller/volume/scheduling/metrics:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -1,9 +0,0 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- msau42
- cofyc
reviewers:
- msau42
- cofyc
- lichuqiang

View File

@ -1,26 +0,0 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["metrics.go"],
importpath = "k8s.io/kubernetes/pkg/controller/volume/scheduling/metrics",
visibility = ["//visibility:public"],
deps = [
"//staging/src/k8s.io/component-base/metrics:go_default_library",
"//staging/src/k8s.io/component-base/metrics/legacyregistry:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -1,67 +0,0 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
// VolumeSchedulerSubsystem - subsystem name used by scheduler
const VolumeSchedulerSubsystem = "scheduler_volume"
var (
// VolumeBindingRequestSchedulerBinderCache tracks the number of volume binder cache operations.
VolumeBindingRequestSchedulerBinderCache = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "binder_cache_requests_total",
Help: "Total number for request volume binding cache",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
// VolumeSchedulingStageLatency tracks the latency of volume scheduling operations.
VolumeSchedulingStageLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "scheduling_duration_seconds",
Help: "Volume scheduling stage latency",
Buckets: metrics.ExponentialBuckets(1000, 2, 15),
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
// VolumeSchedulingStageFailed tracks the number of failed volume scheduling operations.
VolumeSchedulingStageFailed = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "scheduling_stage_error_total",
Help: "Volume scheduling stage error count",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
)
// RegisterVolumeSchedulingMetrics is used by the scheduler, because the volume binding cache is a library
// used by the scheduler process.
func RegisterVolumeSchedulingMetrics() {
legacyregistry.MustRegister(VolumeBindingRequestSchedulerBinderCache)
legacyregistry.MustRegister(VolumeSchedulingStageLatency)
legacyregistry.MustRegister(VolumeSchedulingStageFailed)
}
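For illustration only (not part of the vendored file), a sketch of how callers record these metrics, mirroring the WithLabelValues usage seen in scheduler_binder.go below; the surrounding function is hypothetical.
import "time"

// exampleRecordVolumeSchedulingMetrics registers the collectors once and then
// records a latency sample, a cache request, and optionally a failure for the
// "predicate" stage.
func exampleRecordVolumeSchedulingMetrics(failed bool) {
	RegisterVolumeSchedulingMetrics() // normally done once at scheduler start-up

	start := time.Now()
	// ... run the predicate stage ...
	VolumeSchedulingStageLatency.WithLabelValues("predicate").Observe(time.Since(start).Seconds())
	VolumeBindingRequestSchedulerBinderCache.WithLabelValues("list").Inc()
	if failed {
		VolumeSchedulingStageFailed.WithLabelValues("predicate").Inc()
	}
}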

View File

@ -1,451 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduling
import (
"fmt"
"strconv"
"sync"
"k8s.io/klog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/tools/cache"
)
// AssumeCache is a cache on top of the informer that allows for updating
// objects outside of informer events and also restoring the informer
// cache's version of the object. Objects are assumed to be
// Kubernetes API objects that implement meta.Interface
type AssumeCache interface {
// Assume updates the object in-memory only
Assume(obj interface{}) error
// Restore the informer cache's version of the object
Restore(objName string)
// Get the object by name
Get(objName string) (interface{}, error)
// Get the API object by name
GetAPIObj(objName string) (interface{}, error)
// List all the objects in the cache
List(indexObj interface{}) []interface{}
}
type errWrongType struct {
typeName string
object interface{}
}
func (e *errWrongType) Error() string {
return fmt.Sprintf("could not convert object to type %v: %+v", e.typeName, e.object)
}
type errNotFound struct {
typeName string
objectName string
}
func (e *errNotFound) Error() string {
return fmt.Sprintf("could not find %v %q", e.typeName, e.objectName)
}
type errObjectName struct {
detailedErr error
}
func (e *errObjectName) Error() string {
return fmt.Sprintf("failed to get object name: %v", e.detailedErr)
}
// assumeCache stores two pointers to represent a single object:
// * The pointer to the informer object.
// * The pointer to the latest object, which could be the same as
// the informer object, or an in-memory object.
//
// An informer update always overrides the latest object pointer.
//
// Assume() only updates the latest object pointer.
// Restore() sets the latest object pointer back to the informer object.
// Get/List() always returns the latest object pointer.
type assumeCache struct {
// Synchronizes updates to store
rwMutex sync.RWMutex
// describes the object stored
description string
// Stores objInfo pointers
store cache.Indexer
// Index function for object
indexFunc cache.IndexFunc
indexName string
}
type objInfo struct {
// name of the object
name string
// Latest version of object could be cached-only or from informer
latestObj interface{}
// Latest object from informer
apiObj interface{}
}
func objInfoKeyFunc(obj interface{}) (string, error) {
objInfo, ok := obj.(*objInfo)
if !ok {
return "", &errWrongType{"objInfo", obj}
}
return objInfo.name, nil
}
func (c *assumeCache) objInfoIndexFunc(obj interface{}) ([]string, error) {
objInfo, ok := obj.(*objInfo)
if !ok {
return []string{""}, &errWrongType{"objInfo", obj}
}
return c.indexFunc(objInfo.latestObj)
}
// NewAssumeCache creates an assume cache for general objects.
func NewAssumeCache(informer cache.SharedIndexInformer, description, indexName string, indexFunc cache.IndexFunc) AssumeCache {
c := &assumeCache{
description: description,
indexFunc: indexFunc,
indexName: indexName,
}
c.store = cache.NewIndexer(objInfoKeyFunc, cache.Indexers{indexName: c.objInfoIndexFunc})
// Unit tests don't use informers
if informer != nil {
informer.AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: c.add,
UpdateFunc: c.update,
DeleteFunc: c.delete,
},
)
}
return c
}
func (c *assumeCache) add(obj interface{}) {
if obj == nil {
return
}
name, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("add failed: %v", &errObjectName{err})
return
}
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
if objInfo, _ := c.getObjInfo(name); objInfo != nil {
newVersion, err := c.getObjVersion(name, obj)
if err != nil {
klog.Errorf("add: couldn't get object version: %v", err)
return
}
storedVersion, err := c.getObjVersion(name, objInfo.latestObj)
if err != nil {
klog.Errorf("add: couldn't get stored object version: %v", err)
return
}
// Only update object if version is newer.
// This is so we don't override assumed objects due to informer resync.
if newVersion <= storedVersion {
klog.V(10).Infof("Skip adding %v %v to assume cache because version %v is not newer than %v", c.description, name, newVersion, storedVersion)
return
}
}
objInfo := &objInfo{name: name, latestObj: obj, apiObj: obj}
if err = c.store.Update(objInfo); err != nil {
klog.Warningf("got error when updating stored object : %v", err)
} else {
klog.V(10).Infof("Adding %v %v to assume cache: %+v ", c.description, name, obj)
}
}
func (c *assumeCache) update(oldObj interface{}, newObj interface{}) {
c.add(newObj)
}
func (c *assumeCache) delete(obj interface{}) {
if obj == nil {
return
}
name, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("delete failed: %v", &errObjectName{err})
return
}
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
objInfo := &objInfo{name: name}
err = c.store.Delete(objInfo)
if err != nil {
klog.Errorf("delete: failed to delete %v %v: %v", c.description, name, err)
}
}
func (c *assumeCache) getObjVersion(name string, obj interface{}) (int64, error) {
objAccessor, err := meta.Accessor(obj)
if err != nil {
return -1, err
}
objResourceVersion, err := strconv.ParseInt(objAccessor.GetResourceVersion(), 10, 64)
if err != nil {
return -1, fmt.Errorf("error parsing ResourceVersion %q for %v %q: %s", objAccessor.GetResourceVersion(), c.description, name, err)
}
return objResourceVersion, nil
}
func (c *assumeCache) getObjInfo(name string) (*objInfo, error) {
obj, ok, err := c.store.GetByKey(name)
if err != nil {
return nil, err
}
if !ok {
return nil, &errNotFound{c.description, name}
}
objInfo, ok := obj.(*objInfo)
if !ok {
return nil, &errWrongType{"objInfo", obj}
}
return objInfo, nil
}
func (c *assumeCache) Get(objName string) (interface{}, error) {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
objInfo, err := c.getObjInfo(objName)
if err != nil {
return nil, err
}
return objInfo.latestObj, nil
}
func (c *assumeCache) GetAPIObj(objName string) (interface{}, error) {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
objInfo, err := c.getObjInfo(objName)
if err != nil {
return nil, err
}
return objInfo.apiObj, nil
}
func (c *assumeCache) List(indexObj interface{}) []interface{} {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
allObjs := []interface{}{}
objs, err := c.store.Index(c.indexName, &objInfo{latestObj: indexObj})
if err != nil {
klog.Errorf("list index error: %v", err)
return nil
}
for _, obj := range objs {
objInfo, ok := obj.(*objInfo)
if !ok {
klog.Errorf("list error: %v", &errWrongType{"objInfo", obj})
continue
}
allObjs = append(allObjs, objInfo.latestObj)
}
return allObjs
}
func (c *assumeCache) Assume(obj interface{}) error {
name, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
return &errObjectName{err}
}
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
objInfo, err := c.getObjInfo(name)
if err != nil {
return err
}
newVersion, err := c.getObjVersion(name, obj)
if err != nil {
return err
}
storedVersion, err := c.getObjVersion(name, objInfo.latestObj)
if err != nil {
return err
}
if newVersion < storedVersion {
return fmt.Errorf("%v %q is out of sync (stored: %d, assume: %d)", c.description, name, storedVersion, newVersion)
}
// Only update the cached object
objInfo.latestObj = obj
klog.V(4).Infof("Assumed %v %q, version %v", c.description, name, newVersion)
return nil
}
func (c *assumeCache) Restore(objName string) {
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
objInfo, err := c.getObjInfo(objName)
if err != nil {
// This could be expected if object got deleted
klog.V(5).Infof("Restore %v %q warning: %v", c.description, objName, err)
} else {
objInfo.latestObj = objInfo.apiObj
klog.V(4).Infof("Restored %v %q", c.description, objName)
}
}
// PVAssumeCache is an AssumeCache for PersistentVolume objects
type PVAssumeCache interface {
AssumeCache
GetPV(pvName string) (*v1.PersistentVolume, error)
GetAPIPV(pvName string) (*v1.PersistentVolume, error)
ListPVs(storageClassName string) []*v1.PersistentVolume
}
type pvAssumeCache struct {
AssumeCache
}
func pvStorageClassIndexFunc(obj interface{}) ([]string, error) {
if pv, ok := obj.(*v1.PersistentVolume); ok {
return []string{pv.Spec.StorageClassName}, nil
}
return []string{""}, fmt.Errorf("object is not a v1.PersistentVolume: %v", obj)
}
// NewPVAssumeCache creates a PV assume cache.
func NewPVAssumeCache(informer cache.SharedIndexInformer) PVAssumeCache {
return &pvAssumeCache{NewAssumeCache(informer, "v1.PersistentVolume", "storageclass", pvStorageClassIndexFunc)}
}
func (c *pvAssumeCache) GetPV(pvName string) (*v1.PersistentVolume, error) {
obj, err := c.Get(pvName)
if err != nil {
return nil, err
}
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
return nil, &errWrongType{"v1.PersistentVolume", obj}
}
return pv, nil
}
func (c *pvAssumeCache) GetAPIPV(pvName string) (*v1.PersistentVolume, error) {
obj, err := c.GetAPIObj(pvName)
if err != nil {
return nil, err
}
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
return nil, &errWrongType{"v1.PersistentVolume", obj}
}
return pv, nil
}
func (c *pvAssumeCache) ListPVs(storageClassName string) []*v1.PersistentVolume {
objs := c.List(&v1.PersistentVolume{
Spec: v1.PersistentVolumeSpec{
StorageClassName: storageClassName,
},
})
pvs := []*v1.PersistentVolume{}
for _, obj := range objs {
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
klog.Errorf("ListPVs: %v", &errWrongType{"v1.PersistentVolume", obj})
continue
}
pvs = append(pvs, pv)
}
return pvs
}
// PVCAssumeCache is an AssumeCache for PersistentVolumeClaim objects
type PVCAssumeCache interface {
AssumeCache
// GetPVC returns the PVC from the cache with given pvcKey.
// pvcKey is the result of MetaNamespaceKeyFunc on PVC obj
GetPVC(pvcKey string) (*v1.PersistentVolumeClaim, error)
GetAPIPVC(pvcKey string) (*v1.PersistentVolumeClaim, error)
}
type pvcAssumeCache struct {
AssumeCache
}
// NewPVCAssumeCache creates a PVC assume cache.
func NewPVCAssumeCache(informer cache.SharedIndexInformer) PVCAssumeCache {
return &pvcAssumeCache{NewAssumeCache(informer, "v1.PersistentVolumeClaim", "namespace", cache.MetaNamespaceIndexFunc)}
}
func (c *pvcAssumeCache) GetPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
obj, err := c.Get(pvcKey)
if err != nil {
return nil, err
}
pvc, ok := obj.(*v1.PersistentVolumeClaim)
if !ok {
return nil, &errWrongType{"v1.PersistentVolumeClaim", obj}
}
return pvc, nil
}
func (c *pvcAssumeCache) GetAPIPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
obj, err := c.GetAPIObj(pvcKey)
if err != nil {
return nil, err
}
pvc, ok := obj.(*v1.PersistentVolumeClaim)
if !ok {
return nil, &errWrongType{"v1.PersistentVolumeClaim", obj}
}
return pvc, nil
}
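An illustrative sketch (not part of the vendored file) of the assume/restore flow with the PV cache above; the informer and the updated PV are assumed to come from the caller, and the function name is hypothetical.
import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
)

// exampleAssumeRestore optimistically caches an in-memory PV update and rolls it
// back if the API write is later abandoned.
func exampleAssumeRestore(pvInformer coreinformers.PersistentVolumeInformer, updatedPV *v1.PersistentVolume) {
	pvCache := NewPVAssumeCache(pvInformer.Informer())

	// Assume requires the object to already be present via the informer.
	if err := pvCache.Assume(updatedPV); err != nil {
		fmt.Println("assume failed:", err)
		return
	}
	// If the API update never happens (or fails), fall back to the informer's copy.
	pvCache.Restore(updatedPV.Name)
}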

View File

@ -1,948 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduling
import (
"context"
"fmt"
"sort"
"strings"
"time"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/storage/etcd3"
utilfeature "k8s.io/apiserver/pkg/util/feature"
coreinformers "k8s.io/client-go/informers/core/v1"
storageinformers "k8s.io/client-go/informers/storage/v1"
clientset "k8s.io/client-go/kubernetes"
storagelisters "k8s.io/client-go/listers/storage/v1"
csitrans "k8s.io/csi-translation-lib"
csiplugins "k8s.io/csi-translation-lib/plugins"
"k8s.io/klog"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
pvutil "k8s.io/kubernetes/pkg/controller/volume/persistentvolume/util"
"k8s.io/kubernetes/pkg/controller/volume/scheduling/metrics"
"k8s.io/kubernetes/pkg/features"
volumeutil "k8s.io/kubernetes/pkg/volume/util"
)
// ConflictReason is used for the special strings which explain why
// volume binding is impossible for a node.
type ConflictReason string
// ConflictReasons contains all reasons that explain why volume binding is impossible for a node.
type ConflictReasons []ConflictReason
func (reasons ConflictReasons) Len() int { return len(reasons) }
func (reasons ConflictReasons) Less(i, j int) bool { return reasons[i] < reasons[j] }
func (reasons ConflictReasons) Swap(i, j int) { reasons[i], reasons[j] = reasons[j], reasons[i] }
const (
// ErrReasonBindConflict is used for VolumeBindingNoMatch predicate error.
ErrReasonBindConflict ConflictReason = "node(s) didn't find available persistent volumes to bind"
// ErrReasonNodeConflict is used for VolumeNodeAffinityConflict predicate error.
ErrReasonNodeConflict ConflictReason = "node(s) had volume node affinity conflict"
)
// InTreeToCSITranslator contains methods required to check migratable status
// and perform translations from in-tree PVs to CSI
type InTreeToCSITranslator interface {
IsPVMigratable(pv *v1.PersistentVolume) bool
GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
TranslateInTreePVToCSI(pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
}
// SchedulerVolumeBinder is used by the scheduler to handle PVC/PV binding
// and dynamic provisioning. The binding decisions are integrated into the pod scheduling
// workflow so that the PV NodeAffinity is also considered along with the pod's other
// scheduling requirements.
//
// This integrates into the existing default scheduler workflow as follows:
// 1. The scheduler takes a Pod off the scheduler queue and processes it serially:
// a. Invokes all predicate functions, parallelized across nodes. FindPodVolumes() is invoked here.
// b. Invokes all priority functions. Future/TBD
// c. Selects the best node for the Pod.
// d. Cache the node selection for the Pod. AssumePodVolumes() is invoked here.
// i. If PVC binding is required, cache in-memory only:
// * For manual binding: update PV objects for prebinding to the corresponding PVCs.
// * For dynamic provisioning: update PVC object with a selected node from c)
// * For the pod, which PVCs and PVs need API updates.
// ii. Afterwards, the main scheduler caches the Pod->Node binding in the scheduler's pod cache,
// This is handled in the scheduler and not here.
// e. Asynchronously bind volumes and pod in a separate goroutine
// i. BindPodVolumes() is called first. It makes all the necessary API updates and waits for
// PV controller to fully bind and provision the PVCs. If binding fails, the Pod is sent
// back through the scheduler.
// ii. After BindPodVolumes() is complete, then the scheduler does the final Pod->Node binding.
// 2. Once all the assume operations are done in d), the scheduler processes the next Pod in the scheduler queue
// while the actual binding operation occurs in the background.
type SchedulerVolumeBinder interface {
// FindPodVolumes checks if all of a Pod's PVCs can be satisfied by the node.
//
// If a PVC is bound, it checks if the PV's NodeAffinity matches the Node.
// Otherwise, it tries to find an available PV to bind to the PVC.
//
// It returns an error when something went wrong or a list of reasons why the node is
// (currently) not usable for the pod.
//
// This function is called by the volume binding scheduler predicate and can be called in parallel
FindPodVolumes(pod *v1.Pod, node *v1.Node) (reasons ConflictReasons, err error)
// AssumePodVolumes will:
// 1. Take the PV matches for unbound PVCs and update the PV cache assuming
// that the PV is prebound to the PVC.
// 2. Take the PVCs that need provisioning and update the PVC cache with related
// annotations set.
//
// It returns true if all volumes are fully bound
//
// This function will modify assumedPod with the node name.
// This function is called serially.
AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (allFullyBound bool, err error)
// BindPodVolumes will:
// 1. Initiate the volume binding by making the API call to prebind the PV
// to its matching PVC.
// 2. Trigger the volume provisioning by making the API call to set related
// annotations on the PVC
// 3. Wait for PVCs to be completely bound by the PV controller
//
// This function can be called in parallel.
BindPodVolumes(assumedPod *v1.Pod) error
// GetBindingsCache returns the cache used (if any) to store volume binding decisions.
GetBindingsCache() PodBindingCache
// DeletePodBindings will delete pod's bindingDecisions in podBindingCache.
DeletePodBindings(pod *v1.Pod)
}
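To make the workflow in the comment above concrete, here is an illustrative sketch (not part of the vendored file) of a scheduler driving the interface through the find, assume, and bind phases; the function is hypothetical and error handling is abbreviated.
import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// exampleScheduleVolumes walks one pod through the volume-binding phases.
func exampleScheduleVolumes(binder SchedulerVolumeBinder, pod *v1.Pod, node *v1.Node) error {
	// 1. Predicate phase: can this node satisfy all of the pod's PVCs?
	reasons, err := binder.FindPodVolumes(pod, node)
	if err != nil {
		return err
	}
	if len(reasons) > 0 {
		return fmt.Errorf("node %s rejected: %v", node.Name, reasons)
	}
	// 2. Assume phase: cache the binding decisions in memory only.
	allBound, err := binder.AssumePodVolumes(pod, node.Name)
	if err != nil {
		return err
	}
	if allBound {
		return nil // nothing left to bind
	}
	// 3. Bind phase: issue the API updates and wait for the PV controller.
	return binder.BindPodVolumes(pod)
}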
type volumeBinder struct {
kubeClient clientset.Interface
classLister storagelisters.StorageClassLister
nodeInformer coreinformers.NodeInformer
csiNodeInformer storageinformers.CSINodeInformer
pvcCache PVCAssumeCache
pvCache PVAssumeCache
// Stores binding decisions that were made in FindPodVolumes for use in AssumePodVolumes.
// AssumePodVolumes modifies the bindings again for use in BindPodVolumes.
podBindingCache PodBindingCache
// Amount of time to wait for the bind operation to succeed
bindTimeout time.Duration
translator InTreeToCSITranslator
}
// NewVolumeBinder sets up all the caches needed for the scheduler to make volume binding decisions.
func NewVolumeBinder(
kubeClient clientset.Interface,
nodeInformer coreinformers.NodeInformer,
csiNodeInformer storageinformers.CSINodeInformer,
pvcInformer coreinformers.PersistentVolumeClaimInformer,
pvInformer coreinformers.PersistentVolumeInformer,
storageClassInformer storageinformers.StorageClassInformer,
bindTimeout time.Duration) SchedulerVolumeBinder {
b := &volumeBinder{
kubeClient: kubeClient,
classLister: storageClassInformer.Lister(),
nodeInformer: nodeInformer,
csiNodeInformer: csiNodeInformer,
pvcCache: NewPVCAssumeCache(pvcInformer.Informer()),
pvCache: NewPVAssumeCache(pvInformer.Informer()),
podBindingCache: NewPodBindingCache(),
bindTimeout: bindTimeout,
translator: csitrans.New(),
}
return b
}
func (b *volumeBinder) GetBindingsCache() PodBindingCache {
return b.podBindingCache
}
// DeletePodBindings will delete pod's bindingDecisions in podBindingCache.
func (b *volumeBinder) DeletePodBindings(pod *v1.Pod) {
cache := b.podBindingCache
if pod != nil {
cache.DeleteBindings(pod)
}
}
// FindPodVolumes caches the matching PVs and PVCs to provision per node in podBindingCache.
// This method intentionally takes in a *v1.Node object instead of using volumebinder.nodeInformer.
// That's necessary because some operations will need to pass fake node objects in to the predicate.
func (b *volumeBinder) FindPodVolumes(pod *v1.Pod, node *v1.Node) (reasons ConflictReasons, err error) {
podName := getPodName(pod)
// Warning: Below log needs high verbosity as it can be printed several times (#60933).
klog.V(5).Infof("FindPodVolumes for pod %q, node %q", podName, node.Name)
// Initialize to true for pods that don't have volumes. These
// booleans get translated into reason strings when the function
// returns without an error.
unboundVolumesSatisfied := true
boundVolumesSatisfied := true
defer func() {
if err != nil {
return
}
if !boundVolumesSatisfied {
reasons = append(reasons, ErrReasonNodeConflict)
}
if !unboundVolumesSatisfied {
reasons = append(reasons, ErrReasonBindConflict)
}
}()
start := time.Now()
defer func() {
metrics.VolumeSchedulingStageLatency.WithLabelValues("predicate").Observe(time.Since(start).Seconds())
if err != nil {
metrics.VolumeSchedulingStageFailed.WithLabelValues("predicate").Inc()
}
}()
var (
matchedBindings []*bindingInfo
provisionedClaims []*v1.PersistentVolumeClaim
)
defer func() {
// We recreate bindings for each new schedule loop.
if len(matchedBindings) == 0 && len(provisionedClaims) == 0 {
// Clear cache if no claims to bind or provision for this node.
b.podBindingCache.ClearBindings(pod, node.Name)
return
}
// Although we do not distinguish nil from empty in this function, for
// easier testing, we normalize empty to nil.
if len(matchedBindings) == 0 {
matchedBindings = nil
}
if len(provisionedClaims) == 0 {
provisionedClaims = nil
}
// Mark cache with all matched and provisioned claims for this node
b.podBindingCache.UpdateBindings(pod, node.Name, matchedBindings, provisionedClaims)
}()
// The pod's volumes need to be processed in one call to avoid the race condition where
// volumes can get bound/provisioned in between calls.
boundClaims, claimsToBind, unboundClaimsImmediate, err := b.getPodVolumes(pod)
if err != nil {
return nil, err
}
// Immediate claims should be bound
if len(unboundClaimsImmediate) > 0 {
return nil, fmt.Errorf("pod has unbound immediate PersistentVolumeClaims")
}
// Check PV node affinity on bound volumes
if len(boundClaims) > 0 {
boundVolumesSatisfied, err = b.checkBoundClaims(boundClaims, node, podName)
if err != nil {
return nil, err
}
}
// Find matching volumes and node for unbound claims
if len(claimsToBind) > 0 {
var (
claimsToFindMatching []*v1.PersistentVolumeClaim
claimsToProvision []*v1.PersistentVolumeClaim
)
// Filter out claims to provision
for _, claim := range claimsToBind {
if selectedNode, ok := claim.Annotations[pvutil.AnnSelectedNode]; ok {
if selectedNode != node.Name {
// Fast path, skip unmatched node.
unboundVolumesSatisfied = false
return
}
claimsToProvision = append(claimsToProvision, claim)
} else {
claimsToFindMatching = append(claimsToFindMatching, claim)
}
}
// Find matching volumes
if len(claimsToFindMatching) > 0 {
var unboundClaims []*v1.PersistentVolumeClaim
unboundVolumesSatisfied, matchedBindings, unboundClaims, err = b.findMatchingVolumes(pod, claimsToFindMatching, node)
if err != nil {
return nil, err
}
claimsToProvision = append(claimsToProvision, unboundClaims...)
}
// Check for claims to provision
if len(claimsToProvision) > 0 {
unboundVolumesSatisfied, provisionedClaims, err = b.checkVolumeProvisions(pod, claimsToProvision, node)
if err != nil {
return nil, err
}
}
}
return
}
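// Illustrative sketch (not part of the original file): how a hypothetical caller might
// turn the ConflictReasons returned by FindPodVolumes into per-node log messages.
// The helper name logVolumeConflicts is invented for this example.
func logVolumeConflicts(nodeName string, reasons ConflictReasons) {
	for _, reason := range reasons {
		switch reason {
		case ErrReasonNodeConflict:
			klog.V(5).Infof("node %q conflicts with the node affinity of an already bound PV", nodeName)
		case ErrReasonBindConflict:
			klog.V(5).Infof("node %q has no matching or provisionable volumes for the unbound PVCs", nodeName)
		default:
			klog.V(5).Infof("node %q rejected for volume reason: %v", nodeName, reason)
		}
	}
}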
// AssumePodVolumes will take the cached matching PVs and PVCs to provision
// in podBindingCache for the chosen node, and:
// 1. Update the pvCache with the new prebound PV.
// 2. Update the pvcCache with the new PVCs with the selected-node annotation set.
// 3. Update podBindingCache again with cached API updates for PVs and PVCs.
func (b *volumeBinder) AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (allFullyBound bool, err error) {
podName := getPodName(assumedPod)
klog.V(4).Infof("AssumePodVolumes for pod %q, node %q", podName, nodeName)
start := time.Now()
defer func() {
metrics.VolumeSchedulingStageLatency.WithLabelValues("assume").Observe(time.Since(start).Seconds())
if err != nil {
metrics.VolumeSchedulingStageFailed.WithLabelValues("assume").Inc()
}
}()
if allBound := b.arePodVolumesBound(assumedPod); allBound {
klog.V(4).Infof("AssumePodVolumes for pod %q, node %q: all PVCs bound and nothing to do", podName, nodeName)
return true, nil
}
assumedPod.Spec.NodeName = nodeName
claimsToBind := b.podBindingCache.GetBindings(assumedPod, nodeName)
claimsToProvision := b.podBindingCache.GetProvisionedPVCs(assumedPod, nodeName)
// Assume PV
newBindings := []*bindingInfo{}
for _, binding := range claimsToBind {
newPV, dirty, err := pvutil.GetBindVolumeToClaim(binding.pv, binding.pvc)
klog.V(5).Infof("AssumePodVolumes: GetBindVolumeToClaim for pod %q, PV %q, PVC %q. newPV %p, dirty %v, err: %v",
podName,
binding.pv.Name,
binding.pvc.Name,
newPV,
dirty,
err)
if err != nil {
b.revertAssumedPVs(newBindings)
return false, err
}
// TODO: can we assume every time?
if dirty {
err = b.pvCache.Assume(newPV)
if err != nil {
b.revertAssumedPVs(newBindings)
return false, err
}
}
newBindings = append(newBindings, &bindingInfo{pv: newPV, pvc: binding.pvc})
}
// Assume PVCs
newProvisionedPVCs := []*v1.PersistentVolumeClaim{}
for _, claim := range claimsToProvision {
// The claims from the method args may point to the watcher cache. We must not
// modify them, therefore create a copy.
claimClone := claim.DeepCopy()
metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, pvutil.AnnSelectedNode, nodeName)
err = b.pvcCache.Assume(claimClone)
if err != nil {
b.revertAssumedPVs(newBindings)
b.revertAssumedPVCs(newProvisionedPVCs)
return
}
newProvisionedPVCs = append(newProvisionedPVCs, claimClone)
}
// Update cache with the assumed pvcs and pvs
// Even if length is zero, update the cache with an empty slice to indicate that no
// operations are needed
b.podBindingCache.UpdateBindings(assumedPod, nodeName, newBindings, newProvisionedPVCs)
return
}
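// Illustrative sketch (not part of the original file): the copy-before-annotate pattern
// used by AssumePodVolumes above. Objects coming from informer or assume caches are
// shared, so they must never be mutated in place; markSelectedNode is a hypothetical helper.
func markSelectedNode(pvc *v1.PersistentVolumeClaim, nodeName string) *v1.PersistentVolumeClaim {
	clone := pvc.DeepCopy() // never modify the cached object directly
	metav1.SetMetaDataAnnotation(&clone.ObjectMeta, pvutil.AnnSelectedNode, nodeName)
	return clone
}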
// BindPodVolumes gets the cached bindings and PVCs to provision in podBindingCache,
// makes the API update for those PVs/PVCs, and waits for the PVCs to be completely bound
// by the PV controller.
func (b *volumeBinder) BindPodVolumes(assumedPod *v1.Pod) (err error) {
podName := getPodName(assumedPod)
klog.V(4).Infof("BindPodVolumes for pod %q, node %q", podName, assumedPod.Spec.NodeName)
start := time.Now()
defer func() {
metrics.VolumeSchedulingStageLatency.WithLabelValues("bind").Observe(time.Since(start).Seconds())
if err != nil {
metrics.VolumeSchedulingStageFailed.WithLabelValues("bind").Inc()
}
}()
bindings := b.podBindingCache.GetBindings(assumedPod, assumedPod.Spec.NodeName)
claimsToProvision := b.podBindingCache.GetProvisionedPVCs(assumedPod, assumedPod.Spec.NodeName)
// Start API operations
err = b.bindAPIUpdate(podName, bindings, claimsToProvision)
if err != nil {
return err
}
err = wait.Poll(time.Second, b.bindTimeout, func() (bool, error) {
allBound, err := b.checkBindings(assumedPod, bindings, claimsToProvision)
return allBound, err
})
if err != nil {
return fmt.Errorf("Failed to bind volumes: %v", err)
}
return nil
}
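// Illustrative sketch (not part of the original file): the wait.Poll pattern used by
// BindPodVolumes. pollUntilBound and isEverythingBound are hypothetical names; the
// poll stops as soon as the condition returns true or a non-nil error, or when the
// timeout elapses.
func pollUntilBound(timeout time.Duration, isEverythingBound func() (bool, error)) error {
	return wait.Poll(time.Second, timeout, func() (bool, error) {
		done, err := isEverythingBound()
		if err != nil {
			return false, err // a hard error aborts the poll immediately
		}
		return done, nil // returning false keeps polling until the timeout
	})
}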
func getPodName(pod *v1.Pod) string {
return pod.Namespace + "/" + pod.Name
}
func getPVCName(pvc *v1.PersistentVolumeClaim) string {
return pvc.Namespace + "/" + pvc.Name
}
// bindAPIUpdate gets the cached bindings and PVCs to provision in podBindingCache
// and makes the API update for those PVs/PVCs.
func (b *volumeBinder) bindAPIUpdate(podName string, bindings []*bindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) error {
if bindings == nil {
return fmt.Errorf("failed to get cached bindings for pod %q", podName)
}
if claimsToProvision == nil {
return fmt.Errorf("failed to get cached claims to provision for pod %q", podName)
}
lastProcessedBinding := 0
lastProcessedProvisioning := 0
defer func() {
// only revert assumed cached updates for volumes we haven't successfully bound
if lastProcessedBinding < len(bindings) {
b.revertAssumedPVs(bindings[lastProcessedBinding:])
}
// only revert assumed cached updates for claims we haven't updated,
if lastProcessedProvisioning < len(claimsToProvision) {
b.revertAssumedPVCs(claimsToProvision[lastProcessedProvisioning:])
}
}()
var (
binding *bindingInfo
i int
claim *v1.PersistentVolumeClaim
)
// Do the actual prebinding. Let the PV controller take care of the rest
// There is no API rollback if the actual binding fails
for _, binding = range bindings {
klog.V(5).Infof("bindAPIUpdate: Pod %q, binding PV %q to PVC %q", podName, binding.pv.Name, binding.pvc.Name)
// TODO: does it hurt if we make an api call and nothing needs to be updated?
claimKey := claimToClaimKey(binding.pvc)
klog.V(2).Infof("claim %q bound to volume %q", claimKey, binding.pv.Name)
newPV, err := b.kubeClient.CoreV1().PersistentVolumes().Update(context.TODO(), binding.pv, metav1.UpdateOptions{})
if err != nil {
klog.V(4).Infof("updating PersistentVolume[%s]: binding to %q failed: %v", binding.pv.Name, claimKey, err)
return err
}
klog.V(4).Infof("updating PersistentVolume[%s]: bound to %q", binding.pv.Name, claimKey)
// Save updated object from apiserver for later checking.
binding.pv = newPV
lastProcessedBinding++
}
// Update claim objects to trigger volume provisioning. Let the PV controller take care of the rest.
// The PV controller is expected to signal back by removing the related annotations if the actual provisioning fails.
for i, claim = range claimsToProvision {
klog.V(5).Infof("bindAPIUpdate: Pod %q, PVC %q", podName, getPVCName(claim))
newClaim, err := b.kubeClient.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(context.TODO(), claim, metav1.UpdateOptions{})
if err != nil {
return err
}
// Save updated object from apiserver for later checking.
claimsToProvision[i] = newClaim
lastProcessedProvisioning++
}
return nil
}
var (
versioner = etcd3.APIObjectVersioner{}
)
// checkBindings runs through all the PVCs in the Pod and checks:
// * if the PVC is fully bound
// * if there are any conditions that require binding to fail and be retried
//
// It returns true when all of the Pod's PVCs are fully bound, and an error if
// binding (and scheduling) needs to be retried.
// Note that it checks API objects, not the PV/PVC cache; the cache can be
// assumed again in the main scheduler loop, so we must check the latest state
// in the API server, which is shared with the PV controller and provisioners.
func (b *volumeBinder) checkBindings(pod *v1.Pod, bindings []*bindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) (bool, error) {
podName := getPodName(pod)
if bindings == nil {
return false, fmt.Errorf("failed to get cached bindings for pod %q", podName)
}
if claimsToProvision == nil {
return false, fmt.Errorf("failed to get cached claims to provision for pod %q", podName)
}
node, err := b.nodeInformer.Lister().Get(pod.Spec.NodeName)
if err != nil {
return false, fmt.Errorf("failed to get node %q: %v", pod.Spec.NodeName, err)
}
csiNode, err := b.csiNodeInformer.Lister().Get(node.Name)
if err != nil {
// TODO: return the error once CSINode is created by default
klog.V(4).Infof("Could not get a CSINode object for the node %q: %v", node.Name, err)
}
// Check for any conditions that might require a scheduling retry.
// When the pod is removed from the scheduling queue because it was deleted or
// for any other reason, the binding operation should be cancelled and there is
// no need to check the PV/PVC bindings any more.
// We check the pod binding cache here, which is cleared when the pod is
// removed from the scheduling queue.
if b.podBindingCache.GetDecisions(pod) == nil {
return false, fmt.Errorf("pod %q does not exist any more", podName)
}
for _, binding := range bindings {
pv, err := b.pvCache.GetAPIPV(binding.pv.Name)
if err != nil {
return false, fmt.Errorf("failed to check binding: %v", err)
}
pvc, err := b.pvcCache.GetAPIPVC(getPVCName(binding.pvc))
if err != nil {
return false, fmt.Errorf("failed to check binding: %v", err)
}
// Because we updated the PV in the apiserver, skip if the API object is older
// and wait for the new API object to propagate from the apiserver.
if versioner.CompareResourceVersion(binding.pv, pv) > 0 {
return false, nil
}
pv, err = b.tryTranslatePVToCSI(pv, csiNode)
if err != nil {
return false, fmt.Errorf("failed to translate pv to csi: %v", err)
}
// Check PV's node affinity (the node might not have the proper label)
if err := volumeutil.CheckNodeAffinity(pv, node.Labels); err != nil {
return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %v", pv.Name, node.Name, err)
}
// Check if pv.ClaimRef got dropped by unbindVolume()
if pv.Spec.ClaimRef == nil || pv.Spec.ClaimRef.UID == "" {
return false, fmt.Errorf("ClaimRef got reset for pv %q", pv.Name)
}
// Check if pvc is fully bound
if !b.isPVCFullyBound(pvc) {
return false, nil
}
}
for _, claim := range claimsToProvision {
pvc, err := b.pvcCache.GetAPIPVC(getPVCName(claim))
if err != nil {
return false, fmt.Errorf("failed to check provisioning pvc: %v", err)
}
// Because we updated the PVC in the apiserver, skip if the API object is older
// and wait for the new API object to propagate from the apiserver.
if versioner.CompareResourceVersion(claim, pvc) > 0 {
return false, nil
}
// Check if selectedNode annotation is still set
if pvc.Annotations == nil {
return false, fmt.Errorf("selectedNode annotation reset for PVC %q", pvc.Name)
}
selectedNode := pvc.Annotations[pvutil.AnnSelectedNode]
if selectedNode != pod.Spec.NodeName {
// If provisioner fails to provision a volume, selectedNode
// annotation will be removed to signal back to the scheduler to
// retry.
return false, fmt.Errorf("provisioning failed for PVC %q", pvc.Name)
}
// If the PVC is bound to a PV, check its node affinity
if pvc.Spec.VolumeName != "" {
pv, err := b.pvCache.GetAPIPV(pvc.Spec.VolumeName)
if err != nil {
if _, ok := err.(*errNotFound); ok {
// We tolerate the NotFound error here, because the PV may be missing only
// due to API delay and we can check again next time. If the PV does not
// exist because it was deleted, the PVC will be unbound eventually.
return false, nil
}
return false, fmt.Errorf("failed to get pv %q from cache: %v", pvc.Spec.VolumeName, err)
}
pv, err = b.tryTranslatePVToCSI(pv, csiNode)
if err != nil {
return false, err
}
if err := volumeutil.CheckNodeAffinity(pv, node.Labels); err != nil {
return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %v", pv.Name, node.Name, err)
}
}
// Check if pvc is fully bound
if !b.isPVCFullyBound(pvc) {
return false, nil
}
}
// All pvs and pvcs that we operated on are bound
klog.V(4).Infof("All PVCs for pod %q are bound", podName)
return true, nil
}
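// Illustrative sketch (not part of the original file): the staleness check used by
// checkBindings above. If the object we wrote has a higher resource version than the
// one currently returned by the API cache, the cache has not caught up yet and the
// check is simply retried on the next poll. isCacheStale is a hypothetical helper.
func isCacheStale(updatedPV, cachedPV *v1.PersistentVolume) bool {
	return versioner.CompareResourceVersion(updatedPV, cachedPV) > 0
}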
func (b *volumeBinder) isVolumeBound(namespace string, vol *v1.Volume) (bool, *v1.PersistentVolumeClaim, error) {
if vol.PersistentVolumeClaim == nil {
return true, nil, nil
}
pvcName := vol.PersistentVolumeClaim.ClaimName
return b.isPVCBound(namespace, pvcName)
}
func (b *volumeBinder) isPVCBound(namespace, pvcName string) (bool, *v1.PersistentVolumeClaim, error) {
claim := &v1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: pvcName,
Namespace: namespace,
},
}
pvcKey := getPVCName(claim)
pvc, err := b.pvcCache.GetPVC(pvcKey)
if err != nil || pvc == nil {
return false, nil, fmt.Errorf("error getting PVC %q: %v", pvcKey, err)
}
fullyBound := b.isPVCFullyBound(pvc)
if fullyBound {
klog.V(5).Infof("PVC %q is fully bound to PV %q", pvcKey, pvc.Spec.VolumeName)
} else {
if pvc.Spec.VolumeName != "" {
klog.V(5).Infof("PVC %q is not fully bound to PV %q", pvcKey, pvc.Spec.VolumeName)
} else {
klog.V(5).Infof("PVC %q is not bound", pvcKey)
}
}
return fullyBound, pvc, nil
}
func (b *volumeBinder) isPVCFullyBound(pvc *v1.PersistentVolumeClaim) bool {
return pvc.Spec.VolumeName != "" && metav1.HasAnnotation(pvc.ObjectMeta, pvutil.AnnBindCompleted)
}
// arePodVolumesBound returns true if all volumes are fully bound
func (b *volumeBinder) arePodVolumesBound(pod *v1.Pod) bool {
for _, vol := range pod.Spec.Volumes {
if isBound, _, _ := b.isVolumeBound(pod.Namespace, &vol); !isBound {
// Pod has at least one PVC that needs binding
return false
}
}
return true
}
// getPodVolumes returns a pod's PVCs separated into bound, unbound with delayed binding (including provisioning)
// and unbound with immediate binding (including prebound)
func (b *volumeBinder) getPodVolumes(pod *v1.Pod) (boundClaims []*v1.PersistentVolumeClaim, unboundClaimsDelayBinding []*v1.PersistentVolumeClaim, unboundClaimsImmediate []*v1.PersistentVolumeClaim, err error) {
boundClaims = []*v1.PersistentVolumeClaim{}
unboundClaimsImmediate = []*v1.PersistentVolumeClaim{}
unboundClaimsDelayBinding = []*v1.PersistentVolumeClaim{}
for _, vol := range pod.Spec.Volumes {
volumeBound, pvc, err := b.isVolumeBound(pod.Namespace, &vol)
if err != nil {
return nil, nil, nil, err
}
if pvc == nil {
continue
}
if volumeBound {
boundClaims = append(boundClaims, pvc)
} else {
delayBindingMode, err := pvutil.IsDelayBindingMode(pvc, b.classLister)
if err != nil {
return nil, nil, nil, err
}
// Prebound PVCs are treated as unbound with immediate binding
if delayBindingMode && pvc.Spec.VolumeName == "" {
// Scheduler path
unboundClaimsDelayBinding = append(unboundClaimsDelayBinding, pvc)
} else {
// !delayBindingMode || pvc.Spec.VolumeName != ""
// Immediate binding should have already been bound
unboundClaimsImmediate = append(unboundClaimsImmediate, pvc)
}
}
}
return boundClaims, unboundClaimsDelayBinding, unboundClaimsImmediate, nil
}
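// Illustrative sketch (not part of the original file): a PVC lands in the delay-binding
// bucket above when its StorageClass uses the WaitForFirstConsumer binding mode and the
// PVC is not prebound to a PV. The class below is a made-up example, not one referenced
// by this package.
func exampleDelayBindingClass() *storagev1.StorageClass {
	mode := storagev1.VolumeBindingWaitForFirstConsumer
	return &storagev1.StorageClass{
		ObjectMeta:        metav1.ObjectMeta{Name: "topology-aware"},
		Provisioner:       "example.com/provisioner",
		VolumeBindingMode: &mode,
	}
}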
func (b *volumeBinder) checkBoundClaims(claims []*v1.PersistentVolumeClaim, node *v1.Node, podName string) (bool, error) {
csiNode, err := b.csiNodeInformer.Lister().Get(node.Name)
if err != nil {
// TODO: return the error once CSINode is created by default
klog.V(4).Infof("Could not get a CSINode object for the node %q: %v", node.Name, err)
}
for _, pvc := range claims {
pvName := pvc.Spec.VolumeName
pv, err := b.pvCache.GetPV(pvName)
if err != nil {
return false, err
}
pv, err = b.tryTranslatePVToCSI(pv, csiNode)
if err != nil {
return false, err
}
err = volumeutil.CheckNodeAffinity(pv, node.Labels)
if err != nil {
klog.V(4).Infof("PersistentVolume %q, Node %q mismatch for Pod %q: %v", pvName, node.Name, podName, err)
return false, nil
}
klog.V(5).Infof("PersistentVolume %q, Node %q matches for Pod %q", pvName, node.Name, podName)
}
klog.V(4).Infof("All bound volumes for Pod %q match with Node %q", podName, node.Name)
return true, nil
}
// findMatchingVolumes tries to find matching volumes for the given claims
// and returns unbound claims for further provisioning.
func (b *volumeBinder) findMatchingVolumes(pod *v1.Pod, claimsToBind []*v1.PersistentVolumeClaim, node *v1.Node) (foundMatches bool, bindings []*bindingInfo, unboundClaims []*v1.PersistentVolumeClaim, err error) {
podName := getPodName(pod)
// Sort all the claims by increasing size request so the smallest claims are matched first
sort.Sort(byPVCSize(claimsToBind))
chosenPVs := map[string]*v1.PersistentVolume{}
foundMatches = true
for _, pvc := range claimsToBind {
// Get storage class name from each PVC
storageClassName := v1helper.GetPersistentVolumeClaimClass(pvc)
allPVs := b.pvCache.ListPVs(storageClassName)
pvcName := getPVCName(pvc)
// Find a matching PV
pv, err := pvutil.FindMatchingVolume(pvc, allPVs, node, chosenPVs, true)
if err != nil {
return false, nil, nil, err
}
if pv == nil {
klog.V(4).Infof("No matching volumes for Pod %q, PVC %q on node %q", podName, pvcName, node.Name)
unboundClaims = append(unboundClaims, pvc)
foundMatches = false
continue
}
// matching PV needs to be excluded so we don't select it again
chosenPVs[pv.Name] = pv
bindings = append(bindings, &bindingInfo{pv: pv, pvc: pvc})
klog.V(5).Infof("Found matching PV %q for PVC %q on node %q for pod %q", pv.Name, pvcName, node.Name, podName)
}
if foundMatches {
klog.V(4).Infof("Found matching volumes for pod %q on node %q", podName, node.Name)
}
return
}
// checkVolumeProvisions checks the given unbound claims (claims that have gone
// through findMatchingVolumes and have no matching volumes to bind to) and
// returns true if all of the claims are eligible for dynamic provisioning.
func (b *volumeBinder) checkVolumeProvisions(pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied bool, provisionedClaims []*v1.PersistentVolumeClaim, err error) {
podName := getPodName(pod)
provisionedClaims = []*v1.PersistentVolumeClaim{}
for _, claim := range claimsToProvision {
pvcName := getPVCName(claim)
className := v1helper.GetPersistentVolumeClaimClass(claim)
if className == "" {
return false, nil, fmt.Errorf("no class for claim %q", pvcName)
}
class, err := b.classLister.Get(className)
if err != nil {
return false, nil, fmt.Errorf("failed to find storage class %q", className)
}
provisioner := class.Provisioner
if provisioner == "" || provisioner == pvutil.NotSupportedProvisioner {
klog.V(4).Infof("storage class %q of claim %q does not support dynamic provisioning", className, pvcName)
return false, nil, nil
}
// Check if the node can satisfy the topology requirement in the class
if !v1helper.MatchTopologySelectorTerms(class.AllowedTopologies, labels.Set(node.Labels)) {
klog.V(4).Infof("Node %q cannot satisfy provisioning topology requirements of claim %q", node.Name, pvcName)
return false, nil, nil
}
// TODO: Check if capacity of the node domain in the storage class
// can satisfy resource requirement of given claim
provisionedClaims = append(provisionedClaims, claim)
}
klog.V(4).Infof("Provisioning for claims of pod %q that has no matching volumes on node %q ...", podName, node.Name)
return true, provisionedClaims, nil
}
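// Illustrative sketch (not part of the original file): what AllowedTopologies can look
// like for the topology check above. The class name, provisioner, zone key, and values
// are made up. A node then passes the check only if labels.Set(node.Labels) satisfies
// these terms via v1helper.MatchTopologySelectorTerms.
func exampleTopologyRestrictedClass() *storagev1.StorageClass {
	return &storagev1.StorageClass{
		ObjectMeta:  metav1.ObjectMeta{Name: "zonal"},
		Provisioner: "example.com/provisioner",
		AllowedTopologies: []v1.TopologySelectorTerm{{
			MatchLabelExpressions: []v1.TopologySelectorLabelRequirement{{
				Key:    "topology.kubernetes.io/zone",
				Values: []string{"zone-a", "zone-b"},
			}},
		}},
	}
}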
func (b *volumeBinder) revertAssumedPVs(bindings []*bindingInfo) {
for _, bindingInfo := range bindings {
b.pvCache.Restore(bindingInfo.pv.Name)
}
}
func (b *volumeBinder) revertAssumedPVCs(claims []*v1.PersistentVolumeClaim) {
for _, claim := range claims {
b.pvcCache.Restore(getPVCName(claim))
}
}
type bindingInfo struct {
// Claim that needs to be bound
pvc *v1.PersistentVolumeClaim
// Proposed PV to bind to this claim
pv *v1.PersistentVolume
}
type byPVCSize []*v1.PersistentVolumeClaim
func (a byPVCSize) Len() int {
return len(a)
}
func (a byPVCSize) Swap(i, j int) {
a[i], a[j] = a[j], a[i]
}
func (a byPVCSize) Less(i, j int) bool {
iSize := a[i].Spec.Resources.Requests[v1.ResourceStorage]
jSize := a[j].Spec.Resources.Requests[v1.ResourceStorage]
// return true if iSize is less than jSize
return iSize.Cmp(jSize) == -1
}
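// Illustrative sketch (not part of the original file): sorting claims smallest-first with
// byPVCSize, as findMatchingVolumes does before matching. The sizes are made up, and the
// example assumes k8s.io/apimachinery/pkg/api/resource is imported as resource.
func exampleSortBySize() []*v1.PersistentVolumeClaim {
	claims := []*v1.PersistentVolumeClaim{
		{Spec: v1.PersistentVolumeClaimSpec{Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{v1.ResourceStorage: resource.MustParse("10Gi")},
		}}},
		{Spec: v1.PersistentVolumeClaimSpec{Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{v1.ResourceStorage: resource.MustParse("1Gi")},
		}}},
	}
	sort.Sort(byPVCSize(claims)) // the 1Gi claim now sorts before the 10Gi claim
	return claims
}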
func claimToClaimKey(claim *v1.PersistentVolumeClaim) string {
return fmt.Sprintf("%s/%s", claim.Namespace, claim.Name)
}
// isCSIMigrationOnForPlugin checks if CSI migration is enabled for a given plugin.
func isCSIMigrationOnForPlugin(pluginName string) bool {
switch pluginName {
case csiplugins.AWSEBSInTreePluginName:
return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationAWS)
case csiplugins.GCEPDInTreePluginName:
return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationGCE)
case csiplugins.AzureDiskInTreePluginName:
return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationAzureDisk)
case csiplugins.CinderInTreePluginName:
return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationOpenStack)
}
return false
}
// isPluginMigratedToCSIOnNode checks if an in-tree plugin has been migrated to a CSI driver on the node.
func isPluginMigratedToCSIOnNode(pluginName string, csiNode *storagev1.CSINode) bool {
if csiNode == nil {
return false
}
csiNodeAnn := csiNode.GetAnnotations()
if csiNodeAnn == nil {
return false
}
var mpaSet sets.String
mpa := csiNodeAnn[v1.MigratedPluginsAnnotationKey]
if len(mpa) == 0 {
mpaSet = sets.NewString()
} else {
tok := strings.Split(mpa, ",")
mpaSet = sets.NewString(tok...)
}
return mpaSet.Has(pluginName)
}
// tryTranslatePVToCSI will translate the in-tree PV to CSI if it meets the criteria. If not, it returns the unmodified in-tree PV.
func (b *volumeBinder) tryTranslatePVToCSI(pv *v1.PersistentVolume, csiNode *storagev1.CSINode) (*v1.PersistentVolume, error) {
if !b.translator.IsPVMigratable(pv) {
return pv, nil
}
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigration) {
return pv, nil
}
pluginName, err := b.translator.GetInTreePluginNameFromSpec(pv, nil)
if err != nil {
return nil, fmt.Errorf("could not get plugin name from pv: %v", err)
}
if !isCSIMigrationOnForPlugin(pluginName) {
return pv, nil
}
if !isPluginMigratedToCSIOnNode(pluginName, csiNode) {
return pv, nil
}
transPV, err := b.translator.TranslateInTreePVToCSI(pv)
if err != nil {
return nil, fmt.Errorf("could not translate pv: %v", err)
}
return transPV, nil
}

View File

@ -1,167 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduling
import (
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/controller/volume/scheduling/metrics"
)
// PodBindingCache stores PV binding decisions per pod per node.
// Pod entries are removed when the Pod is deleted or updated to
// no longer be schedulable.
type PodBindingCache interface {
// UpdateBindings will update the cache with the given bindings for the
// pod and node.
UpdateBindings(pod *v1.Pod, node string, bindings []*bindingInfo, provisionings []*v1.PersistentVolumeClaim)
// ClearBindings will clear the cached bindings for the given pod and node.
ClearBindings(pod *v1.Pod, node string)
// GetBindings will return the cached bindings for the given pod and node.
// A nil return value means that the entry was not found. An empty slice
// means that no binding operations are needed.
GetBindings(pod *v1.Pod, node string) []*bindingInfo
// GetProvisionedPVCs will return the cached provisioning decisions for the
// given pod and node.
// A nil return value means that the entry was not found. An empty slice
// means that no provisioning operations are needed.
GetProvisionedPVCs(pod *v1.Pod, node string) []*v1.PersistentVolumeClaim
// GetDecisions will return all cached decisions for the given pod.
GetDecisions(pod *v1.Pod) nodeDecisions
// DeleteBindings will remove all cached bindings and provisionings for the given pod.
// TODO: separate the func if it is needed to delete bindings/provisionings individually
DeleteBindings(pod *v1.Pod)
}
type podBindingCache struct {
// synchronizes bindingDecisions
rwMutex sync.RWMutex
// Key = pod name
// Value = nodeDecisions
bindingDecisions map[string]nodeDecisions
}
// Key = nodeName
// Value = bindings & provisioned PVCs of the node
type nodeDecisions map[string]nodeDecision
// A decision includes bindingInfo and provisioned PVCs of the node
type nodeDecision struct {
bindings []*bindingInfo
provisionings []*v1.PersistentVolumeClaim
}
// NewPodBindingCache creates a pod binding cache.
func NewPodBindingCache() PodBindingCache {
return &podBindingCache{bindingDecisions: map[string]nodeDecisions{}}
}
func (c *podBindingCache) GetDecisions(pod *v1.Pod) nodeDecisions {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
return nil
}
return decisions
}
func (c *podBindingCache) DeleteBindings(pod *v1.Pod) {
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
podName := getPodName(pod)
if _, ok := c.bindingDecisions[podName]; ok {
delete(c.bindingDecisions, podName)
metrics.VolumeBindingRequestSchedulerBinderCache.WithLabelValues("delete").Inc()
}
}
func (c *podBindingCache) UpdateBindings(pod *v1.Pod, node string, bindings []*bindingInfo, pvcs []*v1.PersistentVolumeClaim) {
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
decisions = nodeDecisions{}
c.bindingDecisions[podName] = decisions
}
decision, ok := decisions[node]
if !ok {
decision = nodeDecision{
bindings: bindings,
provisionings: pvcs,
}
metrics.VolumeBindingRequestSchedulerBinderCache.WithLabelValues("add").Inc()
} else {
decision.bindings = bindings
decision.provisionings = pvcs
}
decisions[node] = decision
}
func (c *podBindingCache) GetBindings(pod *v1.Pod, node string) []*bindingInfo {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
return nil
}
decision, ok := decisions[node]
if !ok {
return nil
}
return decision.bindings
}
func (c *podBindingCache) GetProvisionedPVCs(pod *v1.Pod, node string) []*v1.PersistentVolumeClaim {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
return nil
}
decision, ok := decisions[node]
if !ok {
return nil
}
return decision.provisionings
}
func (c *podBindingCache) ClearBindings(pod *v1.Pod, node string) {
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
return
}
delete(decisions, node)
}
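// Illustrative sketch (not part of the original file): the typical lifecycle of this
// cache as driven by the volume binder. The node name and the empty decisions are made up.
func examplePodBindingCacheUsage(pod *v1.Pod) {
	cache := NewPodBindingCache()
	// FindPodVolumes records the per-node decisions...
	cache.UpdateBindings(pod, "node-1", []*bindingInfo{}, []*v1.PersistentVolumeClaim{})
	// ...AssumePodVolumes and BindPodVolumes read them back...
	_ = cache.GetBindings(pod, "node-1")
	_ = cache.GetProvisionedPVCs(pod, "node-1")
	// ...and the scheduler drops them once the pod leaves the scheduling queue.
	cache.DeleteBindings(pod)
}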

View File

@ -1,68 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduling
import "k8s.io/api/core/v1"
// FakeVolumeBinderConfig holds configurations for fake volume binder.
type FakeVolumeBinderConfig struct {
AllBound bool
FindReasons ConflictReasons
FindErr error
AssumeErr error
BindErr error
}
// NewFakeVolumeBinder creates a FakeVolumeBinder with the given configuration,
// for use in tests that need to stub out volume binding decisions.
func NewFakeVolumeBinder(config *FakeVolumeBinderConfig) *FakeVolumeBinder {
return &FakeVolumeBinder{
config: config,
}
}
// FakeVolumeBinder represents a fake volume binder for testing.
type FakeVolumeBinder struct {
config *FakeVolumeBinderConfig
AssumeCalled bool
BindCalled bool
}
// FindPodVolumes implements SchedulerVolumeBinder.FindPodVolumes.
func (b *FakeVolumeBinder) FindPodVolumes(pod *v1.Pod, node *v1.Node) (reasons ConflictReasons, err error) {
return b.config.FindReasons, b.config.FindErr
}
// AssumePodVolumes implements SchedulerVolumeBinder.AssumePodVolumes.
func (b *FakeVolumeBinder) AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (bool, error) {
b.AssumeCalled = true
return b.config.AllBound, b.config.AssumeErr
}
// BindPodVolumes implements SchedulerVolumeBinder.BindPodVolumes.
func (b *FakeVolumeBinder) BindPodVolumes(assumedPod *v1.Pod) error {
b.BindCalled = true
return b.config.BindErr
}
// GetBindingsCache implements SchedulerVolumeBinder.GetBindingsCache.
func (b *FakeVolumeBinder) GetBindingsCache() PodBindingCache {
return nil
}
// DeletePodBindings implements SchedulerVolumeBinder.DeletePodBindings.
func (b *FakeVolumeBinder) DeletePodBindings(pod *v1.Pod) {}
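// Illustrative sketch (not part of the original file): how a scheduler test might use the
// fake binder to simulate a bind failure. The error value is made up and the example
// assumes the standard errors package is imported.
func exampleFakeBinder(pod *v1.Pod) error {
	binder := NewFakeVolumeBinder(&FakeVolumeBinderConfig{
		AllBound: false,
		BindErr:  errors.New("simulated bind failure"),
	})
	if _, err := binder.AssumePodVolumes(pod, "node-1"); err != nil {
		return err
	}
	return binder.BindPodVolumes(pod) // returns the configured BindErr
}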