vendor update for E2E framework

Signed-off-by: Madhu Rajanna <madhupr007@gmail.com>
This commit is contained in:
Madhu Rajanna
2019-05-31 15:15:11 +05:30
parent 9bb23e4e32
commit d300da19b7
2149 changed files with 598692 additions and 14107 deletions

View File

@ -0,0 +1,265 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"context"
"fmt"
"time"
v1authenticationapi "k8s.io/api/authentication/v1"
"k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
apiserverserviceaccount "k8s.io/apiserver/pkg/authentication/serviceaccount"
clientset "k8s.io/client-go/kubernetes"
v1authentication "k8s.io/client-go/kubernetes/typed/authentication/v1"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
watchtools "k8s.io/client-go/tools/watch"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/api/legacyscheme"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/serviceaccount"
)
// ControllerClientBuilder allows you to get clients and configs for controllers
// Please note a copy also exists in staging/src/k8s.io/cloud-provider/cloud.go
// TODO: Extract this into a separate controller utilities repo (issues/68947)
type ControllerClientBuilder interface {
Config(name string) (*restclient.Config, error)
ConfigOrDie(name string) *restclient.Config
Client(name string) (clientset.Interface, error)
ClientOrDie(name string) clientset.Interface
}
// SimpleControllerClientBuilder returns a fixed client with different user agents
type SimpleControllerClientBuilder struct {
// ClientConfig is a skeleton config to clone and use as the basis for each controller client
ClientConfig *restclient.Config
}
func (b SimpleControllerClientBuilder) Config(name string) (*restclient.Config, error) {
clientConfig := *b.ClientConfig
return restclient.AddUserAgent(&clientConfig, name), nil
}
func (b SimpleControllerClientBuilder) ConfigOrDie(name string) *restclient.Config {
clientConfig, err := b.Config(name)
if err != nil {
klog.Fatal(err)
}
return clientConfig
}
func (b SimpleControllerClientBuilder) Client(name string) (clientset.Interface, error) {
clientConfig, err := b.Config(name)
if err != nil {
return nil, err
}
return clientset.NewForConfig(clientConfig)
}
func (b SimpleControllerClientBuilder) ClientOrDie(name string) clientset.Interface {
client, err := b.Client(name)
if err != nil {
klog.Fatal(err)
}
return client
}
// SAControllerClientBuilder is a ControllerClientBuilder that returns clients identifying as
// service accounts
type SAControllerClientBuilder struct {
// ClientConfig is a skeleton config to clone and use as the basis for each controller client
ClientConfig *restclient.Config
// CoreClient is used to provision service accounts if needed and watch for their associated tokens
// to construct a controller client
CoreClient v1core.CoreV1Interface
// AuthenticationClient is used to check API tokens to make sure they are valid before
// building a controller client from them
AuthenticationClient v1authentication.AuthenticationV1Interface
// Namespace is the namespace used to host the service accounts that will back the
// controllers. It must be highly privileged namespace which normal users cannot inspect.
Namespace string
}
// config returns a complete clientConfig for constructing clients. This is separate in anticipation of composition
// which means that not all clientsets are known here
func (b SAControllerClientBuilder) Config(name string) (*restclient.Config, error) {
sa, err := b.getOrCreateServiceAccount(name)
if err != nil {
return nil, err
}
var clientConfig *restclient.Config
fieldSelector := fields.SelectorFromSet(map[string]string{
api.SecretTypeField: string(v1.SecretTypeServiceAccountToken),
}).String()
lw := &cache.ListWatch{
ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
options.FieldSelector = fieldSelector
return b.CoreClient.Secrets(b.Namespace).List(options)
},
WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
options.FieldSelector = fieldSelector
return b.CoreClient.Secrets(b.Namespace).Watch(options)
},
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
_, err = watchtools.UntilWithSync(ctx, lw, &v1.Secret{}, nil,
func(event watch.Event) (bool, error) {
switch event.Type {
case watch.Deleted:
return false, nil
case watch.Error:
return false, fmt.Errorf("error watching")
case watch.Added, watch.Modified:
secret, ok := event.Object.(*v1.Secret)
if !ok {
return false, fmt.Errorf("unexpected object type: %T", event.Object)
}
if !serviceaccount.IsServiceAccountToken(secret, sa) {
return false, nil
}
if len(secret.Data[v1.ServiceAccountTokenKey]) == 0 {
return false, nil
}
validConfig, valid, err := b.getAuthenticatedConfig(sa, string(secret.Data[v1.ServiceAccountTokenKey]))
if err != nil {
klog.Warningf("error validating API token for %s/%s in secret %s: %v", sa.Namespace, sa.Name, secret.Name, err)
// continue watching for good tokens
return false, nil
}
if !valid {
klog.Warningf("secret %s contained an invalid API token for %s/%s", secret.Name, sa.Namespace, sa.Name)
// try to delete the secret containing the invalid token
if err := b.CoreClient.Secrets(secret.Namespace).Delete(secret.Name, &metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
klog.Warningf("error deleting secret %s containing invalid API token for %s/%s: %v", secret.Name, sa.Namespace, sa.Name, err)
}
// continue watching for good tokens
return false, nil
}
clientConfig = validConfig
return true, nil
default:
return false, fmt.Errorf("unexpected event type: %v", event.Type)
}
})
if err != nil {
return nil, fmt.Errorf("unable to get token for service account: %v", err)
}
return clientConfig, nil
}
func (b SAControllerClientBuilder) getOrCreateServiceAccount(name string) (*v1.ServiceAccount, error) {
sa, err := b.CoreClient.ServiceAccounts(b.Namespace).Get(name, metav1.GetOptions{})
if err == nil {
return sa, nil
}
if !apierrors.IsNotFound(err) {
return nil, err
}
// Create the namespace if we can't verify it exists.
// Tolerate errors, since we don't know whether this component has namespace creation permissions.
if _, err := b.CoreClient.Namespaces().Get(b.Namespace, metav1.GetOptions{}); err != nil {
b.CoreClient.Namespaces().Create(&v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: b.Namespace}})
}
// Create the service account
sa, err = b.CoreClient.ServiceAccounts(b.Namespace).Create(&v1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Namespace: b.Namespace, Name: name}})
if apierrors.IsAlreadyExists(err) {
// If we're racing to init and someone else already created it, re-fetch
return b.CoreClient.ServiceAccounts(b.Namespace).Get(name, metav1.GetOptions{})
}
return sa, err
}
func (b SAControllerClientBuilder) getAuthenticatedConfig(sa *v1.ServiceAccount, token string) (*restclient.Config, bool, error) {
username := apiserverserviceaccount.MakeUsername(sa.Namespace, sa.Name)
clientConfig := restclient.AnonymousClientConfig(b.ClientConfig)
clientConfig.BearerToken = token
restclient.AddUserAgent(clientConfig, username)
// Try token review first
tokenReview := &v1authenticationapi.TokenReview{Spec: v1authenticationapi.TokenReviewSpec{Token: token}}
if tokenResult, err := b.AuthenticationClient.TokenReviews().Create(tokenReview); err == nil {
if !tokenResult.Status.Authenticated {
klog.Warningf("Token for %s/%s did not authenticate correctly", sa.Namespace, sa.Name)
return nil, false, nil
}
if tokenResult.Status.User.Username != username {
klog.Warningf("Token for %s/%s authenticated as unexpected username: %s", sa.Namespace, sa.Name, tokenResult.Status.User.Username)
return nil, false, nil
}
klog.V(4).Infof("Verified credential for %s/%s", sa.Namespace, sa.Name)
return clientConfig, true, nil
}
// If we couldn't run the token review, the API might be disabled or we might not have permission.
// Try to make a request to /apis with the token. If we get a 401 we should consider the token invalid.
clientConfigCopy := *clientConfig
clientConfigCopy.NegotiatedSerializer = legacyscheme.Codecs
client, err := restclient.UnversionedRESTClientFor(&clientConfigCopy)
if err != nil {
return nil, false, err
}
err = client.Get().AbsPath("/apis").Do().Error()
if apierrors.IsUnauthorized(err) {
klog.Warningf("Token for %s/%s did not authenticate correctly: %v", sa.Namespace, sa.Name, err)
return nil, false, nil
}
return clientConfig, true, nil
}
func (b SAControllerClientBuilder) ConfigOrDie(name string) *restclient.Config {
clientConfig, err := b.Config(name)
if err != nil {
klog.Fatal(err)
}
return clientConfig
}
func (b SAControllerClientBuilder) Client(name string) (clientset.Interface, error) {
clientConfig, err := b.Config(name)
if err != nil {
return nil, err
}
return clientset.NewForConfig(clientConfig)
}
func (b SAControllerClientBuilder) ClientOrDie(name string) clientset.Interface {
client, err := b.Client(name)
if err != nil {
klog.Fatal(err)
}
return client
}

View File

@ -0,0 +1,501 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"fmt"
"sync"
apps "k8s.io/api/apps/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog"
)
type BaseControllerRefManager struct {
Controller metav1.Object
Selector labels.Selector
canAdoptErr error
canAdoptOnce sync.Once
CanAdoptFunc func() error
}
func (m *BaseControllerRefManager) CanAdopt() error {
m.canAdoptOnce.Do(func() {
if m.CanAdoptFunc != nil {
m.canAdoptErr = m.CanAdoptFunc()
}
})
return m.canAdoptErr
}
// ClaimObject tries to take ownership of an object for this controller.
//
// It will reconcile the following:
// * Adopt orphans if the match function returns true.
// * Release owned objects if the match function returns false.
//
// A non-nil error is returned if some form of reconciliation was attempted and
// failed. Usually, controllers should try again later in case reconciliation
// is still needed.
//
// If the error is nil, either the reconciliation succeeded, or no
// reconciliation was necessary. The returned boolean indicates whether you now
// own the object.
//
// No reconciliation will be attempted if the controller is being deleted.
func (m *BaseControllerRefManager) ClaimObject(obj metav1.Object, match func(metav1.Object) bool, adopt, release func(metav1.Object) error) (bool, error) {
controllerRef := metav1.GetControllerOf(obj)
if controllerRef != nil {
if controllerRef.UID != m.Controller.GetUID() {
// Owned by someone else. Ignore.
return false, nil
}
if match(obj) {
// We already own it and the selector matches.
// Return true (successfully claimed) before checking deletion timestamp.
// We're still allowed to claim things we already own while being deleted
// because doing so requires taking no actions.
return true, nil
}
// Owned by us but selector doesn't match.
// Try to release, unless we're being deleted.
if m.Controller.GetDeletionTimestamp() != nil {
return false, nil
}
if err := release(obj); err != nil {
// If the pod no longer exists, ignore the error.
if errors.IsNotFound(err) {
return false, nil
}
// Either someone else released it, or there was a transient error.
// The controller should requeue and try again if it's still stale.
return false, err
}
// Successfully released.
return false, nil
}
// It's an orphan.
if m.Controller.GetDeletionTimestamp() != nil || !match(obj) {
// Ignore if we're being deleted or selector doesn't match.
return false, nil
}
if obj.GetDeletionTimestamp() != nil {
// Ignore if the object is being deleted
return false, nil
}
// Selector matches. Try to adopt.
if err := adopt(obj); err != nil {
// If the pod no longer exists, ignore the error.
if errors.IsNotFound(err) {
return false, nil
}
// Either someone else claimed it first, or there was a transient error.
// The controller should requeue and try again if it's still orphaned.
return false, err
}
// Successfully adopted.
return true, nil
}
type PodControllerRefManager struct {
BaseControllerRefManager
controllerKind schema.GroupVersionKind
podControl PodControlInterface
}
// NewPodControllerRefManager returns a PodControllerRefManager that exposes
// methods to manage the controllerRef of pods.
//
// The CanAdopt() function can be used to perform a potentially expensive check
// (such as a live GET from the API server) prior to the first adoption.
// It will only be called (at most once) if an adoption is actually attempted.
// If CanAdopt() returns a non-nil error, all adoptions will fail.
//
// NOTE: Once CanAdopt() is called, it will not be called again by the same
// PodControllerRefManager instance. Create a new instance if it makes
// sense to check CanAdopt() again (e.g. in a different sync pass).
func NewPodControllerRefManager(
podControl PodControlInterface,
controller metav1.Object,
selector labels.Selector,
controllerKind schema.GroupVersionKind,
canAdopt func() error,
) *PodControllerRefManager {
return &PodControllerRefManager{
BaseControllerRefManager: BaseControllerRefManager{
Controller: controller,
Selector: selector,
CanAdoptFunc: canAdopt,
},
controllerKind: controllerKind,
podControl: podControl,
}
}
// ClaimPods tries to take ownership of a list of Pods.
//
// It will reconcile the following:
// * Adopt orphans if the selector matches.
// * Release owned objects if the selector no longer matches.
//
// Optional: If one or more filters are specified, a Pod will only be claimed if
// all filters return true.
//
// A non-nil error is returned if some form of reconciliation was attempted and
// failed. Usually, controllers should try again later in case reconciliation
// is still needed.
//
// If the error is nil, either the reconciliation succeeded, or no
// reconciliation was necessary. The list of Pods that you now own is returned.
func (m *PodControllerRefManager) ClaimPods(pods []*v1.Pod, filters ...func(*v1.Pod) bool) ([]*v1.Pod, error) {
var claimed []*v1.Pod
var errlist []error
match := func(obj metav1.Object) bool {
pod := obj.(*v1.Pod)
// Check selector first so filters only run on potentially matching Pods.
if !m.Selector.Matches(labels.Set(pod.Labels)) {
return false
}
for _, filter := range filters {
if !filter(pod) {
return false
}
}
return true
}
adopt := func(obj metav1.Object) error {
return m.AdoptPod(obj.(*v1.Pod))
}
release := func(obj metav1.Object) error {
return m.ReleasePod(obj.(*v1.Pod))
}
for _, pod := range pods {
ok, err := m.ClaimObject(pod, match, adopt, release)
if err != nil {
errlist = append(errlist, err)
continue
}
if ok {
claimed = append(claimed, pod)
}
}
return claimed, utilerrors.NewAggregate(errlist)
}
// AdoptPod sends a patch to take control of the pod. It returns the error if
// the patching fails.
func (m *PodControllerRefManager) AdoptPod(pod *v1.Pod) error {
if err := m.CanAdopt(); err != nil {
return fmt.Errorf("can't adopt Pod %v/%v (%v): %v", pod.Namespace, pod.Name, pod.UID, err)
}
// Note that ValidateOwnerReferences() will reject this patch if another
// OwnerReference exists with controller=true.
addControllerPatch := fmt.Sprintf(
`{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`,
m.controllerKind.GroupVersion(), m.controllerKind.Kind,
m.Controller.GetName(), m.Controller.GetUID(), pod.UID)
return m.podControl.PatchPod(pod.Namespace, pod.Name, []byte(addControllerPatch))
}
// ReleasePod sends a patch to free the pod from the control of the controller.
// It returns the error if the patching fails. 404 and 422 errors are ignored.
func (m *PodControllerRefManager) ReleasePod(pod *v1.Pod) error {
klog.V(2).Infof("patching pod %s_%s to remove its controllerRef to %s/%s:%s",
pod.Namespace, pod.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName())
deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), pod.UID)
err := m.podControl.PatchPod(pod.Namespace, pod.Name, []byte(deleteOwnerRefPatch))
if err != nil {
if errors.IsNotFound(err) {
// If the pod no longer exists, ignore it.
return nil
}
if errors.IsInvalid(err) {
// Invalid error will be returned in two cases: 1. the pod
// has no owner reference, 2. the uid of the pod doesn't
// match, which means the pod is deleted and then recreated.
// In both cases, the error can be ignored.
// TODO: If the pod has owner references, but none of them
// has the owner.UID, server will silently ignore the patch.
// Investigate why.
return nil
}
}
return err
}
// ReplicaSetControllerRefManager is used to manage controllerRef of ReplicaSets.
// Three methods are defined on this object 1: Classify 2: AdoptReplicaSet and
// 3: ReleaseReplicaSet which are used to classify the ReplicaSets into appropriate
// categories and accordingly adopt or release them. See comments on these functions
// for more details.
type ReplicaSetControllerRefManager struct {
BaseControllerRefManager
controllerKind schema.GroupVersionKind
rsControl RSControlInterface
}
// NewReplicaSetControllerRefManager returns a ReplicaSetControllerRefManager that exposes
// methods to manage the controllerRef of ReplicaSets.
//
// The CanAdopt() function can be used to perform a potentially expensive check
// (such as a live GET from the API server) prior to the first adoption.
// It will only be called (at most once) if an adoption is actually attempted.
// If CanAdopt() returns a non-nil error, all adoptions will fail.
//
// NOTE: Once CanAdopt() is called, it will not be called again by the same
// ReplicaSetControllerRefManager instance. Create a new instance if it
// makes sense to check CanAdopt() again (e.g. in a different sync pass).
func NewReplicaSetControllerRefManager(
rsControl RSControlInterface,
controller metav1.Object,
selector labels.Selector,
controllerKind schema.GroupVersionKind,
canAdopt func() error,
) *ReplicaSetControllerRefManager {
return &ReplicaSetControllerRefManager{
BaseControllerRefManager: BaseControllerRefManager{
Controller: controller,
Selector: selector,
CanAdoptFunc: canAdopt,
},
controllerKind: controllerKind,
rsControl: rsControl,
}
}
// ClaimReplicaSets tries to take ownership of a list of ReplicaSets.
//
// It will reconcile the following:
// * Adopt orphans if the selector matches.
// * Release owned objects if the selector no longer matches.
//
// A non-nil error is returned if some form of reconciliation was attempted and
// failed. Usually, controllers should try again later in case reconciliation
// is still needed.
//
// If the error is nil, either the reconciliation succeeded, or no
// reconciliation was necessary. The list of ReplicaSets that you now own is
// returned.
func (m *ReplicaSetControllerRefManager) ClaimReplicaSets(sets []*apps.ReplicaSet) ([]*apps.ReplicaSet, error) {
var claimed []*apps.ReplicaSet
var errlist []error
match := func(obj metav1.Object) bool {
return m.Selector.Matches(labels.Set(obj.GetLabels()))
}
adopt := func(obj metav1.Object) error {
return m.AdoptReplicaSet(obj.(*apps.ReplicaSet))
}
release := func(obj metav1.Object) error {
return m.ReleaseReplicaSet(obj.(*apps.ReplicaSet))
}
for _, rs := range sets {
ok, err := m.ClaimObject(rs, match, adopt, release)
if err != nil {
errlist = append(errlist, err)
continue
}
if ok {
claimed = append(claimed, rs)
}
}
return claimed, utilerrors.NewAggregate(errlist)
}
// AdoptReplicaSet sends a patch to take control of the ReplicaSet. It returns
// the error if the patching fails.
func (m *ReplicaSetControllerRefManager) AdoptReplicaSet(rs *apps.ReplicaSet) error {
if err := m.CanAdopt(); err != nil {
return fmt.Errorf("can't adopt ReplicaSet %v/%v (%v): %v", rs.Namespace, rs.Name, rs.UID, err)
}
// Note that ValidateOwnerReferences() will reject this patch if another
// OwnerReference exists with controller=true.
addControllerPatch := fmt.Sprintf(
`{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`,
m.controllerKind.GroupVersion(), m.controllerKind.Kind,
m.Controller.GetName(), m.Controller.GetUID(), rs.UID)
return m.rsControl.PatchReplicaSet(rs.Namespace, rs.Name, []byte(addControllerPatch))
}
// ReleaseReplicaSet sends a patch to free the ReplicaSet from the control of the Deployment controller.
// It returns the error if the patching fails. 404 and 422 errors are ignored.
func (m *ReplicaSetControllerRefManager) ReleaseReplicaSet(replicaSet *apps.ReplicaSet) error {
klog.V(2).Infof("patching ReplicaSet %s_%s to remove its controllerRef to %s/%s:%s",
replicaSet.Namespace, replicaSet.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName())
deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), replicaSet.UID)
err := m.rsControl.PatchReplicaSet(replicaSet.Namespace, replicaSet.Name, []byte(deleteOwnerRefPatch))
if err != nil {
if errors.IsNotFound(err) {
// If the ReplicaSet no longer exists, ignore it.
return nil
}
if errors.IsInvalid(err) {
// Invalid error will be returned in two cases: 1. the ReplicaSet
// has no owner reference, 2. the uid of the ReplicaSet doesn't
// match, which means the ReplicaSet is deleted and then recreated.
// In both cases, the error can be ignored.
return nil
}
}
return err
}
// RecheckDeletionTimestamp returns a CanAdopt() function to recheck deletion.
//
// The CanAdopt() function calls getObject() to fetch the latest value,
// and denies adoption attempts if that object has a non-nil DeletionTimestamp.
func RecheckDeletionTimestamp(getObject func() (metav1.Object, error)) func() error {
return func() error {
obj, err := getObject()
if err != nil {
return fmt.Errorf("can't recheck DeletionTimestamp: %v", err)
}
if obj.GetDeletionTimestamp() != nil {
return fmt.Errorf("%v/%v has just been deleted at %v", obj.GetNamespace(), obj.GetName(), obj.GetDeletionTimestamp())
}
return nil
}
}
// ControllerRevisionControllerRefManager is used to manage controllerRef of ControllerRevisions.
// Three methods are defined on this object 1: Classify 2: AdoptControllerRevision and
// 3: ReleaseControllerRevision which are used to classify the ControllerRevisions into appropriate
// categories and accordingly adopt or release them. See comments on these functions
// for more details.
type ControllerRevisionControllerRefManager struct {
BaseControllerRefManager
controllerKind schema.GroupVersionKind
crControl ControllerRevisionControlInterface
}
// NewControllerRevisionControllerRefManager returns a ControllerRevisionControllerRefManager that exposes
// methods to manage the controllerRef of ControllerRevisions.
//
// The canAdopt() function can be used to perform a potentially expensive check
// (such as a live GET from the API server) prior to the first adoption.
// It will only be called (at most once) if an adoption is actually attempted.
// If canAdopt() returns a non-nil error, all adoptions will fail.
//
// NOTE: Once canAdopt() is called, it will not be called again by the same
// ControllerRevisionControllerRefManager instance. Create a new instance if it
// makes sense to check canAdopt() again (e.g. in a different sync pass).
func NewControllerRevisionControllerRefManager(
crControl ControllerRevisionControlInterface,
controller metav1.Object,
selector labels.Selector,
controllerKind schema.GroupVersionKind,
canAdopt func() error,
) *ControllerRevisionControllerRefManager {
return &ControllerRevisionControllerRefManager{
BaseControllerRefManager: BaseControllerRefManager{
Controller: controller,
Selector: selector,
CanAdoptFunc: canAdopt,
},
controllerKind: controllerKind,
crControl: crControl,
}
}
// ClaimControllerRevisions tries to take ownership of a list of ControllerRevisions.
//
// It will reconcile the following:
// * Adopt orphans if the selector matches.
// * Release owned objects if the selector no longer matches.
//
// A non-nil error is returned if some form of reconciliation was attempted and
// failed. Usually, controllers should try again later in case reconciliation
// is still needed.
//
// If the error is nil, either the reconciliation succeeded, or no
// reconciliation was necessary. The list of ControllerRevisions that you now own is
// returned.
func (m *ControllerRevisionControllerRefManager) ClaimControllerRevisions(histories []*apps.ControllerRevision) ([]*apps.ControllerRevision, error) {
var claimed []*apps.ControllerRevision
var errlist []error
match := func(obj metav1.Object) bool {
return m.Selector.Matches(labels.Set(obj.GetLabels()))
}
adopt := func(obj metav1.Object) error {
return m.AdoptControllerRevision(obj.(*apps.ControllerRevision))
}
release := func(obj metav1.Object) error {
return m.ReleaseControllerRevision(obj.(*apps.ControllerRevision))
}
for _, h := range histories {
ok, err := m.ClaimObject(h, match, adopt, release)
if err != nil {
errlist = append(errlist, err)
continue
}
if ok {
claimed = append(claimed, h)
}
}
return claimed, utilerrors.NewAggregate(errlist)
}
// AdoptControllerRevision sends a patch to take control of the ControllerRevision. It returns the error if
// the patching fails.
func (m *ControllerRevisionControllerRefManager) AdoptControllerRevision(history *apps.ControllerRevision) error {
if err := m.CanAdopt(); err != nil {
return fmt.Errorf("can't adopt ControllerRevision %v/%v (%v): %v", history.Namespace, history.Name, history.UID, err)
}
// Note that ValidateOwnerReferences() will reject this patch if another
// OwnerReference exists with controller=true.
addControllerPatch := fmt.Sprintf(
`{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`,
m.controllerKind.GroupVersion(), m.controllerKind.Kind,
m.Controller.GetName(), m.Controller.GetUID(), history.UID)
return m.crControl.PatchControllerRevision(history.Namespace, history.Name, []byte(addControllerPatch))
}
// ReleaseControllerRevision sends a patch to free the ControllerRevision from the control of its controller.
// It returns the error if the patching fails. 404 and 422 errors are ignored.
func (m *ControllerRevisionControllerRefManager) ReleaseControllerRevision(history *apps.ControllerRevision) error {
klog.V(2).Infof("patching ControllerRevision %s_%s to remove its controllerRef to %s/%s:%s",
history.Namespace, history.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName())
deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), history.UID)
err := m.crControl.PatchControllerRevision(history.Namespace, history.Name, []byte(deleteOwnerRefPatch))
if err != nil {
if errors.IsNotFound(err) {
// If the ControllerRevision no longer exists, ignore it.
return nil
}
if errors.IsInvalid(err) {
// Invalid error will be returned in two cases: 1. the ControllerRevision
// has no owner reference, 2. the uid of the ControllerRevision doesn't
// match, which means the ControllerRevision is deleted and then recreated.
// In both cases, the error can be ignored.
return nil
}
}
return err
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,903 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"fmt"
"math"
"sort"
"strconv"
"strings"
"time"
"k8s.io/klog"
apps "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
intstrutil "k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/wait"
appsclient "k8s.io/client-go/kubernetes/typed/apps/v1"
"k8s.io/kubernetes/pkg/controller"
labelsutil "k8s.io/kubernetes/pkg/util/labels"
"k8s.io/utils/integer"
)
const (
// RevisionAnnotation is the revision annotation of a deployment's replica sets which records its rollout sequence
RevisionAnnotation = "deployment.kubernetes.io/revision"
// RevisionHistoryAnnotation maintains the history of all old revisions that a replica set has served for a deployment.
RevisionHistoryAnnotation = "deployment.kubernetes.io/revision-history"
// DesiredReplicasAnnotation is the desired replicas for a deployment recorded as an annotation
// in its replica sets. Helps in separating scaling events from the rollout process and for
// determining if the new replica set for a deployment is really saturated.
DesiredReplicasAnnotation = "deployment.kubernetes.io/desired-replicas"
// MaxReplicasAnnotation is the maximum replicas a deployment can have at a given point, which
// is deployment.spec.replicas + maxSurge. Used by the underlying replica sets to estimate their
// proportions in case the deployment has surge replicas.
MaxReplicasAnnotation = "deployment.kubernetes.io/max-replicas"
// RollbackRevisionNotFound is not found rollback event reason
RollbackRevisionNotFound = "DeploymentRollbackRevisionNotFound"
// RollbackTemplateUnchanged is the template unchanged rollback event reason
RollbackTemplateUnchanged = "DeploymentRollbackTemplateUnchanged"
// RollbackDone is the done rollback event reason
RollbackDone = "DeploymentRollback"
// Reasons for deployment conditions
//
// Progressing:
// ReplicaSetUpdatedReason is added in a deployment when one of its replica sets is updated as part
// of the rollout process.
ReplicaSetUpdatedReason = "ReplicaSetUpdated"
// FailedRSCreateReason is added in a deployment when it cannot create a new replica set.
FailedRSCreateReason = "ReplicaSetCreateError"
// NewReplicaSetReason is added in a deployment when it creates a new replica set.
NewReplicaSetReason = "NewReplicaSetCreated"
// FoundNewRSReason is added in a deployment when it adopts an existing replica set.
FoundNewRSReason = "FoundNewReplicaSet"
// NewRSAvailableReason is added in a deployment when its newest replica set is made available
// ie. the number of new pods that have passed readiness checks and run for at least minReadySeconds
// is at least the minimum available pods that need to run for the deployment.
NewRSAvailableReason = "NewReplicaSetAvailable"
// TimedOutReason is added in a deployment when its newest replica set fails to show any progress
// within the given deadline (progressDeadlineSeconds).
TimedOutReason = "ProgressDeadlineExceeded"
// PausedDeployReason is added in a deployment when it is paused. Lack of progress shouldn't be
// estimated once a deployment is paused.
PausedDeployReason = "DeploymentPaused"
// ResumedDeployReason is added in a deployment when it is resumed. Useful for not failing accidentally
// deployments that paused amidst a rollout and are bounded by a deadline.
ResumedDeployReason = "DeploymentResumed"
//
// Available:
// MinimumReplicasAvailable is added in a deployment when it has its minimum replicas required available.
MinimumReplicasAvailable = "MinimumReplicasAvailable"
// MinimumReplicasUnavailable is added in a deployment when it doesn't have the minimum required replicas
// available.
MinimumReplicasUnavailable = "MinimumReplicasUnavailable"
)
// NewDeploymentCondition creates a new deployment condition.
func NewDeploymentCondition(condType apps.DeploymentConditionType, status v1.ConditionStatus, reason, message string) *apps.DeploymentCondition {
return &apps.DeploymentCondition{
Type: condType,
Status: status,
LastUpdateTime: metav1.Now(),
LastTransitionTime: metav1.Now(),
Reason: reason,
Message: message,
}
}
// GetDeploymentCondition returns the condition with the provided type.
func GetDeploymentCondition(status apps.DeploymentStatus, condType apps.DeploymentConditionType) *apps.DeploymentCondition {
for i := range status.Conditions {
c := status.Conditions[i]
if c.Type == condType {
return &c
}
}
return nil
}
// SetDeploymentCondition updates the deployment to include the provided condition. If the condition that
// we are about to add already exists and has the same status and reason then we are not going to update.
func SetDeploymentCondition(status *apps.DeploymentStatus, condition apps.DeploymentCondition) {
currentCond := GetDeploymentCondition(*status, condition.Type)
if currentCond != nil && currentCond.Status == condition.Status && currentCond.Reason == condition.Reason {
return
}
// Do not update lastTransitionTime if the status of the condition doesn't change.
if currentCond != nil && currentCond.Status == condition.Status {
condition.LastTransitionTime = currentCond.LastTransitionTime
}
newConditions := filterOutCondition(status.Conditions, condition.Type)
status.Conditions = append(newConditions, condition)
}
// RemoveDeploymentCondition removes the deployment condition with the provided type.
func RemoveDeploymentCondition(status *apps.DeploymentStatus, condType apps.DeploymentConditionType) {
status.Conditions = filterOutCondition(status.Conditions, condType)
}
// filterOutCondition returns a new slice of deployment conditions without conditions with the provided type.
func filterOutCondition(conditions []apps.DeploymentCondition, condType apps.DeploymentConditionType) []apps.DeploymentCondition {
var newConditions []apps.DeploymentCondition
for _, c := range conditions {
if c.Type == condType {
continue
}
newConditions = append(newConditions, c)
}
return newConditions
}
// ReplicaSetToDeploymentCondition converts a replica set condition into a deployment condition.
// Useful for promoting replica set failure conditions into deployments.
func ReplicaSetToDeploymentCondition(cond apps.ReplicaSetCondition) apps.DeploymentCondition {
return apps.DeploymentCondition{
Type: apps.DeploymentConditionType(cond.Type),
Status: cond.Status,
LastTransitionTime: cond.LastTransitionTime,
LastUpdateTime: cond.LastTransitionTime,
Reason: cond.Reason,
Message: cond.Message,
}
}
// SetDeploymentRevision updates the revision for a deployment.
func SetDeploymentRevision(deployment *apps.Deployment, revision string) bool {
updated := false
if deployment.Annotations == nil {
deployment.Annotations = make(map[string]string)
}
if deployment.Annotations[RevisionAnnotation] != revision {
deployment.Annotations[RevisionAnnotation] = revision
updated = true
}
return updated
}
// MaxRevision finds the highest revision in the replica sets
func MaxRevision(allRSs []*apps.ReplicaSet) int64 {
max := int64(0)
for _, rs := range allRSs {
if v, err := Revision(rs); err != nil {
// Skip the replica sets when it failed to parse their revision information
klog.V(4).Infof("Error: %v. Couldn't parse revision for replica set %#v, deployment controller will skip it when reconciling revisions.", err, rs)
} else if v > max {
max = v
}
}
return max
}
// LastRevision finds the second max revision number in all replica sets (the last revision)
func LastRevision(allRSs []*apps.ReplicaSet) int64 {
max, secMax := int64(0), int64(0)
for _, rs := range allRSs {
if v, err := Revision(rs); err != nil {
// Skip the replica sets when it failed to parse their revision information
klog.V(4).Infof("Error: %v. Couldn't parse revision for replica set %#v, deployment controller will skip it when reconciling revisions.", err, rs)
} else if v >= max {
secMax = max
max = v
} else if v > secMax {
secMax = v
}
}
return secMax
}
// Revision returns the revision number of the input object.
func Revision(obj runtime.Object) (int64, error) {
acc, err := meta.Accessor(obj)
if err != nil {
return 0, err
}
v, ok := acc.GetAnnotations()[RevisionAnnotation]
if !ok {
return 0, nil
}
return strconv.ParseInt(v, 10, 64)
}
// SetNewReplicaSetAnnotations sets new replica set's annotations appropriately by updating its revision and
// copying required deployment annotations to it; it returns true if replica set's annotation is changed.
func SetNewReplicaSetAnnotations(deployment *apps.Deployment, newRS *apps.ReplicaSet, newRevision string, exists bool) bool {
// First, copy deployment's annotations (except for apply and revision annotations)
annotationChanged := copyDeploymentAnnotationsToReplicaSet(deployment, newRS)
// Then, update replica set's revision annotation
if newRS.Annotations == nil {
newRS.Annotations = make(map[string]string)
}
oldRevision, ok := newRS.Annotations[RevisionAnnotation]
// The newRS's revision should be the greatest among all RSes. Usually, its revision number is newRevision (the max revision number
// of all old RSes + 1). However, it's possible that some of the old RSes are deleted after the newRS revision being updated, and
// newRevision becomes smaller than newRS's revision. We should only update newRS revision when it's smaller than newRevision.
oldRevisionInt, err := strconv.ParseInt(oldRevision, 10, 64)
if err != nil {
if oldRevision != "" {
klog.Warningf("Updating replica set revision OldRevision not int %s", err)
return false
}
//If the RS annotation is empty then initialise it to 0
oldRevisionInt = 0
}
newRevisionInt, err := strconv.ParseInt(newRevision, 10, 64)
if err != nil {
klog.Warningf("Updating replica set revision NewRevision not int %s", err)
return false
}
if oldRevisionInt < newRevisionInt {
newRS.Annotations[RevisionAnnotation] = newRevision
annotationChanged = true
klog.V(4).Infof("Updating replica set %q revision to %s", newRS.Name, newRevision)
}
// If a revision annotation already existed and this replica set was updated with a new revision
// then that means we are rolling back to this replica set. We need to preserve the old revisions
// for historical information.
if ok && annotationChanged {
revisionHistoryAnnotation := newRS.Annotations[RevisionHistoryAnnotation]
oldRevisions := strings.Split(revisionHistoryAnnotation, ",")
if len(oldRevisions[0]) == 0 {
newRS.Annotations[RevisionHistoryAnnotation] = oldRevision
} else {
oldRevisions = append(oldRevisions, oldRevision)
newRS.Annotations[RevisionHistoryAnnotation] = strings.Join(oldRevisions, ",")
}
}
// If the new replica set is about to be created, we need to add replica annotations to it.
if !exists && SetReplicasAnnotations(newRS, *(deployment.Spec.Replicas), *(deployment.Spec.Replicas)+MaxSurge(*deployment)) {
annotationChanged = true
}
return annotationChanged
}
var annotationsToSkip = map[string]bool{
v1.LastAppliedConfigAnnotation: true,
RevisionAnnotation: true,
RevisionHistoryAnnotation: true,
DesiredReplicasAnnotation: true,
MaxReplicasAnnotation: true,
apps.DeprecatedRollbackTo: true,
}
// skipCopyAnnotation returns true if we should skip copying the annotation with the given annotation key
// TODO: How to decide which annotations should / should not be copied?
// See https://github.com/kubernetes/kubernetes/pull/20035#issuecomment-179558615
func skipCopyAnnotation(key string) bool {
return annotationsToSkip[key]
}
// copyDeploymentAnnotationsToReplicaSet copies deployment's annotations to replica set's annotations,
// and returns true if replica set's annotation is changed.
// Note that apply and revision annotations are not copied.
func copyDeploymentAnnotationsToReplicaSet(deployment *apps.Deployment, rs *apps.ReplicaSet) bool {
rsAnnotationsChanged := false
if rs.Annotations == nil {
rs.Annotations = make(map[string]string)
}
for k, v := range deployment.Annotations {
// newRS revision is updated automatically in getNewReplicaSet, and the deployment's revision number is then updated
// by copying its newRS revision number. We should not copy deployment's revision to its newRS, since the update of
// deployment revision number may fail (revision becomes stale) and the revision number in newRS is more reliable.
if skipCopyAnnotation(k) || rs.Annotations[k] == v {
continue
}
rs.Annotations[k] = v
rsAnnotationsChanged = true
}
return rsAnnotationsChanged
}
// SetDeploymentAnnotationsTo sets deployment's annotations as given RS's annotations.
// This action should be done if and only if the deployment is rolling back to this rs.
// Note that apply and revision annotations are not changed.
func SetDeploymentAnnotationsTo(deployment *apps.Deployment, rollbackToRS *apps.ReplicaSet) {
deployment.Annotations = getSkippedAnnotations(deployment.Annotations)
for k, v := range rollbackToRS.Annotations {
if !skipCopyAnnotation(k) {
deployment.Annotations[k] = v
}
}
}
func getSkippedAnnotations(annotations map[string]string) map[string]string {
skippedAnnotations := make(map[string]string)
for k, v := range annotations {
if skipCopyAnnotation(k) {
skippedAnnotations[k] = v
}
}
return skippedAnnotations
}
// FindActiveOrLatest returns the only active or the latest replica set in case there is at most one active
// replica set. If there are more active replica sets, then we should proportionally scale them.
func FindActiveOrLatest(newRS *apps.ReplicaSet, oldRSs []*apps.ReplicaSet) *apps.ReplicaSet {
if newRS == nil && len(oldRSs) == 0 {
return nil
}
sort.Sort(sort.Reverse(controller.ReplicaSetsByCreationTimestamp(oldRSs)))
allRSs := controller.FilterActiveReplicaSets(append(oldRSs, newRS))
switch len(allRSs) {
case 0:
// If there is no active replica set then we should return the newest.
if newRS != nil {
return newRS
}
return oldRSs[0]
case 1:
return allRSs[0]
default:
return nil
}
}
// GetDesiredReplicasAnnotation returns the number of desired replicas
func GetDesiredReplicasAnnotation(rs *apps.ReplicaSet) (int32, bool) {
return getIntFromAnnotation(rs, DesiredReplicasAnnotation)
}
func getMaxReplicasAnnotation(rs *apps.ReplicaSet) (int32, bool) {
return getIntFromAnnotation(rs, MaxReplicasAnnotation)
}
func getIntFromAnnotation(rs *apps.ReplicaSet, annotationKey string) (int32, bool) {
annotationValue, ok := rs.Annotations[annotationKey]
if !ok {
return int32(0), false
}
intValue, err := strconv.Atoi(annotationValue)
if err != nil {
klog.V(2).Infof("Cannot convert the value %q with annotation key %q for the replica set %q", annotationValue, annotationKey, rs.Name)
return int32(0), false
}
return int32(intValue), true
}
// SetReplicasAnnotations sets the desiredReplicas and maxReplicas into the annotations
func SetReplicasAnnotations(rs *apps.ReplicaSet, desiredReplicas, maxReplicas int32) bool {
updated := false
if rs.Annotations == nil {
rs.Annotations = make(map[string]string)
}
desiredString := fmt.Sprintf("%d", desiredReplicas)
if hasString := rs.Annotations[DesiredReplicasAnnotation]; hasString != desiredString {
rs.Annotations[DesiredReplicasAnnotation] = desiredString
updated = true
}
maxString := fmt.Sprintf("%d", maxReplicas)
if hasString := rs.Annotations[MaxReplicasAnnotation]; hasString != maxString {
rs.Annotations[MaxReplicasAnnotation] = maxString
updated = true
}
return updated
}
// ReplicasAnnotationsNeedUpdate return true if ReplicasAnnotations need to be updated
func ReplicasAnnotationsNeedUpdate(rs *apps.ReplicaSet, desiredReplicas, maxReplicas int32) bool {
if rs.Annotations == nil {
return true
}
desiredString := fmt.Sprintf("%d", desiredReplicas)
if hasString := rs.Annotations[DesiredReplicasAnnotation]; hasString != desiredString {
return true
}
maxString := fmt.Sprintf("%d", maxReplicas)
if hasString := rs.Annotations[MaxReplicasAnnotation]; hasString != maxString {
return true
}
return false
}
// MaxUnavailable returns the maximum unavailable pods a rolling deployment can take.
func MaxUnavailable(deployment apps.Deployment) int32 {
if !IsRollingUpdate(&deployment) || *(deployment.Spec.Replicas) == 0 {
return int32(0)
}
// Error caught by validation
_, maxUnavailable, _ := ResolveFenceposts(deployment.Spec.Strategy.RollingUpdate.MaxSurge, deployment.Spec.Strategy.RollingUpdate.MaxUnavailable, *(deployment.Spec.Replicas))
if maxUnavailable > *deployment.Spec.Replicas {
return *deployment.Spec.Replicas
}
return maxUnavailable
}
// MinAvailable returns the minimum available pods of a given deployment
func MinAvailable(deployment *apps.Deployment) int32 {
if !IsRollingUpdate(deployment) {
return int32(0)
}
return *(deployment.Spec.Replicas) - MaxUnavailable(*deployment)
}
// MaxSurge returns the maximum surge pods a rolling deployment can take.
func MaxSurge(deployment apps.Deployment) int32 {
if !IsRollingUpdate(&deployment) {
return int32(0)
}
// Error caught by validation
maxSurge, _, _ := ResolveFenceposts(deployment.Spec.Strategy.RollingUpdate.MaxSurge, deployment.Spec.Strategy.RollingUpdate.MaxUnavailable, *(deployment.Spec.Replicas))
return maxSurge
}
// GetProportion will estimate the proportion for the provided replica set using 1. the current size
// of the parent deployment, 2. the replica count that needs be added on the replica sets of the
// deployment, and 3. the total replicas added in the replica sets of the deployment so far.
func GetProportion(rs *apps.ReplicaSet, d apps.Deployment, deploymentReplicasToAdd, deploymentReplicasAdded int32) int32 {
if rs == nil || *(rs.Spec.Replicas) == 0 || deploymentReplicasToAdd == 0 || deploymentReplicasToAdd == deploymentReplicasAdded {
return int32(0)
}
rsFraction := getReplicaSetFraction(*rs, d)
allowed := deploymentReplicasToAdd - deploymentReplicasAdded
if deploymentReplicasToAdd > 0 {
// Use the minimum between the replica set fraction and the maximum allowed replicas
// when scaling up. This way we ensure we will not scale up more than the allowed
// replicas we can add.
return integer.Int32Min(rsFraction, allowed)
}
// Use the maximum between the replica set fraction and the maximum allowed replicas
// when scaling down. This way we ensure we will not scale down more than the allowed
// replicas we can remove.
return integer.Int32Max(rsFraction, allowed)
}
// getReplicaSetFraction estimates the fraction of replicas a replica set can have in
// 1. a scaling event during a rollout or 2. when scaling a paused deployment.
func getReplicaSetFraction(rs apps.ReplicaSet, d apps.Deployment) int32 {
// If we are scaling down to zero then the fraction of this replica set is its whole size (negative)
if *(d.Spec.Replicas) == int32(0) {
return -*(rs.Spec.Replicas)
}
deploymentReplicas := *(d.Spec.Replicas) + MaxSurge(d)
annotatedReplicas, ok := getMaxReplicasAnnotation(&rs)
if !ok {
// If we cannot find the annotation then fallback to the current deployment size. Note that this
// will not be an accurate proportion estimation in case other replica sets have different values
// which means that the deployment was scaled at some point but we at least will stay in limits
// due to the min-max comparisons in getProportion.
annotatedReplicas = d.Status.Replicas
}
// We should never proportionally scale up from zero which means rs.spec.replicas and annotatedReplicas
// will never be zero here.
newRSsize := (float64(*(rs.Spec.Replicas) * deploymentReplicas)) / float64(annotatedReplicas)
return integer.RoundToInt32(newRSsize) - *(rs.Spec.Replicas)
}
// GetAllReplicaSets returns the old and new replica sets targeted by the given Deployment. It gets PodList and ReplicaSetList from client interface.
// Note that the first set of old replica sets doesn't include the ones with no pods, and the second set of old replica sets include all old replica sets.
// The third returned value is the new replica set, and it may be nil if it doesn't exist yet.
func GetAllReplicaSets(deployment *apps.Deployment, c appsclient.AppsV1Interface) ([]*apps.ReplicaSet, []*apps.ReplicaSet, *apps.ReplicaSet, error) {
rsList, err := ListReplicaSets(deployment, RsListFromClient(c))
if err != nil {
return nil, nil, nil, err
}
oldRSes, allOldRSes := FindOldReplicaSets(deployment, rsList)
newRS := FindNewReplicaSet(deployment, rsList)
return oldRSes, allOldRSes, newRS, nil
}
// GetOldReplicaSets returns the old replica sets targeted by the given Deployment; get PodList and ReplicaSetList from client interface.
// Note that the first set of old replica sets doesn't include the ones with no pods, and the second set of old replica sets include all old replica sets.
func GetOldReplicaSets(deployment *apps.Deployment, c appsclient.AppsV1Interface) ([]*apps.ReplicaSet, []*apps.ReplicaSet, error) {
rsList, err := ListReplicaSets(deployment, RsListFromClient(c))
if err != nil {
return nil, nil, err
}
oldRSes, allOldRSes := FindOldReplicaSets(deployment, rsList)
return oldRSes, allOldRSes, nil
}
// GetNewReplicaSet returns a replica set that matches the intent of the given deployment; get ReplicaSetList from client interface.
// Returns nil if the new replica set doesn't exist yet.
func GetNewReplicaSet(deployment *apps.Deployment, c appsclient.AppsV1Interface) (*apps.ReplicaSet, error) {
rsList, err := ListReplicaSets(deployment, RsListFromClient(c))
if err != nil {
return nil, err
}
return FindNewReplicaSet(deployment, rsList), nil
}
// RsListFromClient returns an rsListFunc that wraps the given client.
func RsListFromClient(c appsclient.AppsV1Interface) RsListFunc {
return func(namespace string, options metav1.ListOptions) ([]*apps.ReplicaSet, error) {
rsList, err := c.ReplicaSets(namespace).List(options)
if err != nil {
return nil, err
}
var ret []*apps.ReplicaSet
for i := range rsList.Items {
ret = append(ret, &rsList.Items[i])
}
return ret, err
}
}
// TODO: switch RsListFunc and podListFunc to full namespacers
// RsListFunc returns the ReplicaSet from the ReplicaSet namespace and the List metav1.ListOptions.
type RsListFunc func(string, metav1.ListOptions) ([]*apps.ReplicaSet, error)
// podListFunc returns the PodList from the Pod namespace and the List metav1.ListOptions.
type podListFunc func(string, metav1.ListOptions) (*v1.PodList, error)
// ListReplicaSets returns a slice of RSes the given deployment targets.
// Note that this does NOT attempt to reconcile ControllerRef (adopt/orphan),
// because only the controller itself should do that.
// However, it does filter out anything whose ControllerRef doesn't match.
func ListReplicaSets(deployment *apps.Deployment, getRSList RsListFunc) ([]*apps.ReplicaSet, error) {
// TODO: Right now we list replica sets by their labels. We should list them by selector, i.e. the replica set's selector
// should be a superset of the deployment's selector, see https://github.com/kubernetes/kubernetes/issues/19830.
namespace := deployment.Namespace
selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
if err != nil {
return nil, err
}
options := metav1.ListOptions{LabelSelector: selector.String()}
all, err := getRSList(namespace, options)
if err != nil {
return nil, err
}
// Only include those whose ControllerRef matches the Deployment.
owned := make([]*apps.ReplicaSet, 0, len(all))
for _, rs := range all {
if metav1.IsControlledBy(rs, deployment) {
owned = append(owned, rs)
}
}
return owned, nil
}
// ListPods returns a list of pods the given deployment targets.
// This needs a list of ReplicaSets for the Deployment,
// which can be found with ListReplicaSets().
// Note that this does NOT attempt to reconcile ControllerRef (adopt/orphan),
// because only the controller itself should do that.
// However, it does filter out anything whose ControllerRef doesn't match.
func ListPods(deployment *apps.Deployment, rsList []*apps.ReplicaSet, getPodList podListFunc) (*v1.PodList, error) {
namespace := deployment.Namespace
selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
if err != nil {
return nil, err
}
options := metav1.ListOptions{LabelSelector: selector.String()}
all, err := getPodList(namespace, options)
if err != nil {
return all, err
}
// Only include those whose ControllerRef points to a ReplicaSet that is in
// turn owned by this Deployment.
rsMap := make(map[types.UID]bool, len(rsList))
for _, rs := range rsList {
rsMap[rs.UID] = true
}
owned := &v1.PodList{Items: make([]v1.Pod, 0, len(all.Items))}
for i := range all.Items {
pod := &all.Items[i]
controllerRef := metav1.GetControllerOf(pod)
if controllerRef != nil && rsMap[controllerRef.UID] {
owned.Items = append(owned.Items, *pod)
}
}
return owned, nil
}
// EqualIgnoreHash returns true if two given podTemplateSpec are equal, ignoring the diff in value of Labels[pod-template-hash]
// We ignore pod-template-hash because:
// 1. The hash result would be different upon podTemplateSpec API changes
// (e.g. the addition of a new field will cause the hash code to change)
// 2. The deployment template won't have hash labels
func EqualIgnoreHash(template1, template2 *v1.PodTemplateSpec) bool {
t1Copy := template1.DeepCopy()
t2Copy := template2.DeepCopy()
// Remove hash labels from template.Labels before comparing
delete(t1Copy.Labels, apps.DefaultDeploymentUniqueLabelKey)
delete(t2Copy.Labels, apps.DefaultDeploymentUniqueLabelKey)
return apiequality.Semantic.DeepEqual(t1Copy, t2Copy)
}
// FindNewReplicaSet returns the new RS this given deployment targets (the one with the same pod template).
func FindNewReplicaSet(deployment *apps.Deployment, rsList []*apps.ReplicaSet) *apps.ReplicaSet {
sort.Sort(controller.ReplicaSetsByCreationTimestamp(rsList))
for i := range rsList {
if EqualIgnoreHash(&rsList[i].Spec.Template, &deployment.Spec.Template) {
// In rare cases, such as after cluster upgrades, Deployment may end up with
// having more than one new ReplicaSets that have the same template as its template,
// see https://github.com/kubernetes/kubernetes/issues/40415
// We deterministically choose the oldest new ReplicaSet.
return rsList[i]
}
}
// new ReplicaSet does not exist.
return nil
}
// FindOldReplicaSets returns the old replica sets targeted by the given Deployment, with the given slice of RSes.
// Note that the first set of old replica sets doesn't include the ones with no pods, and the second set of old replica sets include all old replica sets.
func FindOldReplicaSets(deployment *apps.Deployment, rsList []*apps.ReplicaSet) ([]*apps.ReplicaSet, []*apps.ReplicaSet) {
var requiredRSs []*apps.ReplicaSet
var allRSs []*apps.ReplicaSet
newRS := FindNewReplicaSet(deployment, rsList)
for _, rs := range rsList {
// Filter out new replica set
if newRS != nil && rs.UID == newRS.UID {
continue
}
allRSs = append(allRSs, rs)
if *(rs.Spec.Replicas) != 0 {
requiredRSs = append(requiredRSs, rs)
}
}
return requiredRSs, allRSs
}
// SetFromReplicaSetTemplate sets the desired PodTemplateSpec from a replica set template to the given deployment.
func SetFromReplicaSetTemplate(deployment *apps.Deployment, template v1.PodTemplateSpec) *apps.Deployment {
deployment.Spec.Template.ObjectMeta = template.ObjectMeta
deployment.Spec.Template.Spec = template.Spec
deployment.Spec.Template.ObjectMeta.Labels = labelsutil.CloneAndRemoveLabel(
deployment.Spec.Template.ObjectMeta.Labels,
apps.DefaultDeploymentUniqueLabelKey)
return deployment
}
// GetReplicaCountForReplicaSets returns the sum of Replicas of the given replica sets.
func GetReplicaCountForReplicaSets(replicaSets []*apps.ReplicaSet) int32 {
totalReplicas := int32(0)
for _, rs := range replicaSets {
if rs != nil {
totalReplicas += *(rs.Spec.Replicas)
}
}
return totalReplicas
}
// GetActualReplicaCountForReplicaSets returns the sum of actual replicas of the given replica sets.
func GetActualReplicaCountForReplicaSets(replicaSets []*apps.ReplicaSet) int32 {
totalActualReplicas := int32(0)
for _, rs := range replicaSets {
if rs != nil {
totalActualReplicas += rs.Status.Replicas
}
}
return totalActualReplicas
}
// GetReadyReplicaCountForReplicaSets returns the number of ready pods corresponding to the given replica sets.
func GetReadyReplicaCountForReplicaSets(replicaSets []*apps.ReplicaSet) int32 {
totalReadyReplicas := int32(0)
for _, rs := range replicaSets {
if rs != nil {
totalReadyReplicas += rs.Status.ReadyReplicas
}
}
return totalReadyReplicas
}
// GetAvailableReplicaCountForReplicaSets returns the number of available pods corresponding to the given replica sets.
func GetAvailableReplicaCountForReplicaSets(replicaSets []*apps.ReplicaSet) int32 {
totalAvailableReplicas := int32(0)
for _, rs := range replicaSets {
if rs != nil {
totalAvailableReplicas += rs.Status.AvailableReplicas
}
}
return totalAvailableReplicas
}
// IsRollingUpdate returns true if the strategy type is a rolling update.
func IsRollingUpdate(deployment *apps.Deployment) bool {
return deployment.Spec.Strategy.Type == apps.RollingUpdateDeploymentStrategyType
}
// DeploymentComplete considers a deployment to be complete once all of its desired replicas
// are updated and available, and no old pods are running.
func DeploymentComplete(deployment *apps.Deployment, newStatus *apps.DeploymentStatus) bool {
return newStatus.UpdatedReplicas == *(deployment.Spec.Replicas) &&
newStatus.Replicas == *(deployment.Spec.Replicas) &&
newStatus.AvailableReplicas == *(deployment.Spec.Replicas) &&
newStatus.ObservedGeneration >= deployment.Generation
}
// DeploymentProgressing reports progress for a deployment. Progress is estimated by comparing the
// current with the new status of the deployment that the controller is observing. More specifically,
// when new pods are scaled up or become ready or available, or old pods are scaled down, then we
// consider the deployment is progressing.
func DeploymentProgressing(deployment *apps.Deployment, newStatus *apps.DeploymentStatus) bool {
oldStatus := deployment.Status
// Old replicas that need to be scaled down
oldStatusOldReplicas := oldStatus.Replicas - oldStatus.UpdatedReplicas
newStatusOldReplicas := newStatus.Replicas - newStatus.UpdatedReplicas
return (newStatus.UpdatedReplicas > oldStatus.UpdatedReplicas) ||
(newStatusOldReplicas < oldStatusOldReplicas) ||
newStatus.ReadyReplicas > deployment.Status.ReadyReplicas ||
newStatus.AvailableReplicas > deployment.Status.AvailableReplicas
}
// used for unit testing
var nowFn = func() time.Time { return time.Now() }
// DeploymentTimedOut considers a deployment to have timed out once its condition that reports progress
// is older than progressDeadlineSeconds or a Progressing condition with a TimedOutReason reason already
// exists.
func DeploymentTimedOut(deployment *apps.Deployment, newStatus *apps.DeploymentStatus) bool {
if !HasProgressDeadline(deployment) {
return false
}
// Look for the Progressing condition. If it doesn't exist, we have no base to estimate progress.
// If it's already set with a TimedOutReason reason, we have already timed out, no need to check
// again.
condition := GetDeploymentCondition(*newStatus, apps.DeploymentProgressing)
if condition == nil {
return false
}
// If the previous condition has been a successful rollout then we shouldn't try to
// estimate any progress. Scenario:
//
// * progressDeadlineSeconds is smaller than the difference between now and the time
// the last rollout finished in the past.
// * the creation of a new ReplicaSet triggers a resync of the Deployment prior to the
// cached copy of the Deployment getting updated with the status.condition that indicates
// the creation of the new ReplicaSet.
//
// The Deployment will be resynced and eventually its Progressing condition will catch
// up with the state of the world.
if condition.Reason == NewRSAvailableReason {
return false
}
if condition.Reason == TimedOutReason {
return true
}
// Look at the difference in seconds between now and the last time we reported any
// progress or tried to create a replica set, or resumed a paused deployment and
// compare against progressDeadlineSeconds.
from := condition.LastUpdateTime
now := nowFn()
delta := time.Duration(*deployment.Spec.ProgressDeadlineSeconds) * time.Second
timedOut := from.Add(delta).Before(now)
klog.V(4).Infof("Deployment %q timed out (%t) [last progress check: %v - now: %v]", deployment.Name, timedOut, from, now)
return timedOut
}
// NewRSNewReplicas calculates the number of replicas a deployment's new RS should have.
// When one of the followings is true, we're rolling out the deployment; otherwise, we're scaling it.
// 1) The new RS is saturated: newRS's replicas == deployment's replicas
// 2) Max number of pods allowed is reached: deployment's replicas + maxSurge == all RSs' replicas
func NewRSNewReplicas(deployment *apps.Deployment, allRSs []*apps.ReplicaSet, newRS *apps.ReplicaSet) (int32, error) {
switch deployment.Spec.Strategy.Type {
case apps.RollingUpdateDeploymentStrategyType:
// Check if we can scale up.
maxSurge, err := intstrutil.GetValueFromIntOrPercent(deployment.Spec.Strategy.RollingUpdate.MaxSurge, int(*(deployment.Spec.Replicas)), true)
if err != nil {
return 0, err
}
// Find the total number of pods
currentPodCount := GetReplicaCountForReplicaSets(allRSs)
maxTotalPods := *(deployment.Spec.Replicas) + int32(maxSurge)
if currentPodCount >= maxTotalPods {
// Cannot scale up.
return *(newRS.Spec.Replicas), nil
}
// Scale up.
scaleUpCount := maxTotalPods - currentPodCount
// Do not exceed the number of desired replicas.
scaleUpCount = int32(integer.IntMin(int(scaleUpCount), int(*(deployment.Spec.Replicas)-*(newRS.Spec.Replicas))))
return *(newRS.Spec.Replicas) + scaleUpCount, nil
case apps.RecreateDeploymentStrategyType:
return *(deployment.Spec.Replicas), nil
default:
return 0, fmt.Errorf("deployment type %v isn't supported", deployment.Spec.Strategy.Type)
}
}
// IsSaturated checks if the new replica set is saturated by comparing its size with its deployment size.
// Both the deployment and the replica set have to believe this replica set can own all of the desired
// replicas in the deployment and the annotation helps in achieving that. All pods of the ReplicaSet
// need to be available.
func IsSaturated(deployment *apps.Deployment, rs *apps.ReplicaSet) bool {
if rs == nil {
return false
}
desiredString := rs.Annotations[DesiredReplicasAnnotation]
desired, err := strconv.Atoi(desiredString)
if err != nil {
return false
}
return *(rs.Spec.Replicas) == *(deployment.Spec.Replicas) &&
int32(desired) == *(deployment.Spec.Replicas) &&
rs.Status.AvailableReplicas == *(deployment.Spec.Replicas)
}
// WaitForObservedDeployment polls for deployment to be updated so that deployment.Status.ObservedGeneration >= desiredGeneration.
// Returns error if polling timesout.
func WaitForObservedDeployment(getDeploymentFunc func() (*apps.Deployment, error), desiredGeneration int64, interval, timeout time.Duration) error {
// TODO: This should take clientset.Interface when all code is updated to use clientset. Keeping it this way allows the function to be used by callers who have client.Interface.
return wait.PollImmediate(interval, timeout, func() (bool, error) {
deployment, err := getDeploymentFunc()
if err != nil {
return false, err
}
return deployment.Status.ObservedGeneration >= desiredGeneration, nil
})
}
// ResolveFenceposts resolves both maxSurge and maxUnavailable. This needs to happen in one
// step. For example:
//
// 2 desired, max unavailable 1%, surge 0% - should scale old(-1), then new(+1), then old(-1), then new(+1)
// 1 desired, max unavailable 1%, surge 0% - should scale old(-1), then new(+1)
// 2 desired, max unavailable 25%, surge 1% - should scale new(+1), then old(-1), then new(+1), then old(-1)
// 1 desired, max unavailable 25%, surge 1% - should scale new(+1), then old(-1)
// 2 desired, max unavailable 0%, surge 1% - should scale new(+1), then old(-1), then new(+1), then old(-1)
// 1 desired, max unavailable 0%, surge 1% - should scale new(+1), then old(-1)
func ResolveFenceposts(maxSurge, maxUnavailable *intstrutil.IntOrString, desired int32) (int32, int32, error) {
surge, err := intstrutil.GetValueFromIntOrPercent(intstrutil.ValueOrDefault(maxSurge, intstrutil.FromInt(0)), int(desired), true)
if err != nil {
return 0, 0, err
}
unavailable, err := intstrutil.GetValueFromIntOrPercent(intstrutil.ValueOrDefault(maxUnavailable, intstrutil.FromInt(0)), int(desired), false)
if err != nil {
return 0, 0, err
}
if surge == 0 && unavailable == 0 {
// Validation should never allow the user to explicitly use zero values for both maxSurge
// maxUnavailable. Due to rounding down maxUnavailable though, it may resolve to zero.
// If both fenceposts resolve to zero, then we should set maxUnavailable to 1 on the
// theory that surge might not work due to quota.
unavailable = 1
}
return int32(surge), int32(unavailable), nil
}
// HasProgressDeadline checks if the Deployment d is expected to surface the reason
// "ProgressDeadlineExceeded" when the Deployment progress takes longer than expected time.
func HasProgressDeadline(d *apps.Deployment) bool {
return d.Spec.ProgressDeadlineSeconds != nil && *d.Spec.ProgressDeadlineSeconds != math.MaxInt32
}
// HasRevisionHistoryLimit checks if the Deployment d is expected to keep a specified number of
// old replicaSets. These replicaSets are mainly kept with the purpose of rollback.
// The RevisionHistoryLimit can start from 0 (no retained replicasSet). When set to math.MaxInt32,
// the Deployment will keep all revisions.
func HasRevisionHistoryLimit(d *apps.Deployment) bool {
return d.Spec.RevisionHistoryLimit != nil && *d.Spec.RevisionHistoryLimit != math.MaxInt32
}

19
vendor/k8s.io/kubernetes/pkg/controller/doc.go generated vendored Normal file
View File

@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package controller contains code for controllers (like the replication
// controller).
package controller // import "k8s.io/kubernetes/pkg/controller"

18
vendor/k8s.io/kubernetes/pkg/controller/job/doc.go generated vendored Normal file
View File

@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package job contains logic for watching and synchronizing jobs.
package job // import "k8s.io/kubernetes/pkg/controller/job"

View File

@ -0,0 +1,871 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
"fmt"
"math"
"reflect"
"sort"
"sync"
"time"
batch "k8s.io/api/batch/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
batchinformers "k8s.io/client-go/informers/batch/v1"
coreinformers "k8s.io/client-go/informers/core/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
batchv1listers "k8s.io/client-go/listers/batch/v1"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/util/metrics"
"k8s.io/utils/integer"
"k8s.io/klog"
)
const statusUpdateRetries = 3
// controllerKind contains the schema.GroupVersionKind for this controller type.
var controllerKind = batch.SchemeGroupVersion.WithKind("Job")
var (
// DefaultJobBackOff is the max backoff period, exported for the e2e test
DefaultJobBackOff = 10 * time.Second
// MaxJobBackOff is the max backoff period, exported for the e2e test
MaxJobBackOff = 360 * time.Second
)
type JobController struct {
kubeClient clientset.Interface
podControl controller.PodControlInterface
// To allow injection of updateJobStatus for testing.
updateHandler func(job *batch.Job) error
syncHandler func(jobKey string) (bool, error)
// podStoreSynced returns true if the pod store has been synced at least once.
// Added as a member to the struct to allow injection for testing.
podStoreSynced cache.InformerSynced
// jobStoreSynced returns true if the job store has been synced at least once.
// Added as a member to the struct to allow injection for testing.
jobStoreSynced cache.InformerSynced
// A TTLCache of pod creates/deletes each rc expects to see
expectations controller.ControllerExpectationsInterface
// A store of jobs
jobLister batchv1listers.JobLister
// A store of pods, populated by the podController
podStore corelisters.PodLister
// Jobs that need to be updated
queue workqueue.RateLimitingInterface
recorder record.EventRecorder
}
func NewJobController(podInformer coreinformers.PodInformer, jobInformer batchinformers.JobInformer, kubeClient clientset.Interface) *JobController {
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartLogging(klog.Infof)
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
if kubeClient != nil && kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil {
metrics.RegisterMetricAndTrackRateLimiterUsage("job_controller", kubeClient.CoreV1().RESTClient().GetRateLimiter())
}
jm := &JobController{
kubeClient: kubeClient,
podControl: controller.RealPodControl{
KubeClient: kubeClient,
Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "job-controller"}),
},
expectations: controller.NewControllerExpectations(),
queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultJobBackOff, MaxJobBackOff), "job"),
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "job-controller"}),
}
jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
jm.enqueueController(obj, true)
},
UpdateFunc: jm.updateJob,
DeleteFunc: func(obj interface{}) {
jm.enqueueController(obj, true)
},
})
jm.jobLister = jobInformer.Lister()
jm.jobStoreSynced = jobInformer.Informer().HasSynced
podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: jm.addPod,
UpdateFunc: jm.updatePod,
DeleteFunc: jm.deletePod,
})
jm.podStore = podInformer.Lister()
jm.podStoreSynced = podInformer.Informer().HasSynced
jm.updateHandler = jm.updateJobStatus
jm.syncHandler = jm.syncJob
return jm
}
// Run the main goroutine responsible for watching and syncing jobs.
func (jm *JobController) Run(workers int, stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
defer jm.queue.ShutDown()
klog.Infof("Starting job controller")
defer klog.Infof("Shutting down job controller")
if !controller.WaitForCacheSync("job", stopCh, jm.podStoreSynced, jm.jobStoreSynced) {
return
}
for i := 0; i < workers; i++ {
go wait.Until(jm.worker, time.Second, stopCh)
}
<-stopCh
}
// getPodJobs returns a list of Jobs that potentially match a Pod.
func (jm *JobController) getPodJobs(pod *v1.Pod) []*batch.Job {
jobs, err := jm.jobLister.GetPodJobs(pod)
if err != nil {
return nil
}
if len(jobs) > 1 {
// ControllerRef will ensure we don't do anything crazy, but more than one
// item in this list nevertheless constitutes user error.
utilruntime.HandleError(fmt.Errorf("user error! more than one job is selecting pods with labels: %+v", pod.Labels))
}
ret := make([]*batch.Job, 0, len(jobs))
for i := range jobs {
ret = append(ret, &jobs[i])
}
return ret
}
// resolveControllerRef returns the controller referenced by a ControllerRef,
// or nil if the ControllerRef could not be resolved to a matching controller
// of the correct Kind.
func (jm *JobController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *batch.Job {
// We can't look up by UID, so look up by Name and then verify UID.
// Don't even try to look up by Name if it's the wrong Kind.
if controllerRef.Kind != controllerKind.Kind {
return nil
}
job, err := jm.jobLister.Jobs(namespace).Get(controllerRef.Name)
if err != nil {
return nil
}
if job.UID != controllerRef.UID {
// The controller we found with this Name is not the same one that the
// ControllerRef points to.
return nil
}
return job
}
// When a pod is created, enqueue the controller that manages it and update it's expectations.
func (jm *JobController) addPod(obj interface{}) {
pod := obj.(*v1.Pod)
if pod.DeletionTimestamp != nil {
// on a restart of the controller controller, it's possible a new pod shows up in a state that
// is already pending deletion. Prevent the pod from being a creation observation.
jm.deletePod(pod)
return
}
// If it has a ControllerRef, that's all that matters.
if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
job := jm.resolveControllerRef(pod.Namespace, controllerRef)
if job == nil {
return
}
jobKey, err := controller.KeyFunc(job)
if err != nil {
return
}
jm.expectations.CreationObserved(jobKey)
jm.enqueueController(job, true)
return
}
// Otherwise, it's an orphan. Get a list of all matching controllers and sync
// them to see if anyone wants to adopt it.
// DO NOT observe creation because no controller should be waiting for an
// orphan.
for _, job := range jm.getPodJobs(pod) {
jm.enqueueController(job, true)
}
}
// When a pod is updated, figure out what job/s manage it and wake them up.
// If the labels of the pod have changed we need to awaken both the old
// and new job. old and cur must be *v1.Pod types.
func (jm *JobController) updatePod(old, cur interface{}) {
curPod := cur.(*v1.Pod)
oldPod := old.(*v1.Pod)
if curPod.ResourceVersion == oldPod.ResourceVersion {
// Periodic resync will send update events for all known pods.
// Two different versions of the same pod will always have different RVs.
return
}
if curPod.DeletionTimestamp != nil {
// when a pod is deleted gracefully it's deletion timestamp is first modified to reflect a grace period,
// and after such time has passed, the kubelet actually deletes it from the store. We receive an update
// for modification of the deletion timestamp and expect an job to create more pods asap, not wait
// until the kubelet actually deletes the pod.
jm.deletePod(curPod)
return
}
// the only time we want the backoff to kick-in, is when the pod failed
immediate := curPod.Status.Phase != v1.PodFailed
curControllerRef := metav1.GetControllerOf(curPod)
oldControllerRef := metav1.GetControllerOf(oldPod)
controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
if controllerRefChanged && oldControllerRef != nil {
// The ControllerRef was changed. Sync the old controller, if any.
if job := jm.resolveControllerRef(oldPod.Namespace, oldControllerRef); job != nil {
jm.enqueueController(job, immediate)
}
}
// If it has a ControllerRef, that's all that matters.
if curControllerRef != nil {
job := jm.resolveControllerRef(curPod.Namespace, curControllerRef)
if job == nil {
return
}
jm.enqueueController(job, immediate)
return
}
// Otherwise, it's an orphan. If anything changed, sync matching controllers
// to see if anyone wants to adopt it now.
labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
if labelChanged || controllerRefChanged {
for _, job := range jm.getPodJobs(curPod) {
jm.enqueueController(job, immediate)
}
}
}
// When a pod is deleted, enqueue the job that manages the pod and update its expectations.
// obj could be an *v1.Pod, or a DeletionFinalStateUnknown marker item.
func (jm *JobController) deletePod(obj interface{}) {
pod, ok := obj.(*v1.Pod)
// When a delete is dropped, the relist will notice a pod in the store not
// in the list, leading to the insertion of a tombstone object which contains
// the deleted key/value. Note that this value might be stale. If the pod
// changed labels the new job will not be woken up till the periodic resync.
if !ok {
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
return
}
pod, ok = tombstone.Obj.(*v1.Pod)
if !ok {
utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %+v", obj))
return
}
}
controllerRef := metav1.GetControllerOf(pod)
if controllerRef == nil {
// No controller should care about orphans being deleted.
return
}
job := jm.resolveControllerRef(pod.Namespace, controllerRef)
if job == nil {
return
}
jobKey, err := controller.KeyFunc(job)
if err != nil {
return
}
jm.expectations.DeletionObserved(jobKey)
jm.enqueueController(job, true)
}
func (jm *JobController) updateJob(old, cur interface{}) {
oldJob := old.(*batch.Job)
curJob := cur.(*batch.Job)
// never return error
key, err := controller.KeyFunc(curJob)
if err != nil {
return
}
jm.enqueueController(curJob, true)
// check if need to add a new rsync for ActiveDeadlineSeconds
if curJob.Status.StartTime != nil {
curADS := curJob.Spec.ActiveDeadlineSeconds
if curADS == nil {
return
}
oldADS := oldJob.Spec.ActiveDeadlineSeconds
if oldADS == nil || *oldADS != *curADS {
now := metav1.Now()
start := curJob.Status.StartTime.Time
passed := now.Time.Sub(start)
total := time.Duration(*curADS) * time.Second
// AddAfter will handle total < passed
jm.queue.AddAfter(key, total-passed)
klog.V(4).Infof("job ActiveDeadlineSeconds updated, will rsync after %d seconds", total-passed)
}
}
}
// obj could be an *batch.Job, or a DeletionFinalStateUnknown marker item,
// immediate tells the controller to update the status right away, and should
// happen ONLY when there was a successful pod run.
func (jm *JobController) enqueueController(obj interface{}, immediate bool) {
key, err := controller.KeyFunc(obj)
if err != nil {
utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
return
}
backoff := time.Duration(0)
if !immediate {
backoff = getBackoff(jm.queue, key)
}
// TODO: Handle overlapping controllers better. Either disallow them at admission time or
// deterministically avoid syncing controllers that fight over pods. Currently, we only
// ensure that the same controller is synced for a given pod. When we periodically relist
// all controllers there will still be some replica instability. One way to handle this is
// by querying the store for all controllers that this rc overlaps, as well as all
// controllers that overlap this rc, and sorting them.
jm.queue.AddAfter(key, backoff)
}
// worker runs a worker thread that just dequeues items, processes them, and marks them done.
// It enforces that the syncHandler is never invoked concurrently with the same key.
func (jm *JobController) worker() {
for jm.processNextWorkItem() {
}
}
func (jm *JobController) processNextWorkItem() bool {
key, quit := jm.queue.Get()
if quit {
return false
}
defer jm.queue.Done(key)
forget, err := jm.syncHandler(key.(string))
if err == nil {
if forget {
jm.queue.Forget(key)
}
return true
}
utilruntime.HandleError(fmt.Errorf("Error syncing job: %v", err))
jm.queue.AddRateLimited(key)
return true
}
// getPodsForJob returns the set of pods that this Job should manage.
// It also reconciles ControllerRef by adopting/orphaning.
// Note that the returned Pods are pointers into the cache.
func (jm *JobController) getPodsForJob(j *batch.Job) ([]*v1.Pod, error) {
selector, err := metav1.LabelSelectorAsSelector(j.Spec.Selector)
if err != nil {
return nil, fmt.Errorf("couldn't convert Job selector: %v", err)
}
// List all pods to include those that don't match the selector anymore
// but have a ControllerRef pointing to this controller.
pods, err := jm.podStore.Pods(j.Namespace).List(labels.Everything())
if err != nil {
return nil, err
}
// If any adoptions are attempted, we should first recheck for deletion
// with an uncached quorum read sometime after listing Pods (see #42639).
canAdoptFunc := controller.RecheckDeletionTimestamp(func() (metav1.Object, error) {
fresh, err := jm.kubeClient.BatchV1().Jobs(j.Namespace).Get(j.Name, metav1.GetOptions{})
if err != nil {
return nil, err
}
if fresh.UID != j.UID {
return nil, fmt.Errorf("original Job %v/%v is gone: got uid %v, wanted %v", j.Namespace, j.Name, fresh.UID, j.UID)
}
return fresh, nil
})
cm := controller.NewPodControllerRefManager(jm.podControl, j, selector, controllerKind, canAdoptFunc)
return cm.ClaimPods(pods)
}
// syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
// it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
// concurrently with the same key.
func (jm *JobController) syncJob(key string) (bool, error) {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing job %q (%v)", key, time.Since(startTime))
}()
ns, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return false, err
}
if len(ns) == 0 || len(name) == 0 {
return false, fmt.Errorf("invalid job key %q: either namespace or name is missing", key)
}
sharedJob, err := jm.jobLister.Jobs(ns).Get(name)
if err != nil {
if errors.IsNotFound(err) {
klog.V(4).Infof("Job has been deleted: %v", key)
jm.expectations.DeleteExpectations(key)
return true, nil
}
return false, err
}
job := *sharedJob
// if job was finished previously, we don't want to redo the termination
if IsJobFinished(&job) {
return true, nil
}
// retrieve the previous number of retry
previousRetry := jm.queue.NumRequeues(key)
// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
// and update the expectations after we've retrieved active pods from the store. If a new pod enters
// the store after we've checked the expectation, the job sync is just deferred till the next relist.
jobNeedsSync := jm.expectations.SatisfiedExpectations(key)
pods, err := jm.getPodsForJob(&job)
if err != nil {
return false, err
}
activePods := controller.FilterActivePods(pods)
active := int32(len(activePods))
succeeded, failed := getStatus(pods)
conditions := len(job.Status.Conditions)
// job first start
if job.Status.StartTime == nil {
now := metav1.Now()
job.Status.StartTime = &now
// enqueue a sync to check if job past ActiveDeadlineSeconds
if job.Spec.ActiveDeadlineSeconds != nil {
klog.V(4).Infof("Job %s have ActiveDeadlineSeconds will sync after %d seconds",
key, *job.Spec.ActiveDeadlineSeconds)
jm.queue.AddAfter(key, time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second)
}
}
var manageJobErr error
jobFailed := false
var failureReason string
var failureMessage string
jobHaveNewFailure := failed > job.Status.Failed
// new failures happen when status does not reflect the failures and active
// is different than parallelism, otherwise the previous controller loop
// failed updating status so even if we pick up failure it is not a new one
exceedsBackoffLimit := jobHaveNewFailure && (active != *job.Spec.Parallelism) &&
(int32(previousRetry)+1 > *job.Spec.BackoffLimit)
if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
// check if the number of pod restart exceeds backoff (for restart OnFailure only)
// OR if the number of failed jobs increased since the last syncJob
jobFailed = true
failureReason = "BackoffLimitExceeded"
failureMessage = "Job has reached the specified backoff limit"
} else if pastActiveDeadline(&job) {
jobFailed = true
failureReason = "DeadlineExceeded"
failureMessage = "Job was active longer than specified deadline"
}
if jobFailed {
errCh := make(chan error, active)
jm.deleteJobPods(&job, activePods, errCh)
select {
case manageJobErr = <-errCh:
if manageJobErr != nil {
break
}
default:
}
// update status values accordingly
failed += active
active = 0
job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, failureReason, failureMessage))
jm.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
} else {
if jobNeedsSync && job.DeletionTimestamp == nil {
active, manageJobErr = jm.manageJob(activePods, succeeded, &job)
}
completions := succeeded
complete := false
if job.Spec.Completions == nil {
// This type of job is complete when any pod exits with success.
// Each pod is capable of
// determining whether or not the entire Job is done. Subsequent pods are
// not expected to fail, but if they do, the failure is ignored. Once any
// pod succeeds, the controller waits for remaining pods to finish, and
// then the job is complete.
if succeeded > 0 && active == 0 {
complete = true
}
} else {
// Job specifies a number of completions. This type of job signals
// success by having that number of successes. Since we do not
// start more pods than there are remaining completions, there should
// not be any remaining active pods once this count is reached.
if completions >= *job.Spec.Completions {
complete = true
if active > 0 {
jm.recorder.Event(&job, v1.EventTypeWarning, "TooManyActivePods", "Too many active pods running after completion count reached")
}
if completions > *job.Spec.Completions {
jm.recorder.Event(&job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
}
}
}
if complete {
job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobComplete, "", ""))
now := metav1.Now()
job.Status.CompletionTime = &now
}
}
forget := false
// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
// improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed.
// In this case, we should clear the backoff delay.
if job.Status.Succeeded < succeeded {
forget = true
}
// no need to update the job if the status hasn't changed since last time
if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
job.Status.Active = active
job.Status.Succeeded = succeeded
job.Status.Failed = failed
if err := jm.updateHandler(&job); err != nil {
return forget, err
}
if jobHaveNewFailure && !IsJobFinished(&job) {
// returning an error will re-enqueue Job after the backoff period
return forget, fmt.Errorf("failed pod(s) detected for job key %q", key)
}
forget = true
}
return forget, manageJobErr
}
func (jm *JobController) deleteJobPods(job *batch.Job, pods []*v1.Pod, errCh chan<- error) {
// TODO: below code should be replaced with pod termination resulting in
// pod failures, rather than killing pods. Unfortunately none such solution
// exists ATM. There's an open discussion in the topic in
// https://github.com/kubernetes/kubernetes/issues/14602 which might give
// some sort of solution to above problem.
// kill remaining active pods
wait := sync.WaitGroup{}
nbPods := len(pods)
wait.Add(nbPods)
for i := int32(0); i < int32(nbPods); i++ {
go func(ix int32) {
defer wait.Done()
if err := jm.podControl.DeletePod(job.Namespace, pods[ix].Name, job); err != nil {
defer utilruntime.HandleError(err)
klog.V(2).Infof("Failed to delete %v, job %q/%q deadline exceeded", pods[ix].Name, job.Namespace, job.Name)
errCh <- err
}
}(i)
}
wait.Wait()
}
// pastBackoffLimitOnFailure checks if container restartCounts sum exceeds BackoffLimit
// this method applies only to pods with restartPolicy == OnFailure
func pastBackoffLimitOnFailure(job *batch.Job, pods []*v1.Pod) bool {
if job.Spec.Template.Spec.RestartPolicy != v1.RestartPolicyOnFailure {
return false
}
result := int32(0)
for i := range pods {
po := pods[i]
if po.Status.Phase != v1.PodRunning {
continue
}
for j := range po.Status.InitContainerStatuses {
stat := po.Status.InitContainerStatuses[j]
result += stat.RestartCount
}
for j := range po.Status.ContainerStatuses {
stat := po.Status.ContainerStatuses[j]
result += stat.RestartCount
}
}
if *job.Spec.BackoffLimit == 0 {
return result > 0
}
return result >= *job.Spec.BackoffLimit
}
// pastActiveDeadline checks if job has ActiveDeadlineSeconds field set and if it is exceeded.
func pastActiveDeadline(job *batch.Job) bool {
if job.Spec.ActiveDeadlineSeconds == nil || job.Status.StartTime == nil {
return false
}
now := metav1.Now()
start := job.Status.StartTime.Time
duration := now.Time.Sub(start)
allowedDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds) * time.Second
return duration >= allowedDuration
}
func newCondition(conditionType batch.JobConditionType, reason, message string) batch.JobCondition {
return batch.JobCondition{
Type: conditionType,
Status: v1.ConditionTrue,
LastProbeTime: metav1.Now(),
LastTransitionTime: metav1.Now(),
Reason: reason,
Message: message,
}
}
// getStatus returns no of succeeded and failed pods running a job
func getStatus(pods []*v1.Pod) (succeeded, failed int32) {
succeeded = int32(filterPods(pods, v1.PodSucceeded))
failed = int32(filterPods(pods, v1.PodFailed))
return
}
// manageJob is the core method responsible for managing the number of running
// pods according to what is specified in the job.Spec.
// Does NOT modify <activePods>.
func (jm *JobController) manageJob(activePods []*v1.Pod, succeeded int32, job *batch.Job) (int32, error) {
var activeLock sync.Mutex
active := int32(len(activePods))
parallelism := *job.Spec.Parallelism
jobKey, err := controller.KeyFunc(job)
if err != nil {
utilruntime.HandleError(fmt.Errorf("Couldn't get key for job %#v: %v", job, err))
return 0, nil
}
var errCh chan error
if active > parallelism {
diff := active - parallelism
errCh = make(chan error, diff)
jm.expectations.ExpectDeletions(jobKey, int(diff))
klog.V(4).Infof("Too many pods running job %q, need %d, deleting %d", jobKey, parallelism, diff)
// Sort the pods in the order such that not-ready < ready, unscheduled
// < scheduled, and pending < running. This ensures that we delete pods
// in the earlier stages whenever possible.
sort.Sort(controller.ActivePods(activePods))
active -= diff
wait := sync.WaitGroup{}
wait.Add(int(diff))
for i := int32(0); i < diff; i++ {
go func(ix int32) {
defer wait.Done()
if err := jm.podControl.DeletePod(job.Namespace, activePods[ix].Name, job); err != nil {
defer utilruntime.HandleError(err)
// Decrement the expected number of deletes because the informer won't observe this deletion
klog.V(2).Infof("Failed to delete %v, decrementing expectations for job %q/%q", activePods[ix].Name, job.Namespace, job.Name)
jm.expectations.DeletionObserved(jobKey)
activeLock.Lock()
active++
activeLock.Unlock()
errCh <- err
}
}(i)
}
wait.Wait()
} else if active < parallelism {
wantActive := int32(0)
if job.Spec.Completions == nil {
// Job does not specify a number of completions. Therefore, number active
// should be equal to parallelism, unless the job has seen at least
// once success, in which leave whatever is running, running.
if succeeded > 0 {
wantActive = active
} else {
wantActive = parallelism
}
} else {
// Job specifies a specific number of completions. Therefore, number
// active should not ever exceed number of remaining completions.
wantActive = *job.Spec.Completions - succeeded
if wantActive > parallelism {
wantActive = parallelism
}
}
diff := wantActive - active
if diff < 0 {
utilruntime.HandleError(fmt.Errorf("More active than wanted: job %q, want %d, have %d", jobKey, wantActive, active))
diff = 0
}
jm.expectations.ExpectCreations(jobKey, int(diff))
errCh = make(chan error, diff)
klog.V(4).Infof("Too few pods running job %q, need %d, creating %d", jobKey, wantActive, diff)
active += diff
wait := sync.WaitGroup{}
// Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
// and double with each successful iteration in a kind of "slow start".
// This handles attempts to start large numbers of pods that would
// likely all fail with the same error. For example a project with a
// low quota that attempts to create a large number of pods will be
// prevented from spamming the API service with the pod create requests
// after one of its pods fails. Conveniently, this also prevents the
// event spam that those failures would generate.
for batchSize := int32(integer.IntMin(int(diff), controller.SlowStartInitialBatchSize)); diff > 0; batchSize = integer.Int32Min(2*batchSize, diff) {
errorCount := len(errCh)
wait.Add(int(batchSize))
for i := int32(0); i < batchSize; i++ {
go func() {
defer wait.Done()
err := jm.podControl.CreatePodsWithControllerRef(job.Namespace, &job.Spec.Template, job, metav1.NewControllerRef(job, controllerKind))
if err != nil && errors.IsTimeout(err) {
// Pod is created but its initialization has timed out.
// If the initialization is successful eventually, the
// controller will observe the creation via the informer.
// If the initialization fails, or if the pod keeps
// uninitialized for a long time, the informer will not
// receive any update, and the controller will create a new
// pod when the expectation expires.
return
}
if err != nil {
defer utilruntime.HandleError(err)
// Decrement the expected number of creates because the informer won't observe this pod
klog.V(2).Infof("Failed creation, decrementing expectations for job %q/%q", job.Namespace, job.Name)
jm.expectations.CreationObserved(jobKey)
activeLock.Lock()
active--
activeLock.Unlock()
errCh <- err
}
}()
}
wait.Wait()
// any skipped pods that we never attempted to start shouldn't be expected.
skippedPods := diff - batchSize
if errorCount < len(errCh) && skippedPods > 0 {
klog.V(2).Infof("Slow-start failure. Skipping creation of %d pods, decrementing expectations for job %q/%q", skippedPods, job.Namespace, job.Name)
active -= skippedPods
for i := int32(0); i < skippedPods; i++ {
// Decrement the expected number of creates because the informer won't observe this pod
jm.expectations.CreationObserved(jobKey)
}
// The skipped pods will be retried later. The next controller resync will
// retry the slow start process.
break
}
diff -= batchSize
}
}
select {
case err := <-errCh:
// all errors have been reported before, we only need to inform the controller that there was an error and it should re-try this job once more next time.
if err != nil {
return active, err
}
default:
}
return active, nil
}
func (jm *JobController) updateJobStatus(job *batch.Job) error {
jobClient := jm.kubeClient.BatchV1().Jobs(job.Namespace)
var err error
for i := 0; i <= statusUpdateRetries; i = i + 1 {
var newJob *batch.Job
newJob, err = jobClient.Get(job.Name, metav1.GetOptions{})
if err != nil {
break
}
newJob.Status = job.Status
if _, err = jobClient.UpdateStatus(newJob); err == nil {
break
}
}
return err
}
func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration {
exp := queue.NumRequeues(key)
if exp <= 0 {
return time.Duration(0)
}
// The backoff is capped such that 'calculated' value never overflows.
backoff := float64(DefaultJobBackOff.Nanoseconds()) * math.Pow(2, float64(exp-1))
if backoff > math.MaxInt64 {
return MaxJobBackOff
}
calculated := time.Duration(backoff)
if calculated > MaxJobBackOff {
return MaxJobBackOff
}
return calculated
}
// filterPods returns pods based on their phase.
func filterPods(pods []*v1.Pod, phase v1.PodPhase) int {
result := 0
for i := range pods {
if phase == pods[i].Status.Phase {
result++
}
}
return result
}

31
vendor/k8s.io/kubernetes/pkg/controller/job/utils.go generated vendored Normal file
View File

@ -0,0 +1,31 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package job
import (
batch "k8s.io/api/batch/v1"
"k8s.io/api/core/v1"
)
func IsJobFinished(j *batch.Job) bool {
for _, c := range j.Status.Conditions {
if (c.Type == batch.JobComplete || c.Type == batch.JobFailed) && c.Status == v1.ConditionTrue {
return true
}
}
return false
}

View File

@ -0,0 +1,92 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"hash/fnv"
"sync"
"github.com/golang/groupcache/lru"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
hashutil "k8s.io/kubernetes/pkg/util/hash"
)
type objectWithMeta interface {
metav1.Object
}
// keyFunc returns the key of an object, which is used to look up in the cache for it's matching object.
// Since we match objects by namespace and Labels/Selector, so if two objects have the same namespace and labels,
// they will have the same key.
func keyFunc(obj objectWithMeta) uint64 {
hash := fnv.New32a()
hashutil.DeepHashObject(hash, &equivalenceLabelObj{
namespace: obj.GetNamespace(),
labels: obj.GetLabels(),
})
return uint64(hash.Sum32())
}
type equivalenceLabelObj struct {
namespace string
labels map[string]string
}
// MatchingCache save label and selector matching relationship
type MatchingCache struct {
mutex sync.RWMutex
cache *lru.Cache
}
// NewMatchingCache return a NewMatchingCache, which save label and selector matching relationship.
func NewMatchingCache(maxCacheEntries int) *MatchingCache {
return &MatchingCache{
cache: lru.New(maxCacheEntries),
}
}
// Add will add matching information to the cache.
func (c *MatchingCache) Add(labelObj objectWithMeta, selectorObj objectWithMeta) {
key := keyFunc(labelObj)
c.mutex.Lock()
defer c.mutex.Unlock()
c.cache.Add(key, selectorObj)
}
// GetMatchingObject lookup the matching object for a given object.
// Note: the cache information may be invalid since the controller may be deleted or updated,
// we need check in the external request to ensure the cache data is not dirty.
func (c *MatchingCache) GetMatchingObject(labelObj objectWithMeta) (controller interface{}, exists bool) {
key := keyFunc(labelObj)
// NOTE: we use Lock() instead of RLock() here because lru's Get() method also modifies state(
// it need update the least recently usage information). So we can not call it concurrently.
c.mutex.Lock()
defer c.mutex.Unlock()
return c.cache.Get(key)
}
// Update update the cached matching information.
func (c *MatchingCache) Update(labelObj objectWithMeta, selectorObj objectWithMeta) {
c.Add(labelObj, selectorObj)
}
// InvalidateAll invalidate the whole cache.
func (c *MatchingCache) InvalidateAll() {
c.mutex.Lock()
defer c.mutex.Unlock()
c.cache = lru.New(c.cache.MaxEntries)
}

View File

@ -0,0 +1,78 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodelifecycle
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
)
const (
nodeControllerSubsystem = "node_collector"
zoneHealthStatisticKey = "zone_health"
zoneSizeKey = "zone_size"
zoneNoUnhealthyNodesKey = "unhealthy_nodes_in_zone"
evictionsNumberKey = "evictions_number"
)
var (
zoneHealth = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: nodeControllerSubsystem,
Name: zoneHealthStatisticKey,
Help: "Gauge measuring percentage of healthy nodes per zone.",
},
[]string{"zone"},
)
zoneSize = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: nodeControllerSubsystem,
Name: zoneSizeKey,
Help: "Gauge measuring number of registered Nodes per zones.",
},
[]string{"zone"},
)
unhealthyNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: nodeControllerSubsystem,
Name: zoneNoUnhealthyNodesKey,
Help: "Gauge measuring number of not Ready Nodes per zones.",
},
[]string{"zone"},
)
evictionsNumber = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: nodeControllerSubsystem,
Name: evictionsNumberKey,
Help: "Number of Node evictions that happened since current instance of NodeController started.",
},
[]string{"zone"},
)
)
var registerMetrics sync.Once
// Register the metrics that are to be monitored.
func Register() {
registerMetrics.Do(func() {
prometheus.MustRegister(zoneHealth)
prometheus.MustRegister(zoneSize)
prometheus.MustRegister(unhealthyNodes)
prometheus.MustRegister(evictionsNumber)
})
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,309 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"container/heap"
"sync"
"time"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/util/flowcontrol"
"k8s.io/klog"
)
const (
// NodeHealthUpdateRetry controls the number of retries of writing
// node health update.
NodeHealthUpdateRetry = 5
// NodeEvictionPeriod controls how often NodeController will try to
// evict Pods from non-responsive Nodes.
NodeEvictionPeriod = 100 * time.Millisecond
// EvictionRateLimiterBurst is the burst value for all eviction rate
// limiters
EvictionRateLimiterBurst = 1
)
// TimedValue is a value that should be processed at a designated time.
type TimedValue struct {
Value string
// UID could be anything that helps identify the value
UID interface{}
AddedAt time.Time
ProcessAt time.Time
}
// now is used to test time
var now = time.Now
// TimedQueue is a priority heap where the lowest ProcessAt is at the front of the queue
type TimedQueue []*TimedValue
// Len is the length of the queue.
func (h TimedQueue) Len() int { return len(h) }
// Less returns true if queue[i] < queue[j].
func (h TimedQueue) Less(i, j int) bool { return h[i].ProcessAt.Before(h[j].ProcessAt) }
// Swap swaps index i and j.
func (h TimedQueue) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
// Push a new TimedValue on to the queue.
func (h *TimedQueue) Push(x interface{}) {
*h = append(*h, x.(*TimedValue))
}
// Pop the lowest ProcessAt item.
func (h *TimedQueue) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
// UniqueQueue is a FIFO queue which additionally guarantees that any
// element can be added only once until it is removed.
type UniqueQueue struct {
lock sync.Mutex
queue TimedQueue
set sets.String
}
// Add a new value to the queue if it wasn't added before, or was
// explicitly removed by the Remove call. Returns true if new value
// was added.
func (q *UniqueQueue) Add(value TimedValue) bool {
q.lock.Lock()
defer q.lock.Unlock()
if q.set.Has(value.Value) {
return false
}
heap.Push(&q.queue, &value)
q.set.Insert(value.Value)
return true
}
// Replace replaces an existing value in the queue if it already
// exists, otherwise it does nothing. Returns true if the item was
// found.
func (q *UniqueQueue) Replace(value TimedValue) bool {
q.lock.Lock()
defer q.lock.Unlock()
for i := range q.queue {
if q.queue[i].Value != value.Value {
continue
}
heap.Remove(&q.queue, i)
heap.Push(&q.queue, &value)
return true
}
return false
}
// RemoveFromQueue the value from the queue, but keeps it in the set,
// so it won't be added second time. Returns true if something was
// removed.
func (q *UniqueQueue) RemoveFromQueue(value string) bool {
q.lock.Lock()
defer q.lock.Unlock()
if !q.set.Has(value) {
return false
}
for i, val := range q.queue {
if val.Value == value {
heap.Remove(&q.queue, i)
return true
}
}
return false
}
// Remove the value from the queue, so Get() call won't return it, and
// allow subsequent addition of the given value. If the value is not
// present does nothing and returns false.
func (q *UniqueQueue) Remove(value string) bool {
q.lock.Lock()
defer q.lock.Unlock()
if !q.set.Has(value) {
return false
}
q.set.Delete(value)
for i, val := range q.queue {
if val.Value == value {
heap.Remove(&q.queue, i)
return true
}
}
return true
}
// Get returns the oldest added value that wasn't returned yet.
func (q *UniqueQueue) Get() (TimedValue, bool) {
q.lock.Lock()
defer q.lock.Unlock()
if len(q.queue) == 0 {
return TimedValue{}, false
}
result := heap.Pop(&q.queue).(*TimedValue)
q.set.Delete(result.Value)
return *result, true
}
// Head returns the oldest added value that wasn't returned yet
// without removing it.
func (q *UniqueQueue) Head() (TimedValue, bool) {
q.lock.Lock()
defer q.lock.Unlock()
if len(q.queue) == 0 {
return TimedValue{}, false
}
result := q.queue[0]
return *result, true
}
// Clear removes all items from the queue and duplication preventing
// set.
func (q *UniqueQueue) Clear() {
q.lock.Lock()
defer q.lock.Unlock()
if q.queue.Len() > 0 {
q.queue = make(TimedQueue, 0)
}
if len(q.set) > 0 {
q.set = sets.NewString()
}
}
// RateLimitedTimedQueue is a unique item priority queue ordered by
// the expected next time of execution. It is also rate limited.
type RateLimitedTimedQueue struct {
queue UniqueQueue
limiterLock sync.Mutex
limiter flowcontrol.RateLimiter
}
// NewRateLimitedTimedQueue creates new queue which will use given
// RateLimiter to oversee execution.
func NewRateLimitedTimedQueue(limiter flowcontrol.RateLimiter) *RateLimitedTimedQueue {
return &RateLimitedTimedQueue{
queue: UniqueQueue{
queue: TimedQueue{},
set: sets.NewString(),
},
limiter: limiter,
}
}
// ActionFunc takes a timed value and returns false if the item must
// be retried, with an optional time.Duration if some minimum wait
// interval should be used.
type ActionFunc func(TimedValue) (bool, time.Duration)
// Try processes the queue.Ends prematurely if RateLimiter forbids an
// action and leak is true. Otherwise, requeues the item to be
// processed. Each value is processed once if fn returns true,
// otherwise it is added back to the queue. The returned remaining is
// used to identify the minimum time to execute the next item in the
// queue. The same value is processed only once unless Remove is
// explicitly called on it (it's done by the cancelPodEviction
// function in NodeController when Node becomes Ready again) TODO:
// figure out a good way to do garbage collection for all Nodes that
// were removed from the cluster.
func (q *RateLimitedTimedQueue) Try(fn ActionFunc) {
val, ok := q.queue.Head()
q.limiterLock.Lock()
defer q.limiterLock.Unlock()
for ok {
// rate limit the queue checking
if !q.limiter.TryAccept() {
klog.V(10).Infof("Try rate limited for value: %v", val)
// Try again later
break
}
now := now()
if now.Before(val.ProcessAt) {
break
}
if ok, wait := fn(val); !ok {
val.ProcessAt = now.Add(wait + 1)
q.queue.Replace(val)
} else {
q.queue.RemoveFromQueue(val.Value)
}
val, ok = q.queue.Head()
}
}
// Add value to the queue to be processed. Won't add the same
// value(comparison by value) a second time if it was already added
// and not removed.
func (q *RateLimitedTimedQueue) Add(value string, uid interface{}) bool {
now := now()
return q.queue.Add(TimedValue{
Value: value,
UID: uid,
AddedAt: now,
ProcessAt: now,
})
}
// Remove Node from the Evictor. The Node won't be processed until
// added again.
func (q *RateLimitedTimedQueue) Remove(value string) bool {
return q.queue.Remove(value)
}
// Clear removes all items from the queue
func (q *RateLimitedTimedQueue) Clear() {
q.queue.Clear()
}
// SwapLimiter safely swaps current limiter for this queue with the
// passed one if capacities or qps's differ.
func (q *RateLimitedTimedQueue) SwapLimiter(newQPS float32) {
q.limiterLock.Lock()
defer q.limiterLock.Unlock()
if q.limiter.QPS() == newQPS {
return
}
var newLimiter flowcontrol.RateLimiter
if newQPS <= 0 {
newLimiter = flowcontrol.NewFakeNeverRateLimiter()
} else {
newLimiter = flowcontrol.NewTokenBucketRateLimiter(newQPS, EvictionRateLimiterBurst)
// If we're currently waiting on limiter, we drain the new one - this is a good approach when Burst value is 1
// TODO: figure out if we need to support higher Burst values and decide on the drain logic, should we keep:
// - saturation (percentage of used tokens)
// - number of used tokens
// - number of available tokens
// - something else
if q.limiter.TryAccept() == false {
newLimiter.TryAccept()
}
}
q.limiter.Stop()
q.limiter = newLimiter
}

View File

@ -0,0 +1,500 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"fmt"
"hash/fnv"
"io"
"sync"
"time"
"k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
"k8s.io/kubernetes/pkg/apis/core/helper"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/klog"
)
const (
// TODO (k82cn): Figure out a reasonable number of workers/channels and propagate
// the number of workers up making it a paramater of Run() function.
// NodeUpdateChannelSize defines the size of channel for node update events.
NodeUpdateChannelSize = 10
// UpdateWorkerSize defines the size of workers for node update or/and pod update.
UpdateWorkerSize = 8
podUpdateChannelSize = 1
retries = 5
)
type nodeUpdateItem struct {
nodeName string
}
type podUpdateItem struct {
podName string
podNamespace string
nodeName string
}
func hash(val string, max int) int {
hasher := fnv.New32a()
io.WriteString(hasher, val)
return int(hasher.Sum32() % uint32(max))
}
// GetPodFunc returns the pod for the specified name/namespace, or a NotFound error if missing.
type GetPodFunc func(name, namespace string) (*v1.Pod, error)
// GetNodeFunc returns the node for the specified name, or a NotFound error if missing.
type GetNodeFunc func(name string) (*v1.Node, error)
// NoExecuteTaintManager listens to Taint/Toleration changes and is responsible for removing Pods
// from Nodes tainted with NoExecute Taints.
type NoExecuteTaintManager struct {
client clientset.Interface
recorder record.EventRecorder
getPod GetPodFunc
getNode GetNodeFunc
taintEvictionQueue *TimedWorkerQueue
// keeps a map from nodeName to all noExecute taints on that Node
taintedNodesLock sync.Mutex
taintedNodes map[string][]v1.Taint
nodeUpdateChannels []chan nodeUpdateItem
podUpdateChannels []chan podUpdateItem
nodeUpdateQueue workqueue.Interface
podUpdateQueue workqueue.Interface
}
func deletePodHandler(c clientset.Interface, emitEventFunc func(types.NamespacedName)) func(args *WorkArgs) error {
return func(args *WorkArgs) error {
ns := args.NamespacedName.Namespace
name := args.NamespacedName.Name
klog.V(0).Infof("NoExecuteTaintManager is deleting Pod: %v", args.NamespacedName.String())
if emitEventFunc != nil {
emitEventFunc(args.NamespacedName)
}
var err error
for i := 0; i < retries; i++ {
err = c.CoreV1().Pods(ns).Delete(name, &metav1.DeleteOptions{})
if err == nil {
break
}
time.Sleep(10 * time.Millisecond)
}
return err
}
}
func getNoExecuteTaints(taints []v1.Taint) []v1.Taint {
result := []v1.Taint{}
for i := range taints {
if taints[i].Effect == v1.TaintEffectNoExecute {
result = append(result, taints[i])
}
}
return result
}
func getPodsAssignedToNode(c clientset.Interface, nodeName string) ([]v1.Pod, error) {
selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName})
pods, err := c.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
FieldSelector: selector.String(),
LabelSelector: labels.Everything().String(),
})
for i := 0; i < retries && err != nil; i++ {
pods, err = c.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{
FieldSelector: selector.String(),
LabelSelector: labels.Everything().String(),
})
time.Sleep(100 * time.Millisecond)
}
if err != nil {
return []v1.Pod{}, fmt.Errorf("failed to get Pods assigned to node %v", nodeName)
}
return pods.Items, nil
}
// getMinTolerationTime returns minimal toleration time from the given slice, or -1 if it's infinite.
func getMinTolerationTime(tolerations []v1.Toleration) time.Duration {
minTolerationTime := int64(-1)
if len(tolerations) == 0 {
return 0
}
for i := range tolerations {
if tolerations[i].TolerationSeconds != nil {
tolerationSeconds := *(tolerations[i].TolerationSeconds)
if tolerationSeconds <= 0 {
return 0
} else if tolerationSeconds < minTolerationTime || minTolerationTime == -1 {
minTolerationTime = tolerationSeconds
}
}
}
return time.Duration(minTolerationTime) * time.Second
}
// NewNoExecuteTaintManager creates a new NoExecuteTaintManager that will use passed clientset to
// communicate with the API server.
func NewNoExecuteTaintManager(c clientset.Interface, getPod GetPodFunc, getNode GetNodeFunc) *NoExecuteTaintManager {
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "taint-controller"})
eventBroadcaster.StartLogging(klog.Infof)
if c != nil {
klog.V(0).Infof("Sending events to api server.")
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: c.CoreV1().Events("")})
} else {
klog.Fatalf("kubeClient is nil when starting NodeController")
}
tm := &NoExecuteTaintManager{
client: c,
recorder: recorder,
getPod: getPod,
getNode: getNode,
taintedNodes: make(map[string][]v1.Taint),
nodeUpdateQueue: workqueue.NewNamed("noexec_taint_node"),
podUpdateQueue: workqueue.NewNamed("noexec_taint_pod"),
}
tm.taintEvictionQueue = CreateWorkerQueue(deletePodHandler(c, tm.emitPodDeletionEvent))
return tm
}
// Run starts NoExecuteTaintManager which will run in loop until `stopCh` is closed.
func (tc *NoExecuteTaintManager) Run(stopCh <-chan struct{}) {
klog.V(0).Infof("Starting NoExecuteTaintManager")
for i := 0; i < UpdateWorkerSize; i++ {
tc.nodeUpdateChannels = append(tc.nodeUpdateChannels, make(chan nodeUpdateItem, NodeUpdateChannelSize))
tc.podUpdateChannels = append(tc.podUpdateChannels, make(chan podUpdateItem, podUpdateChannelSize))
}
// Functions that are responsible for taking work items out of the workqueues and putting them
// into channels.
go func(stopCh <-chan struct{}) {
for {
item, shutdown := tc.nodeUpdateQueue.Get()
if shutdown {
break
}
nodeUpdate := item.(nodeUpdateItem)
hash := hash(nodeUpdate.nodeName, UpdateWorkerSize)
select {
case <-stopCh:
tc.nodeUpdateQueue.Done(item)
return
case tc.nodeUpdateChannels[hash] <- nodeUpdate:
// tc.nodeUpdateQueue.Done is called by the nodeUpdateChannels worker
}
}
}(stopCh)
go func(stopCh <-chan struct{}) {
for {
item, shutdown := tc.podUpdateQueue.Get()
if shutdown {
break
}
podUpdate := item.(podUpdateItem)
hash := hash(podUpdate.nodeName, UpdateWorkerSize)
select {
case <-stopCh:
tc.podUpdateQueue.Done(item)
return
case tc.podUpdateChannels[hash] <- podUpdate:
// tc.podUpdateQueue.Done is called by the podUpdateChannels worker
}
}
}(stopCh)
wg := sync.WaitGroup{}
wg.Add(UpdateWorkerSize)
for i := 0; i < UpdateWorkerSize; i++ {
go tc.worker(i, wg.Done, stopCh)
}
wg.Wait()
}
func (tc *NoExecuteTaintManager) worker(worker int, done func(), stopCh <-chan struct{}) {
defer done()
// When processing events we want to prioritize Node updates over Pod updates,
// as NodeUpdates that interest NoExecuteTaintManager should be handled as soon as possible -
// we don't want user (or system) to wait until PodUpdate queue is drained before it can
// start evicting Pods from tainted Nodes.
for {
select {
case <-stopCh:
return
case nodeUpdate := <-tc.nodeUpdateChannels[worker]:
tc.handleNodeUpdate(nodeUpdate)
tc.nodeUpdateQueue.Done(nodeUpdate)
case podUpdate := <-tc.podUpdateChannels[worker]:
// If we found a Pod update we need to empty Node queue first.
priority:
for {
select {
case nodeUpdate := <-tc.nodeUpdateChannels[worker]:
tc.handleNodeUpdate(nodeUpdate)
tc.nodeUpdateQueue.Done(nodeUpdate)
default:
break priority
}
}
// After Node queue is emptied we process podUpdate.
tc.handlePodUpdate(podUpdate)
tc.podUpdateQueue.Done(podUpdate)
}
}
}
// PodUpdated is used to notify NoExecuteTaintManager about Pod changes.
func (tc *NoExecuteTaintManager) PodUpdated(oldPod *v1.Pod, newPod *v1.Pod) {
podName := ""
podNamespace := ""
nodeName := ""
oldTolerations := []v1.Toleration{}
if oldPod != nil {
podName = oldPod.Name
podNamespace = oldPod.Namespace
nodeName = oldPod.Spec.NodeName
oldTolerations = oldPod.Spec.Tolerations
}
newTolerations := []v1.Toleration{}
if newPod != nil {
podName = newPod.Name
podNamespace = newPod.Namespace
nodeName = newPod.Spec.NodeName
newTolerations = newPod.Spec.Tolerations
}
if oldPod != nil && newPod != nil && helper.Semantic.DeepEqual(oldTolerations, newTolerations) && oldPod.Spec.NodeName == newPod.Spec.NodeName {
return
}
updateItem := podUpdateItem{
podName: podName,
podNamespace: podNamespace,
nodeName: nodeName,
}
tc.podUpdateQueue.Add(updateItem)
}
// NodeUpdated is used to notify NoExecuteTaintManager about Node changes.
func (tc *NoExecuteTaintManager) NodeUpdated(oldNode *v1.Node, newNode *v1.Node) {
nodeName := ""
oldTaints := []v1.Taint{}
if oldNode != nil {
nodeName = oldNode.Name
oldTaints = getNoExecuteTaints(oldNode.Spec.Taints)
}
newTaints := []v1.Taint{}
if newNode != nil {
nodeName = newNode.Name
newTaints = getNoExecuteTaints(newNode.Spec.Taints)
}
if oldNode != nil && newNode != nil && helper.Semantic.DeepEqual(oldTaints, newTaints) {
return
}
updateItem := nodeUpdateItem{
nodeName: nodeName,
}
tc.nodeUpdateQueue.Add(updateItem)
}
func (tc *NoExecuteTaintManager) cancelWorkWithEvent(nsName types.NamespacedName) {
if tc.taintEvictionQueue.CancelWork(nsName.String()) {
tc.emitCancelPodDeletionEvent(nsName)
}
}
func (tc *NoExecuteTaintManager) processPodOnNode(
podNamespacedName types.NamespacedName,
nodeName string,
tolerations []v1.Toleration,
taints []v1.Taint,
now time.Time,
) {
if len(taints) == 0 {
tc.cancelWorkWithEvent(podNamespacedName)
}
allTolerated, usedTolerations := v1helper.GetMatchingTolerations(taints, tolerations)
if !allTolerated {
klog.V(2).Infof("Not all taints are tolerated after update for Pod %v on %v", podNamespacedName.String(), nodeName)
// We're canceling scheduled work (if any), as we're going to delete the Pod right away.
tc.cancelWorkWithEvent(podNamespacedName)
tc.taintEvictionQueue.AddWork(NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), time.Now(), time.Now())
return
}
minTolerationTime := getMinTolerationTime(usedTolerations)
// getMinTolerationTime returns negative value to denote infinite toleration.
if minTolerationTime < 0 {
klog.V(4).Infof("New tolerations for %v tolerate forever. Scheduled deletion won't be cancelled if already scheduled.", podNamespacedName.String())
return
}
startTime := now
triggerTime := startTime.Add(minTolerationTime)
scheduledEviction := tc.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String())
if scheduledEviction != nil {
startTime = scheduledEviction.CreatedAt
if startTime.Add(minTolerationTime).Before(triggerTime) {
return
}
tc.cancelWorkWithEvent(podNamespacedName)
}
tc.taintEvictionQueue.AddWork(NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), startTime, triggerTime)
}
func (tc *NoExecuteTaintManager) handlePodUpdate(podUpdate podUpdateItem) {
pod, err := tc.getPod(podUpdate.podName, podUpdate.podNamespace)
if err != nil {
if apierrors.IsNotFound(err) {
// Delete
podNamespacedName := types.NamespacedName{Namespace: podUpdate.podNamespace, Name: podUpdate.podName}
klog.V(4).Infof("Noticed pod deletion: %#v", podNamespacedName)
tc.cancelWorkWithEvent(podNamespacedName)
return
}
utilruntime.HandleError(fmt.Errorf("could not get pod %s/%s: %v", podUpdate.podName, podUpdate.podNamespace, err))
return
}
// We key the workqueue and shard workers by nodeName. If we don't match the current state we should not be the one processing the current object.
if pod.Spec.NodeName != podUpdate.nodeName {
return
}
// Create or Update
podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}
klog.V(4).Infof("Noticed pod update: %#v", podNamespacedName)
nodeName := pod.Spec.NodeName
if nodeName == "" {
return
}
taints, ok := func() ([]v1.Taint, bool) {
tc.taintedNodesLock.Lock()
defer tc.taintedNodesLock.Unlock()
taints, ok := tc.taintedNodes[nodeName]
return taints, ok
}()
// It's possible that Node was deleted, or Taints were removed before, which triggered
// eviction cancelling if it was needed.
if !ok {
return
}
tc.processPodOnNode(podNamespacedName, nodeName, pod.Spec.Tolerations, taints, time.Now())
}
func (tc *NoExecuteTaintManager) handleNodeUpdate(nodeUpdate nodeUpdateItem) {
node, err := tc.getNode(nodeUpdate.nodeName)
if err != nil {
if apierrors.IsNotFound(err) {
// Delete
klog.V(4).Infof("Noticed node deletion: %#v", nodeUpdate.nodeName)
tc.taintedNodesLock.Lock()
defer tc.taintedNodesLock.Unlock()
delete(tc.taintedNodes, nodeUpdate.nodeName)
return
}
utilruntime.HandleError(fmt.Errorf("cannot get node %s: %v", nodeUpdate.nodeName, err))
return
}
// Create or Update
klog.V(4).Infof("Noticed node update: %#v", nodeUpdate)
taints := getNoExecuteTaints(node.Spec.Taints)
func() {
tc.taintedNodesLock.Lock()
defer tc.taintedNodesLock.Unlock()
klog.V(4).Infof("Updating known taints on node %v: %v", node.Name, taints)
if len(taints) == 0 {
delete(tc.taintedNodes, node.Name)
} else {
tc.taintedNodes[node.Name] = taints
}
}()
pods, err := getPodsAssignedToNode(tc.client, node.Name)
if err != nil {
klog.Errorf(err.Error())
return
}
if len(pods) == 0 {
return
}
// Short circuit, to make this controller a bit faster.
if len(taints) == 0 {
klog.V(4).Infof("All taints were removed from the Node %v. Cancelling all evictions...", node.Name)
for i := range pods {
tc.cancelWorkWithEvent(types.NamespacedName{Namespace: pods[i].Namespace, Name: pods[i].Name})
}
return
}
now := time.Now()
for i := range pods {
pod := &pods[i]
podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}
tc.processPodOnNode(podNamespacedName, node.Name, pod.Spec.Tolerations, taints, now)
}
}
func (tc *NoExecuteTaintManager) emitPodDeletionEvent(nsName types.NamespacedName) {
if tc.recorder == nil {
return
}
ref := &v1.ObjectReference{
Kind: "Pod",
Name: nsName.Name,
Namespace: nsName.Namespace,
}
tc.recorder.Eventf(ref, v1.EventTypeNormal, "TaintManagerEviction", "Marking for deletion Pod %s", nsName.String())
}
func (tc *NoExecuteTaintManager) emitCancelPodDeletionEvent(nsName types.NamespacedName) {
if tc.recorder == nil {
return
}
ref := &v1.ObjectReference{
Kind: "Pod",
Name: nsName.Name,
Namespace: nsName.Namespace,
}
tc.recorder.Eventf(ref, v1.EventTypeNormal, "TaintManagerEviction", "Cancelling deletion of Pod %s", nsName.String())
}

View File

@ -0,0 +1,145 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"sync"
"time"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog"
)
// WorkArgs keeps arguments that will be passed to the function executed by the worker.
type WorkArgs struct {
NamespacedName types.NamespacedName
}
// KeyFromWorkArgs creates a key for the given `WorkArgs`
func (w *WorkArgs) KeyFromWorkArgs() string {
return w.NamespacedName.String()
}
// NewWorkArgs is a helper function to create new `WorkArgs`
func NewWorkArgs(name, namespace string) *WorkArgs {
return &WorkArgs{types.NamespacedName{Namespace: namespace, Name: name}}
}
// TimedWorker is a responsible for executing a function no earlier than at FireAt time.
type TimedWorker struct {
WorkItem *WorkArgs
CreatedAt time.Time
FireAt time.Time
Timer *time.Timer
}
// CreateWorker creates a TimedWorker that will execute `f` not earlier than `fireAt`.
func CreateWorker(args *WorkArgs, createdAt time.Time, fireAt time.Time, f func(args *WorkArgs) error) *TimedWorker {
delay := fireAt.Sub(createdAt)
if delay <= 0 {
go f(args)
return nil
}
timer := time.AfterFunc(delay, func() { f(args) })
return &TimedWorker{
WorkItem: args,
CreatedAt: createdAt,
FireAt: fireAt,
Timer: timer,
}
}
// Cancel cancels the execution of function by the `TimedWorker`
func (w *TimedWorker) Cancel() {
if w != nil {
w.Timer.Stop()
}
}
// TimedWorkerQueue keeps a set of TimedWorkers that are still wait for execution.
type TimedWorkerQueue struct {
sync.Mutex
// map of workers keyed by string returned by 'KeyFromWorkArgs' from the given worker.
workers map[string]*TimedWorker
workFunc func(args *WorkArgs) error
}
// CreateWorkerQueue creates a new TimedWorkerQueue for workers that will execute
// given function `f`.
func CreateWorkerQueue(f func(args *WorkArgs) error) *TimedWorkerQueue {
return &TimedWorkerQueue{
workers: make(map[string]*TimedWorker),
workFunc: f,
}
}
func (q *TimedWorkerQueue) getWrappedWorkerFunc(key string) func(args *WorkArgs) error {
return func(args *WorkArgs) error {
err := q.workFunc(args)
q.Lock()
defer q.Unlock()
if err == nil {
// To avoid duplicated calls we keep the key in the queue, to prevent
// subsequent additions.
q.workers[key] = nil
} else {
delete(q.workers, key)
}
return err
}
}
// AddWork adds a work to the WorkerQueue which will be executed not earlier than `fireAt`.
func (q *TimedWorkerQueue) AddWork(args *WorkArgs, createdAt time.Time, fireAt time.Time) {
key := args.KeyFromWorkArgs()
klog.V(4).Infof("Adding TimedWorkerQueue item %v at %v to be fired at %v", key, createdAt, fireAt)
q.Lock()
defer q.Unlock()
if _, exists := q.workers[key]; exists {
klog.Warningf("Trying to add already existing work for %+v. Skipping.", args)
return
}
worker := CreateWorker(args, createdAt, fireAt, q.getWrappedWorkerFunc(key))
q.workers[key] = worker
}
// CancelWork removes scheduled function execution from the queue. Returns true if work was cancelled.
func (q *TimedWorkerQueue) CancelWork(key string) bool {
q.Lock()
defer q.Unlock()
worker, found := q.workers[key]
result := false
if found {
klog.V(4).Infof("Cancelling TimedWorkerQueue item %v at %v", key, time.Now())
if worker != nil {
result = true
worker.Cancel()
}
delete(q.workers, key)
}
return result
}
// GetWorkerUnsafe returns a TimedWorker corresponding to the given key.
// Unsafe method - workers have attached goroutines which can fire afater this function is called.
func (q *TimedWorkerQueue) GetWorkerUnsafe(key string) *TimedWorker {
q.Lock()
defer q.Unlock()
return q.workers[key]
}

19
vendor/k8s.io/kubernetes/pkg/controller/service/doc.go generated vendored Normal file
View File

@ -0,0 +1,19 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package service contains code for syncing cloud load balancers
// with the service registry.
package service // import "k8s.io/kubernetes/pkg/controller/service"

View File

@ -0,0 +1,769 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package service
import (
"context"
"fmt"
"sync"
"time"
"reflect"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
coreinformers "k8s.io/client-go/informers/core/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
cloudprovider "k8s.io/cloud-provider"
"k8s.io/klog"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/controller"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/util/metrics"
)
const (
// Interval of synchronizing service status from apiserver
serviceSyncPeriod = 30 * time.Second
// Interval of synchronizing node status from apiserver
nodeSyncPeriod = 100 * time.Second
// How long to wait before retrying the processing of a service change.
// If this changes, the sleep in hack/jenkins/e2e.sh before downing a cluster
// should be changed appropriately.
minRetryDelay = 5 * time.Second
maxRetryDelay = 300 * time.Second
clientRetryCount = 5
clientRetryInterval = 5 * time.Second
// LabelNodeRoleMaster specifies that a node is a master
// It's copied over to kubeadm until it's merged in core: https://github.com/kubernetes/kubernetes/pull/39112
LabelNodeRoleMaster = "node-role.kubernetes.io/master"
// LabelNodeRoleExcludeBalancer specifies that the node should be
// exclude from load balancers created by a cloud provider.
LabelNodeRoleExcludeBalancer = "alpha.service-controller.kubernetes.io/exclude-balancer"
)
type cachedService struct {
// The cached state of the service
state *v1.Service
}
type serviceCache struct {
mu sync.Mutex // protects serviceMap
serviceMap map[string]*cachedService
}
// ServiceController keeps cloud provider service resources
// (like load balancers) in sync with the registry.
type ServiceController struct {
cloud cloudprovider.Interface
knownHosts []*v1.Node
servicesToUpdate []*v1.Service
kubeClient clientset.Interface
clusterName string
balancer cloudprovider.LoadBalancer
cache *serviceCache
serviceLister corelisters.ServiceLister
serviceListerSynced cache.InformerSynced
eventBroadcaster record.EventBroadcaster
eventRecorder record.EventRecorder
nodeLister corelisters.NodeLister
nodeListerSynced cache.InformerSynced
// services that need to be synced
queue workqueue.RateLimitingInterface
}
// New returns a new service controller to keep cloud provider service resources
// (like load balancers) in sync with the registry.
func New(
cloud cloudprovider.Interface,
kubeClient clientset.Interface,
serviceInformer coreinformers.ServiceInformer,
nodeInformer coreinformers.NodeInformer,
clusterName string,
) (*ServiceController, error) {
broadcaster := record.NewBroadcaster()
broadcaster.StartLogging(klog.Infof)
broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
recorder := broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "service-controller"})
if kubeClient != nil && kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil {
if err := metrics.RegisterMetricAndTrackRateLimiterUsage("service_controller", kubeClient.CoreV1().RESTClient().GetRateLimiter()); err != nil {
return nil, err
}
}
s := &ServiceController{
cloud: cloud,
knownHosts: []*v1.Node{},
kubeClient: kubeClient,
clusterName: clusterName,
cache: &serviceCache{serviceMap: make(map[string]*cachedService)},
eventBroadcaster: broadcaster,
eventRecorder: recorder,
nodeLister: nodeInformer.Lister(),
nodeListerSynced: nodeInformer.Informer().HasSynced,
queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(minRetryDelay, maxRetryDelay), "service"),
}
serviceInformer.Informer().AddEventHandlerWithResyncPeriod(
cache.ResourceEventHandlerFuncs{
AddFunc: s.enqueueService,
UpdateFunc: func(old, cur interface{}) {
oldSvc, ok1 := old.(*v1.Service)
curSvc, ok2 := cur.(*v1.Service)
if ok1 && ok2 && s.needsUpdate(oldSvc, curSvc) {
s.enqueueService(cur)
}
},
DeleteFunc: s.enqueueService,
},
serviceSyncPeriod,
)
s.serviceLister = serviceInformer.Lister()
s.serviceListerSynced = serviceInformer.Informer().HasSynced
if err := s.init(); err != nil {
return nil, err
}
return s, nil
}
// obj could be an *v1.Service, or a DeletionFinalStateUnknown marker item.
func (s *ServiceController) enqueueService(obj interface{}) {
key, err := controller.KeyFunc(obj)
if err != nil {
klog.Errorf("Couldn't get key for object %#v: %v", obj, err)
return
}
s.queue.Add(key)
}
// Run starts a background goroutine that watches for changes to services that
// have (or had) LoadBalancers=true and ensures that they have
// load balancers created and deleted appropriately.
// serviceSyncPeriod controls how often we check the cluster's services to
// ensure that the correct load balancers exist.
// nodeSyncPeriod controls how often we check the cluster's nodes to determine
// if load balancers need to be updated to point to a new set.
//
// It's an error to call Run() more than once for a given ServiceController
// object.
func (s *ServiceController) Run(stopCh <-chan struct{}, workers int) {
defer runtime.HandleCrash()
defer s.queue.ShutDown()
klog.Info("Starting service controller")
defer klog.Info("Shutting down service controller")
if !controller.WaitForCacheSync("service", stopCh, s.serviceListerSynced, s.nodeListerSynced) {
return
}
for i := 0; i < workers; i++ {
go wait.Until(s.worker, time.Second, stopCh)
}
go wait.Until(s.nodeSyncLoop, nodeSyncPeriod, stopCh)
<-stopCh
}
// worker runs a worker thread that just dequeues items, processes them, and marks them done.
// It enforces that the syncHandler is never invoked concurrently with the same key.
func (s *ServiceController) worker() {
for s.processNextWorkItem() {
}
}
func (s *ServiceController) processNextWorkItem() bool {
key, quit := s.queue.Get()
if quit {
return false
}
defer s.queue.Done(key)
err := s.syncService(key.(string))
if err == nil {
s.queue.Forget(key)
return true
}
runtime.HandleError(fmt.Errorf("error processing service %v (will retry): %v", key, err))
s.queue.AddRateLimited(key)
return true
}
func (s *ServiceController) init() error {
if s.cloud == nil {
return fmt.Errorf("WARNING: no cloud provider provided, services of type LoadBalancer will fail")
}
balancer, ok := s.cloud.LoadBalancer()
if !ok {
return fmt.Errorf("the cloud provider does not support external load balancers")
}
s.balancer = balancer
return nil
}
// processServiceUpdate operates loadbalancers for the incoming service accordingly.
// Returns an error if processing the service update failed.
func (s *ServiceController) processServiceUpdate(cachedService *cachedService, service *v1.Service, key string) error {
if cachedService.state != nil {
if cachedService.state.UID != service.UID {
err := s.processLoadBalancerDelete(cachedService, key)
if err != nil {
return err
}
}
}
// cache the service, we need the info for service deletion
cachedService.state = service
err := s.createLoadBalancerIfNeeded(key, service)
if err != nil {
eventType := "CreatingLoadBalancerFailed"
message := "Error creating load balancer (will retry): "
if !wantsLoadBalancer(service) {
eventType = "CleanupLoadBalancerFailed"
message = "Error cleaning up load balancer (will retry): "
}
message += err.Error()
s.eventRecorder.Event(service, v1.EventTypeWarning, eventType, message)
return err
}
// Always update the cache upon success.
// NOTE: Since we update the cached service if and only if we successfully
// processed it, a cached service being nil implies that it hasn't yet
// been successfully processed.
s.cache.set(key, cachedService)
return nil
}
// createLoadBalancerIfNeeded ensures that service's status is synced up with loadbalancer
// i.e. creates loadbalancer for service if requested and deletes loadbalancer if the service
// doesn't want a loadbalancer no more. Returns whatever error occurred.
func (s *ServiceController) createLoadBalancerIfNeeded(key string, service *v1.Service) error {
// Note: It is safe to just call EnsureLoadBalancer. But, on some clouds that requires a delete & create,
// which may involve service interruption. Also, we would like user-friendly events.
// Save the state so we can avoid a write if it doesn't change
previousState := v1helper.LoadBalancerStatusDeepCopy(&service.Status.LoadBalancer)
var newState *v1.LoadBalancerStatus
var err error
if !wantsLoadBalancer(service) {
_, exists, err := s.balancer.GetLoadBalancer(context.TODO(), s.clusterName, service)
if err != nil {
return fmt.Errorf("error getting LB for service %s: %v", key, err)
}
if exists {
klog.Infof("Deleting existing load balancer for service %s that no longer needs a load balancer.", key)
s.eventRecorder.Event(service, v1.EventTypeNormal, "DeletingLoadBalancer", "Deleting load balancer")
if err := s.balancer.EnsureLoadBalancerDeleted(context.TODO(), s.clusterName, service); err != nil {
return err
}
s.eventRecorder.Event(service, v1.EventTypeNormal, "DeletedLoadBalancer", "Deleted load balancer")
}
newState = &v1.LoadBalancerStatus{}
} else {
klog.V(2).Infof("Ensuring LB for service %s", key)
// TODO: We could do a dry-run here if wanted to avoid the spurious cloud-calls & events when we restart
s.eventRecorder.Event(service, v1.EventTypeNormal, "EnsuringLoadBalancer", "Ensuring load balancer")
newState, err = s.ensureLoadBalancer(service)
if err != nil {
return fmt.Errorf("failed to ensure load balancer for service %s: %v", key, err)
}
s.eventRecorder.Event(service, v1.EventTypeNormal, "EnsuredLoadBalancer", "Ensured load balancer")
}
// Write the state if changed
// TODO: Be careful here ... what if there were other changes to the service?
if !v1helper.LoadBalancerStatusEqual(previousState, newState) {
// Make a copy so we don't mutate the shared informer cache
service = service.DeepCopy()
// Update the status on the copy
service.Status.LoadBalancer = *newState
if err := s.persistUpdate(service); err != nil {
// TODO: This logic needs to be revisited. We might want to retry on all the errors, not just conflicts.
if errors.IsConflict(err) {
return fmt.Errorf("not persisting update to service '%s/%s' that has been changed since we received it: %v", service.Namespace, service.Name, err)
}
runtime.HandleError(fmt.Errorf("failed to persist service %q updated status to apiserver, even after retries. Giving up: %v", key, err))
return nil
}
} else {
klog.V(2).Infof("Not persisting unchanged LoadBalancerStatus for service %s to registry.", key)
}
return nil
}
func (s *ServiceController) persistUpdate(service *v1.Service) error {
var err error
for i := 0; i < clientRetryCount; i++ {
_, err = s.kubeClient.CoreV1().Services(service.Namespace).UpdateStatus(service)
if err == nil {
return nil
}
// If the object no longer exists, we don't want to recreate it. Just bail
// out so that we can process the delete, which we should soon be receiving
// if we haven't already.
if errors.IsNotFound(err) {
klog.Infof("Not persisting update to service '%s/%s' that no longer exists: %v",
service.Namespace, service.Name, err)
return nil
}
// TODO: Try to resolve the conflict if the change was unrelated to load
// balancer status. For now, just pass it up the stack.
if errors.IsConflict(err) {
return err
}
klog.Warningf("Failed to persist updated LoadBalancerStatus to service '%s/%s' after creating its load balancer: %v",
service.Namespace, service.Name, err)
time.Sleep(clientRetryInterval)
}
return err
}
func (s *ServiceController) ensureLoadBalancer(service *v1.Service) (*v1.LoadBalancerStatus, error) {
nodes, err := s.nodeLister.ListWithPredicate(getNodeConditionPredicate())
if err != nil {
return nil, err
}
// If there are no available nodes for LoadBalancer service, make a EventTypeWarning event for it.
if len(nodes) == 0 {
s.eventRecorder.Eventf(service, v1.EventTypeWarning, "UnAvailableLoadBalancer", "There are no available nodes for LoadBalancer service %s/%s", service.Namespace, service.Name)
}
// - Only one protocol supported per service
// - Not all cloud providers support all protocols and the next step is expected to return
// an error for unsupported protocols
return s.balancer.EnsureLoadBalancer(context.TODO(), s.clusterName, service, nodes)
}
// ListKeys implements the interface required by DeltaFIFO to list the keys we
// already know about.
func (s *serviceCache) ListKeys() []string {
s.mu.Lock()
defer s.mu.Unlock()
keys := make([]string, 0, len(s.serviceMap))
for k := range s.serviceMap {
keys = append(keys, k)
}
return keys
}
// GetByKey returns the value stored in the serviceMap under the given key
func (s *serviceCache) GetByKey(key string) (interface{}, bool, error) {
s.mu.Lock()
defer s.mu.Unlock()
if v, ok := s.serviceMap[key]; ok {
return v, true, nil
}
return nil, false, nil
}
// ListKeys implements the interface required by DeltaFIFO to list the keys we
// already know about.
func (s *serviceCache) allServices() []*v1.Service {
s.mu.Lock()
defer s.mu.Unlock()
services := make([]*v1.Service, 0, len(s.serviceMap))
for _, v := range s.serviceMap {
services = append(services, v.state)
}
return services
}
func (s *serviceCache) get(serviceName string) (*cachedService, bool) {
s.mu.Lock()
defer s.mu.Unlock()
service, ok := s.serviceMap[serviceName]
return service, ok
}
func (s *serviceCache) getOrCreate(serviceName string) *cachedService {
s.mu.Lock()
defer s.mu.Unlock()
service, ok := s.serviceMap[serviceName]
if !ok {
service = &cachedService{}
s.serviceMap[serviceName] = service
}
return service
}
func (s *serviceCache) set(serviceName string, service *cachedService) {
s.mu.Lock()
defer s.mu.Unlock()
s.serviceMap[serviceName] = service
}
func (s *serviceCache) delete(serviceName string) {
s.mu.Lock()
defer s.mu.Unlock()
delete(s.serviceMap, serviceName)
}
func (s *ServiceController) needsUpdate(oldService *v1.Service, newService *v1.Service) bool {
if !wantsLoadBalancer(oldService) && !wantsLoadBalancer(newService) {
return false
}
if wantsLoadBalancer(oldService) != wantsLoadBalancer(newService) {
s.eventRecorder.Eventf(newService, v1.EventTypeNormal, "Type", "%v -> %v",
oldService.Spec.Type, newService.Spec.Type)
return true
}
if wantsLoadBalancer(newService) && !reflect.DeepEqual(oldService.Spec.LoadBalancerSourceRanges, newService.Spec.LoadBalancerSourceRanges) {
s.eventRecorder.Eventf(newService, v1.EventTypeNormal, "LoadBalancerSourceRanges", "%v -> %v",
oldService.Spec.LoadBalancerSourceRanges, newService.Spec.LoadBalancerSourceRanges)
return true
}
if !portsEqualForLB(oldService, newService) || oldService.Spec.SessionAffinity != newService.Spec.SessionAffinity {
return true
}
if !loadBalancerIPsAreEqual(oldService, newService) {
s.eventRecorder.Eventf(newService, v1.EventTypeNormal, "LoadbalancerIP", "%v -> %v",
oldService.Spec.LoadBalancerIP, newService.Spec.LoadBalancerIP)
return true
}
if len(oldService.Spec.ExternalIPs) != len(newService.Spec.ExternalIPs) {
s.eventRecorder.Eventf(newService, v1.EventTypeNormal, "ExternalIP", "Count: %v -> %v",
len(oldService.Spec.ExternalIPs), len(newService.Spec.ExternalIPs))
return true
}
for i := range oldService.Spec.ExternalIPs {
if oldService.Spec.ExternalIPs[i] != newService.Spec.ExternalIPs[i] {
s.eventRecorder.Eventf(newService, v1.EventTypeNormal, "ExternalIP", "Added: %v",
newService.Spec.ExternalIPs[i])
return true
}
}
if !reflect.DeepEqual(oldService.Annotations, newService.Annotations) {
return true
}
if oldService.UID != newService.UID {
s.eventRecorder.Eventf(newService, v1.EventTypeNormal, "UID", "%v -> %v",
oldService.UID, newService.UID)
return true
}
if oldService.Spec.ExternalTrafficPolicy != newService.Spec.ExternalTrafficPolicy {
s.eventRecorder.Eventf(newService, v1.EventTypeNormal, "ExternalTrafficPolicy", "%v -> %v",
oldService.Spec.ExternalTrafficPolicy, newService.Spec.ExternalTrafficPolicy)
return true
}
if oldService.Spec.HealthCheckNodePort != newService.Spec.HealthCheckNodePort {
s.eventRecorder.Eventf(newService, v1.EventTypeNormal, "HealthCheckNodePort", "%v -> %v",
oldService.Spec.HealthCheckNodePort, newService.Spec.HealthCheckNodePort)
return true
}
return false
}
func (s *ServiceController) loadBalancerName(service *v1.Service) string {
return s.balancer.GetLoadBalancerName(context.TODO(), "", service)
}
func getPortsForLB(service *v1.Service) ([]*v1.ServicePort, error) {
var protocol v1.Protocol
ports := []*v1.ServicePort{}
for i := range service.Spec.Ports {
sp := &service.Spec.Ports[i]
// The check on protocol was removed here. The cloud provider itself is now responsible for all protocol validation
ports = append(ports, sp)
if protocol == "" {
protocol = sp.Protocol
} else if protocol != sp.Protocol && wantsLoadBalancer(service) {
// TODO: Convert error messages to use event recorder
return nil, fmt.Errorf("mixed protocol external load balancers are not supported")
}
}
return ports, nil
}
func portsEqualForLB(x, y *v1.Service) bool {
xPorts, err := getPortsForLB(x)
if err != nil {
return false
}
yPorts, err := getPortsForLB(y)
if err != nil {
return false
}
return portSlicesEqualForLB(xPorts, yPorts)
}
func portSlicesEqualForLB(x, y []*v1.ServicePort) bool {
if len(x) != len(y) {
return false
}
for i := range x {
if !portEqualForLB(x[i], y[i]) {
return false
}
}
return true
}
func portEqualForLB(x, y *v1.ServicePort) bool {
// TODO: Should we check name? (In theory, an LB could expose it)
if x.Name != y.Name {
return false
}
if x.Protocol != y.Protocol {
return false
}
if x.Port != y.Port {
return false
}
if x.NodePort != y.NodePort {
return false
}
// We don't check TargetPort; that is not relevant for load balancing
// TODO: Should we blank it out? Or just check it anyway?
return true
}
func nodeNames(nodes []*v1.Node) sets.String {
ret := sets.NewString()
for _, node := range nodes {
ret.Insert(node.Name)
}
return ret
}
func nodeSlicesEqualForLB(x, y []*v1.Node) bool {
if len(x) != len(y) {
return false
}
return nodeNames(x).Equal(nodeNames(y))
}
func getNodeConditionPredicate() corelisters.NodeConditionPredicate {
return func(node *v1.Node) bool {
// We add the master to the node list, but its unschedulable. So we use this to filter
// the master.
if node.Spec.Unschedulable {
return false
}
// As of 1.6, we will taint the master, but not necessarily mark it unschedulable.
// Recognize nodes labeled as master, and filter them also, as we were doing previously.
if _, hasMasterRoleLabel := node.Labels[LabelNodeRoleMaster]; hasMasterRoleLabel {
return false
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.ServiceNodeExclusion) {
if _, hasExcludeBalancerLabel := node.Labels[LabelNodeRoleExcludeBalancer]; hasExcludeBalancerLabel {
return false
}
}
// If we have no info, don't accept
if len(node.Status.Conditions) == 0 {
return false
}
for _, cond := range node.Status.Conditions {
// We consider the node for load balancing only when its NodeReady condition status
// is ConditionTrue
if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue {
klog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
return false
}
}
return true
}
}
// nodeSyncLoop handles updating the hosts pointed to by all load
// balancers whenever the set of nodes in the cluster changes.
func (s *ServiceController) nodeSyncLoop() {
newHosts, err := s.nodeLister.ListWithPredicate(getNodeConditionPredicate())
if err != nil {
klog.Errorf("Failed to retrieve current set of nodes from node lister: %v", err)
return
}
if nodeSlicesEqualForLB(newHosts, s.knownHosts) {
// The set of nodes in the cluster hasn't changed, but we can retry
// updating any services that we failed to update last time around.
s.servicesToUpdate = s.updateLoadBalancerHosts(s.servicesToUpdate, newHosts)
return
}
klog.Infof("Detected change in list of current cluster nodes. New node set: %v",
nodeNames(newHosts))
// Try updating all services, and save the ones that fail to try again next
// round.
s.servicesToUpdate = s.cache.allServices()
numServices := len(s.servicesToUpdate)
s.servicesToUpdate = s.updateLoadBalancerHosts(s.servicesToUpdate, newHosts)
klog.Infof("Successfully updated %d out of %d load balancers to direct traffic to the updated set of nodes",
numServices-len(s.servicesToUpdate), numServices)
s.knownHosts = newHosts
}
// updateLoadBalancerHosts updates all existing load balancers so that
// they will match the list of hosts provided.
// Returns the list of services that couldn't be updated.
func (s *ServiceController) updateLoadBalancerHosts(services []*v1.Service, hosts []*v1.Node) (servicesToRetry []*v1.Service) {
for _, service := range services {
func() {
if service == nil {
return
}
if err := s.lockedUpdateLoadBalancerHosts(service, hosts); err != nil {
klog.Errorf("External error while updating load balancer: %v.", err)
servicesToRetry = append(servicesToRetry, service)
}
}()
}
return servicesToRetry
}
// Updates the load balancer of a service, assuming we hold the mutex
// associated with the service.
func (s *ServiceController) lockedUpdateLoadBalancerHosts(service *v1.Service, hosts []*v1.Node) error {
if !wantsLoadBalancer(service) {
return nil
}
// This operation doesn't normally take very long (and happens pretty often), so we only record the final event
err := s.balancer.UpdateLoadBalancer(context.TODO(), s.clusterName, service, hosts)
if err == nil {
// If there are no available nodes for LoadBalancer service, make a EventTypeWarning event for it.
if len(hosts) == 0 {
s.eventRecorder.Eventf(service, v1.EventTypeWarning, "UnAvailableLoadBalancer", "There are no available nodes for LoadBalancer service %s/%s", service.Namespace, service.Name)
} else {
s.eventRecorder.Event(service, v1.EventTypeNormal, "UpdatedLoadBalancer", "Updated load balancer with new hosts")
}
return nil
}
// It's only an actual error if the load balancer still exists.
if _, exists, err := s.balancer.GetLoadBalancer(context.TODO(), s.clusterName, service); err != nil {
klog.Errorf("External error while checking if load balancer %q exists: name, %v", s.balancer.GetLoadBalancerName(context.TODO(), s.clusterName, service), err)
} else if !exists {
return nil
}
s.eventRecorder.Eventf(service, v1.EventTypeWarning, "LoadBalancerUpdateFailed", "Error updating load balancer with new hosts %v: %v", nodeNames(hosts), err)
return err
}
func wantsLoadBalancer(service *v1.Service) bool {
return service.Spec.Type == v1.ServiceTypeLoadBalancer
}
func loadBalancerIPsAreEqual(oldService, newService *v1.Service) bool {
return oldService.Spec.LoadBalancerIP == newService.Spec.LoadBalancerIP
}
// syncService will sync the Service with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
func (s *ServiceController) syncService(key string) error {
startTime := time.Now()
var cachedService *cachedService
defer func() {
klog.V(4).Infof("Finished syncing service %q (%v)", key, time.Since(startTime))
}()
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return err
}
// service holds the latest service info from apiserver
service, err := s.serviceLister.Services(namespace).Get(name)
switch {
case errors.IsNotFound(err):
// service absence in store means watcher caught the deletion, ensure LB info is cleaned
klog.Infof("Service has been deleted %v. Attempting to cleanup load balancer resources", key)
err = s.processServiceDeletion(key)
case err != nil:
klog.Infof("Unable to retrieve service %v from store: %v", key, err)
default:
cachedService = s.cache.getOrCreate(key)
err = s.processServiceUpdate(cachedService, service, key)
}
return err
}
// Returns an error if processing the service deletion failed, along with a time.Duration
// indicating whether processing should be retried; zero means no-retry; otherwise
// we should retry after that Duration.
func (s *ServiceController) processServiceDeletion(key string) error {
cachedService, ok := s.cache.get(key)
if !ok {
klog.Errorf("service %s not in cache even though the watcher thought it was. Ignoring the deletion", key)
return nil
}
return s.processLoadBalancerDelete(cachedService, key)
}
func (s *ServiceController) processLoadBalancerDelete(cachedService *cachedService, key string) error {
service := cachedService.state
// delete load balancer info only if the service type is LoadBalancer
if !wantsLoadBalancer(service) {
return nil
}
s.eventRecorder.Event(service, v1.EventTypeNormal, "DeletingLoadBalancer", "Deleting load balancer")
err := s.balancer.EnsureLoadBalancerDeleted(context.TODO(), s.clusterName, service)
if err != nil {
s.eventRecorder.Eventf(service, v1.EventTypeWarning, "DeletingLoadBalancerFailed", "Error deleting load balancer (will retry): %v", err)
return err
}
s.eventRecorder.Event(service, v1.EventTypeNormal, "DeletedLoadBalancer", "Deleted load balancer")
s.cache.delete(key)
return nil
}

View File

@ -0,0 +1,293 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node
import (
"fmt"
"strings"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
appsv1listers "k8s.io/client-go/listers/apps/v1"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/kubelet/util/format"
nodepkg "k8s.io/kubernetes/pkg/util/node"
"k8s.io/klog"
)
// DeletePods will delete all pods from master running on given node,
// and return true if any pods were deleted, or were found pending
// deletion.
func DeletePods(kubeClient clientset.Interface, recorder record.EventRecorder, nodeName, nodeUID string, daemonStore appsv1listers.DaemonSetLister) (bool, error) {
remaining := false
selector := fields.OneTermEqualSelector(api.PodHostField, nodeName).String()
options := metav1.ListOptions{FieldSelector: selector}
pods, err := kubeClient.CoreV1().Pods(metav1.NamespaceAll).List(options)
var updateErrList []error
if err != nil {
return remaining, err
}
if len(pods.Items) > 0 {
RecordNodeEvent(recorder, nodeName, nodeUID, v1.EventTypeNormal, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeName))
}
for _, pod := range pods.Items {
// Defensive check, also needed for tests.
if pod.Spec.NodeName != nodeName {
continue
}
// Set reason and message in the pod object.
if _, err = SetPodTerminationReason(kubeClient, &pod, nodeName); err != nil {
if apierrors.IsConflict(err) {
updateErrList = append(updateErrList,
fmt.Errorf("update status failed for pod %q: %v", format.Pod(&pod), err))
continue
}
}
// if the pod has already been marked for deletion, we still return true that there are remaining pods.
if pod.DeletionGracePeriodSeconds != nil {
remaining = true
continue
}
// if the pod is managed by a daemonset, ignore it
_, err := daemonStore.GetPodDaemonSets(&pod)
if err == nil { // No error means at least one daemonset was found
continue
}
klog.V(2).Infof("Starting deletion of pod %v/%v", pod.Namespace, pod.Name)
recorder.Eventf(&pod, v1.EventTypeNormal, "NodeControllerEviction", "Marking for deletion Pod %s from Node %s", pod.Name, nodeName)
if err := kubeClient.CoreV1().Pods(pod.Namespace).Delete(pod.Name, nil); err != nil {
return false, err
}
remaining = true
}
if len(updateErrList) > 0 {
return false, utilerrors.NewAggregate(updateErrList)
}
return remaining, nil
}
// SetPodTerminationReason attempts to set a reason and message in the
// pod status, updates it in the apiserver, and returns an error if it
// encounters one.
func SetPodTerminationReason(kubeClient clientset.Interface, pod *v1.Pod, nodeName string) (*v1.Pod, error) {
if pod.Status.Reason == nodepkg.NodeUnreachablePodReason {
return pod, nil
}
pod.Status.Reason = nodepkg.NodeUnreachablePodReason
pod.Status.Message = fmt.Sprintf(nodepkg.NodeUnreachablePodMessage, nodeName, pod.Name)
var updatedPod *v1.Pod
var err error
if updatedPod, err = kubeClient.CoreV1().Pods(pod.Namespace).UpdateStatus(pod); err != nil {
return nil, err
}
return updatedPod, nil
}
// MarkAllPodsNotReady updates ready status of all pods running on
// given node from master return true if success
func MarkAllPodsNotReady(kubeClient clientset.Interface, node *v1.Node) error {
nodeName := node.Name
klog.V(2).Infof("Update ready status of pods on node [%v]", nodeName)
opts := metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector(api.PodHostField, nodeName).String()}
pods, err := kubeClient.CoreV1().Pods(metav1.NamespaceAll).List(opts)
if err != nil {
return err
}
errMsg := []string{}
for _, pod := range pods.Items {
// Defensive check, also needed for tests.
if pod.Spec.NodeName != nodeName {
continue
}
for i, cond := range pod.Status.Conditions {
if cond.Type == v1.PodReady {
pod.Status.Conditions[i].Status = v1.ConditionFalse
klog.V(2).Infof("Updating ready status of pod %v to false", pod.Name)
_, err := kubeClient.CoreV1().Pods(pod.Namespace).UpdateStatus(&pod)
if err != nil {
klog.Warningf("Failed to update status for pod %q: %v", format.Pod(&pod), err)
errMsg = append(errMsg, fmt.Sprintf("%v", err))
}
break
}
}
}
if len(errMsg) == 0 {
return nil
}
return fmt.Errorf("%v", strings.Join(errMsg, "; "))
}
// RecordNodeEvent records a event related to a node.
func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
ref := &v1.ObjectReference{
Kind: "Node",
Name: nodeName,
UID: types.UID(nodeUID),
Namespace: "",
}
klog.V(2).Infof("Recording %s event message for node %s", event, nodeName)
recorder.Eventf(ref, eventtype, reason, "Node %s event: %s", nodeName, event)
}
// RecordNodeStatusChange records a event related to a node status change. (Common to lifecycle and ipam)
func RecordNodeStatusChange(recorder record.EventRecorder, node *v1.Node, newStatus string) {
ref := &v1.ObjectReference{
Kind: "Node",
Name: node.Name,
UID: node.UID,
Namespace: "",
}
klog.V(2).Infof("Recording status change %s event message for node %s", newStatus, node.Name)
// TODO: This requires a transaction, either both node status is updated
// and event is recorded or neither should happen, see issue #6055.
recorder.Eventf(ref, v1.EventTypeNormal, newStatus, "Node %s status is now: %s", node.Name, newStatus)
}
// SwapNodeControllerTaint returns true in case of success and false
// otherwise.
func SwapNodeControllerTaint(kubeClient clientset.Interface, taintsToAdd, taintsToRemove []*v1.Taint, node *v1.Node) bool {
for _, taintToAdd := range taintsToAdd {
now := metav1.Now()
taintToAdd.TimeAdded = &now
}
err := controller.AddOrUpdateTaintOnNode(kubeClient, node.Name, taintsToAdd...)
if err != nil {
utilruntime.HandleError(
fmt.Errorf(
"unable to taint %+v unresponsive Node %q: %v",
taintsToAdd,
node.Name,
err))
return false
}
klog.V(4).Infof("Added %+v Taint to Node %v", taintsToAdd, node.Name)
err = controller.RemoveTaintOffNode(kubeClient, node.Name, node, taintsToRemove...)
if err != nil {
utilruntime.HandleError(
fmt.Errorf(
"unable to remove %+v unneeded taint from unresponsive Node %q: %v",
taintsToRemove,
node.Name,
err))
return false
}
klog.V(4).Infof("Made sure that Node %+v has no %v Taint", node.Name, taintsToRemove)
return true
}
// AddOrUpdateLabelsOnNode updates the labels on the node and returns true on
// success and false on failure.
func AddOrUpdateLabelsOnNode(kubeClient clientset.Interface, labelsToUpdate map[string]string, node *v1.Node) bool {
err := controller.AddOrUpdateLabelsOnNode(kubeClient, node.Name, labelsToUpdate)
if err != nil {
utilruntime.HandleError(
fmt.Errorf(
"unable to update labels %+v for Node %q: %v",
labelsToUpdate,
node.Name,
err))
return false
}
klog.V(4).Infof("Updated labels %+v to Node %v", labelsToUpdate, node.Name)
return true
}
// CreateAddNodeHandler creates an add node handler.
func CreateAddNodeHandler(f func(node *v1.Node) error) func(obj interface{}) {
return func(originalObj interface{}) {
node := originalObj.(*v1.Node).DeepCopy()
if err := f(node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error while processing Node Add: %v", err))
}
}
}
// CreateUpdateNodeHandler creates a node update handler. (Common to lifecycle and ipam)
func CreateUpdateNodeHandler(f func(oldNode, newNode *v1.Node) error) func(oldObj, newObj interface{}) {
return func(origOldObj, origNewObj interface{}) {
node := origNewObj.(*v1.Node).DeepCopy()
prevNode := origOldObj.(*v1.Node).DeepCopy()
if err := f(prevNode, node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error while processing Node Add/Delete: %v", err))
}
}
}
// CreateDeleteNodeHandler creates a delete node handler. (Common to lifecycle and ipam)
func CreateDeleteNodeHandler(f func(node *v1.Node) error) func(obj interface{}) {
return func(originalObj interface{}) {
originalNode, isNode := originalObj.(*v1.Node)
// We can get DeletedFinalStateUnknown instead of *v1.Node here and
// we need to handle that correctly. #34692
if !isNode {
deletedState, ok := originalObj.(cache.DeletedFinalStateUnknown)
if !ok {
klog.Errorf("Received unexpected object: %v", originalObj)
return
}
originalNode, ok = deletedState.Obj.(*v1.Node)
if !ok {
klog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
return
}
}
node := originalNode.DeepCopy()
if err := f(node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error while processing Node Add/Delete: %v", err))
}
}
}
// GetNodeCondition extracts the provided condition from the given status and returns that.
// Returns nil and -1 if the condition is not present, and the index of the located condition.
func GetNodeCondition(status *v1.NodeStatus, conditionType v1.NodeConditionType) (int, *v1.NodeCondition) {
if status == nil {
return -1, nil
}
for i := range status.Conditions {
if status.Conditions[i].Type == conditionType {
return i, &status.Conditions[i]
}
}
return -1, nil
}

View File

@ -0,0 +1,34 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package events
const (
// volume relevant event reasons
FailedBinding = "FailedBinding"
VolumeMismatch = "VolumeMismatch"
VolumeFailedRecycle = "VolumeFailedRecycle"
VolumeRecycled = "VolumeRecycled"
RecyclerPod = "RecyclerPod"
VolumeDelete = "VolumeDelete"
VolumeFailedDelete = "VolumeFailedDelete"
ExternalProvisioning = "ExternalProvisioning"
ProvisioningFailed = "ProvisioningFailed"
ProvisioningCleanupFailed = "ProvisioningCleanupFailed"
ProvisioningSucceeded = "ProvisioningSucceeded"
WaitForFirstConsumer = "WaitForFirstConsumer"
ExternalExpanding = "ExternalExpanding"
)

View File

@ -0,0 +1,380 @@
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"fmt"
"sort"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/cache"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
volumeutil "k8s.io/kubernetes/pkg/volume/util"
)
// persistentVolumeOrderedIndex is a cache.Store that keeps persistent volumes
// indexed by AccessModes and ordered by storage capacity.
type persistentVolumeOrderedIndex struct {
store cache.Indexer
}
func newPersistentVolumeOrderedIndex() persistentVolumeOrderedIndex {
return persistentVolumeOrderedIndex{cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{"accessmodes": accessModesIndexFunc})}
}
// accessModesIndexFunc is an indexing function that returns a persistent
// volume's AccessModes as a string
func accessModesIndexFunc(obj interface{}) ([]string, error) {
if pv, ok := obj.(*v1.PersistentVolume); ok {
modes := v1helper.GetAccessModesAsString(pv.Spec.AccessModes)
return []string{modes}, nil
}
return []string{""}, fmt.Errorf("object is not a persistent volume: %v", obj)
}
// listByAccessModes returns all volumes with the given set of
// AccessModeTypes. The list is unsorted!
func (pvIndex *persistentVolumeOrderedIndex) listByAccessModes(modes []v1.PersistentVolumeAccessMode) ([]*v1.PersistentVolume, error) {
pv := &v1.PersistentVolume{
Spec: v1.PersistentVolumeSpec{
AccessModes: modes,
},
}
objs, err := pvIndex.store.Index("accessmodes", pv)
if err != nil {
return nil, err
}
volumes := make([]*v1.PersistentVolume, len(objs))
for i, obj := range objs {
volumes[i] = obj.(*v1.PersistentVolume)
}
return volumes, nil
}
// find returns the nearest PV from the ordered list or nil if a match is not found
func (pvIndex *persistentVolumeOrderedIndex) findByClaim(claim *v1.PersistentVolumeClaim, delayBinding bool) (*v1.PersistentVolume, error) {
// PVs are indexed by their access modes to allow easier searching. Each
// index is the string representation of a set of access modes. There is a
// finite number of possible sets and PVs will only be indexed in one of
// them (whichever index matches the PV's modes).
//
// A request for resources will always specify its desired access modes.
// Any matching PV must have at least that number of access modes, but it
// can have more. For example, a user asks for ReadWriteOnce but a GCEPD
// is available, which is ReadWriteOnce+ReadOnlyMany.
//
// Searches are performed against a set of access modes, so we can attempt
// not only the exact matching modes but also potential matches (the GCEPD
// example above).
allPossibleModes := pvIndex.allPossibleMatchingAccessModes(claim.Spec.AccessModes)
for _, modes := range allPossibleModes {
volumes, err := pvIndex.listByAccessModes(modes)
if err != nil {
return nil, err
}
bestVol, err := findMatchingVolume(claim, volumes, nil /* node for topology binding*/, nil /* exclusion map */, delayBinding)
if err != nil {
return nil, err
}
if bestVol != nil {
return bestVol, nil
}
}
return nil, nil
}
// findMatchingVolume goes through the list of volumes to find the best matching volume
// for the claim.
//
// This function is used by both the PV controller and scheduler.
//
// delayBinding is true only in the PV controller path. When set, prebound PVs are still returned
// as a match for the claim, but unbound PVs are skipped.
//
// node is set only in the scheduler path. When set, the PV node affinity is checked against
// the node's labels.
//
// excludedVolumes is only used in the scheduler path, and is needed for evaluating multiple
// unbound PVCs for a single Pod at one time. As each PVC finds a matching PV, the chosen
// PV needs to be excluded from future matching.
func findMatchingVolume(
claim *v1.PersistentVolumeClaim,
volumes []*v1.PersistentVolume,
node *v1.Node,
excludedVolumes map[string]*v1.PersistentVolume,
delayBinding bool) (*v1.PersistentVolume, error) {
var smallestVolume *v1.PersistentVolume
var smallestVolumeQty resource.Quantity
requestedQty := claim.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
requestedClass := v1helper.GetPersistentVolumeClaimClass(claim)
var selector labels.Selector
if claim.Spec.Selector != nil {
internalSelector, err := metav1.LabelSelectorAsSelector(claim.Spec.Selector)
if err != nil {
// should be unreachable code due to validation
return nil, fmt.Errorf("error creating internal label selector for claim: %v: %v", claimToClaimKey(claim), err)
}
selector = internalSelector
}
// Go through all available volumes with two goals:
// - find a volume that is either pre-bound by user or dynamically
// provisioned for this claim. Because of this we need to loop through
// all volumes.
// - find the smallest matching one if there is no volume pre-bound to
// the claim.
for _, volume := range volumes {
if _, ok := excludedVolumes[volume.Name]; ok {
// Skip volumes in the excluded list
continue
}
volumeQty := volume.Spec.Capacity[v1.ResourceStorage]
// check if volumeModes do not match (feature gate protected)
isMismatch, err := checkVolumeModeMismatches(&claim.Spec, &volume.Spec)
if err != nil {
return nil, fmt.Errorf("error checking if volumeMode was a mismatch: %v", err)
}
// filter out mismatching volumeModes
if isMismatch {
continue
}
// check if PV's DeletionTimeStamp is set, if so, skip this volume.
if utilfeature.DefaultFeatureGate.Enabled(features.StorageObjectInUseProtection) {
if volume.ObjectMeta.DeletionTimestamp != nil {
continue
}
}
nodeAffinityValid := true
if node != nil {
// Scheduler path, check that the PV NodeAffinity
// is satisfied by the node
err := volumeutil.CheckNodeAffinity(volume, node.Labels)
if err != nil {
nodeAffinityValid = false
}
}
if IsVolumeBoundToClaim(volume, claim) {
// this claim and volume are pre-bound; return
// the volume if the size request is satisfied,
// otherwise continue searching for a match
if volumeQty.Cmp(requestedQty) < 0 {
continue
}
// If PV node affinity is invalid, return no match.
// This means the prebound PV (and therefore PVC)
// is not suitable for this node.
if !nodeAffinityValid {
return nil, nil
}
return volume, nil
}
if node == nil && delayBinding {
// PV controller does not bind this claim.
// Scheduler will handle binding unbound volumes
// Scheduler path will have node != nil
continue
}
// filter out:
// - volumes in non-available phase
// - volumes bound to another claim
// - volumes whose labels don't match the claim's selector, if specified
// - volumes in Class that is not requested
// - volumes whose NodeAffinity does not match the node
if volume.Status.Phase != v1.VolumeAvailable {
// We ignore volumes in non-available phase, because volumes that
// satisfies matching criteria will be updated to available, binding
// them now has high chance of encountering unnecessary failures
// due to API conflicts.
continue
} else if volume.Spec.ClaimRef != nil {
continue
} else if selector != nil && !selector.Matches(labels.Set(volume.Labels)) {
continue
}
if v1helper.GetPersistentVolumeClass(volume) != requestedClass {
continue
}
if !nodeAffinityValid {
continue
}
if node != nil {
// Scheduler path
// Check that the access modes match
if !checkAccessModes(claim, volume) {
continue
}
}
if volumeQty.Cmp(requestedQty) >= 0 {
if smallestVolume == nil || smallestVolumeQty.Cmp(volumeQty) > 0 {
smallestVolume = volume
smallestVolumeQty = volumeQty
}
}
}
if smallestVolume != nil {
// Found a matching volume
return smallestVolume, nil
}
return nil, nil
}
// checkVolumeModeMismatches is a convenience method that checks volumeMode for PersistentVolume
// and PersistentVolumeClaims
func checkVolumeModeMismatches(pvcSpec *v1.PersistentVolumeClaimSpec, pvSpec *v1.PersistentVolumeSpec) (bool, error) {
if !utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {
return false, nil
}
// In HA upgrades, we cannot guarantee that the apiserver is on a version >= controller-manager.
// So we default a nil volumeMode to filesystem
requestedVolumeMode := v1.PersistentVolumeFilesystem
if pvcSpec.VolumeMode != nil {
requestedVolumeMode = *pvcSpec.VolumeMode
}
pvVolumeMode := v1.PersistentVolumeFilesystem
if pvSpec.VolumeMode != nil {
pvVolumeMode = *pvSpec.VolumeMode
}
return requestedVolumeMode != pvVolumeMode, nil
}
// findBestMatchForClaim is a convenience method that finds a volume by the claim's AccessModes and requests for Storage
func (pvIndex *persistentVolumeOrderedIndex) findBestMatchForClaim(claim *v1.PersistentVolumeClaim, delayBinding bool) (*v1.PersistentVolume, error) {
return pvIndex.findByClaim(claim, delayBinding)
}
// allPossibleMatchingAccessModes returns an array of AccessMode arrays that
// can satisfy a user's requested modes.
//
// see comments in the Find func above regarding indexing.
//
// allPossibleMatchingAccessModes gets all stringified accessmodes from the
// index and returns all those that contain at least all of the requested
// mode.
//
// For example, assume the index contains 2 types of PVs where the stringified
// accessmodes are:
//
// "RWO,ROX" -- some number of GCEPDs
// "RWO,ROX,RWX" -- some number of NFS volumes
//
// A request for RWO could be satisfied by both sets of indexed volumes, so
// allPossibleMatchingAccessModes returns:
//
// [][]v1.PersistentVolumeAccessMode {
// []v1.PersistentVolumeAccessMode {
// v1.ReadWriteOnce, v1.ReadOnlyMany,
// },
// []v1.PersistentVolumeAccessMode {
// v1.ReadWriteOnce, v1.ReadOnlyMany, v1.ReadWriteMany,
// },
// }
//
// A request for RWX can be satisfied by only one set of indexed volumes, so
// the return is:
//
// [][]v1.PersistentVolumeAccessMode {
// []v1.PersistentVolumeAccessMode {
// v1.ReadWriteOnce, v1.ReadOnlyMany, v1.ReadWriteMany,
// },
// }
//
// This func returns modes with ascending levels of modes to give the user
// what is closest to what they actually asked for.
func (pvIndex *persistentVolumeOrderedIndex) allPossibleMatchingAccessModes(requestedModes []v1.PersistentVolumeAccessMode) [][]v1.PersistentVolumeAccessMode {
matchedModes := [][]v1.PersistentVolumeAccessMode{}
keys := pvIndex.store.ListIndexFuncValues("accessmodes")
for _, key := range keys {
indexedModes := v1helper.GetAccessModesFromString(key)
if volumeutil.AccessModesContainedInAll(indexedModes, requestedModes) {
matchedModes = append(matchedModes, indexedModes)
}
}
// sort by the number of modes in each array with the fewest number of
// modes coming first. this allows searching for volumes by the minimum
// number of modes required of the possible matches.
sort.Sort(byAccessModes{matchedModes})
return matchedModes
}
// byAccessModes is used to order access modes by size, with the fewest modes first
type byAccessModes struct {
modes [][]v1.PersistentVolumeAccessMode
}
func (c byAccessModes) Less(i, j int) bool {
return len(c.modes[i]) < len(c.modes[j])
}
func (c byAccessModes) Swap(i, j int) {
c.modes[i], c.modes[j] = c.modes[j], c.modes[i]
}
func (c byAccessModes) Len() int {
return len(c.modes)
}
func claimToClaimKey(claim *v1.PersistentVolumeClaim) string {
return fmt.Sprintf("%s/%s", claim.Namespace, claim.Name)
}
func claimrefToClaimKey(claimref *v1.ObjectReference) string {
return fmt.Sprintf("%s/%s", claimref.Namespace, claimref.Name)
}
// Returns true if PV satisfies all the PVC's requested AccessModes
func checkAccessModes(claim *v1.PersistentVolumeClaim, volume *v1.PersistentVolume) bool {
pvModesMap := map[v1.PersistentVolumeAccessMode]bool{}
for _, mode := range volume.Spec.AccessModes {
pvModesMap[mode] = true
}
for _, mode := range claim.Spec.AccessModes {
_, ok := pvModesMap[mode]
if !ok {
return false
}
}
return true
}

View File

@ -0,0 +1,211 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"k8s.io/api/core/v1"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/klog"
)
const (
// Subsystem names.
pvControllerSubsystem = "pv_collector"
// Metric names.
boundPVKey = "bound_pv_count"
unboundPVKey = "unbound_pv_count"
boundPVCKey = "bound_pvc_count"
unboundPVCKey = "unbound_pvc_count"
// Label names.
namespaceLabel = "namespace"
storageClassLabel = "storage_class"
)
var registerMetrics sync.Once
// PVLister used to list persistent volumes.
type PVLister interface {
List() []interface{}
}
// PVCLister used to list persistent volume claims.
type PVCLister interface {
List() []interface{}
}
// Register all metrics for pv controller.
func Register(pvLister PVLister, pvcLister PVCLister) {
registerMetrics.Do(func() {
prometheus.MustRegister(newPVAndPVCCountCollector(pvLister, pvcLister))
prometheus.MustRegister(volumeOperationMetric)
prometheus.MustRegister(volumeOperationErrorsMetric)
})
}
func newPVAndPVCCountCollector(pvLister PVLister, pvcLister PVCLister) *pvAndPVCCountCollector {
return &pvAndPVCCountCollector{pvLister, pvcLister}
}
// Custom collector for current pod and container counts.
type pvAndPVCCountCollector struct {
// Cache for accessing information about PersistentVolumes.
pvLister PVLister
// Cache for accessing information about PersistentVolumeClaims.
pvcLister PVCLister
}
var (
boundPVCountDesc = prometheus.NewDesc(
prometheus.BuildFQName("", pvControllerSubsystem, boundPVKey),
"Gauge measuring number of persistent volume currently bound",
[]string{storageClassLabel}, nil)
unboundPVCountDesc = prometheus.NewDesc(
prometheus.BuildFQName("", pvControllerSubsystem, unboundPVKey),
"Gauge measuring number of persistent volume currently unbound",
[]string{storageClassLabel}, nil)
boundPVCCountDesc = prometheus.NewDesc(
prometheus.BuildFQName("", pvControllerSubsystem, boundPVCKey),
"Gauge measuring number of persistent volume claim currently bound",
[]string{namespaceLabel}, nil)
unboundPVCCountDesc = prometheus.NewDesc(
prometheus.BuildFQName("", pvControllerSubsystem, unboundPVCKey),
"Gauge measuring number of persistent volume claim currently unbound",
[]string{namespaceLabel}, nil)
volumeOperationMetric = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "volume_operation_total_seconds",
Help: "Total volume operation time",
},
[]string{"plugin_name", "operation_name"})
volumeOperationErrorsMetric = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "volume_operation_total_errors",
Help: "Total volume operation erros",
},
[]string{"plugin_name", "operation_name"})
)
func (collector *pvAndPVCCountCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- boundPVCountDesc
ch <- unboundPVCountDesc
ch <- boundPVCCountDesc
ch <- unboundPVCCountDesc
}
func (collector *pvAndPVCCountCollector) Collect(ch chan<- prometheus.Metric) {
collector.pvCollect(ch)
collector.pvcCollect(ch)
}
func (collector *pvAndPVCCountCollector) pvCollect(ch chan<- prometheus.Metric) {
boundNumberByStorageClass := make(map[string]int)
unboundNumberByStorageClass := make(map[string]int)
for _, pvObj := range collector.pvLister.List() {
pv, ok := pvObj.(*v1.PersistentVolume)
if !ok {
continue
}
if pv.Status.Phase == v1.VolumeBound {
boundNumberByStorageClass[pv.Spec.StorageClassName]++
} else {
unboundNumberByStorageClass[pv.Spec.StorageClassName]++
}
}
for storageClassName, number := range boundNumberByStorageClass {
metric, err := prometheus.NewConstMetric(
boundPVCountDesc,
prometheus.GaugeValue,
float64(number),
storageClassName)
if err != nil {
klog.Warningf("Create bound pv number metric failed: %v", err)
continue
}
ch <- metric
}
for storageClassName, number := range unboundNumberByStorageClass {
metric, err := prometheus.NewConstMetric(
unboundPVCountDesc,
prometheus.GaugeValue,
float64(number),
storageClassName)
if err != nil {
klog.Warningf("Create unbound pv number metric failed: %v", err)
continue
}
ch <- metric
}
}
func (collector *pvAndPVCCountCollector) pvcCollect(ch chan<- prometheus.Metric) {
boundNumberByNamespace := make(map[string]int)
unboundNumberByNamespace := make(map[string]int)
for _, pvcObj := range collector.pvcLister.List() {
pvc, ok := pvcObj.(*v1.PersistentVolumeClaim)
if !ok {
continue
}
if pvc.Status.Phase == v1.ClaimBound {
boundNumberByNamespace[pvc.Namespace]++
} else {
unboundNumberByNamespace[pvc.Namespace]++
}
}
for namespace, number := range boundNumberByNamespace {
metric, err := prometheus.NewConstMetric(
boundPVCCountDesc,
prometheus.GaugeValue,
float64(number),
namespace)
if err != nil {
klog.Warningf("Create bound pvc number metric failed: %v", err)
continue
}
ch <- metric
}
for namespace, number := range unboundNumberByNamespace {
metric, err := prometheus.NewConstMetric(
unboundPVCCountDesc,
prometheus.GaugeValue,
float64(number),
namespace)
if err != nil {
klog.Warningf("Create unbound pvc number metric failed: %v", err)
continue
}
ch <- metric
}
}
// RecordVolumeOperationMetric records the latency and errors of volume operations.
func RecordVolumeOperationMetric(pluginName, opName string, timeTaken float64, err error) {
if pluginName == "" {
pluginName = "N/A"
}
if err != nil {
volumeOperationErrorsMetric.WithLabelValues(pluginName, opName).Inc()
return
}
volumeOperationMetric.WithLabelValues(pluginName, opName).Observe(timeTaken)
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,523 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"fmt"
"strconv"
"time"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
coreinformers "k8s.io/client-go/informers/core/v1"
storageinformers "k8s.io/client-go/informers/storage/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
cloudprovider "k8s.io/cloud-provider"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/controller/volume/persistentvolume/metrics"
"k8s.io/kubernetes/pkg/util/goroutinemap"
vol "k8s.io/kubernetes/pkg/volume"
"k8s.io/klog"
)
// This file contains the controller base functionality, i.e. framework to
// process PV/PVC added/updated/deleted events. The real binding, provisioning,
// recycling and deleting is done in pv_controller.go
// ControllerParameters contains arguments for creation of a new
// PersistentVolume controller.
type ControllerParameters struct {
KubeClient clientset.Interface
SyncPeriod time.Duration
VolumePlugins []vol.VolumePlugin
Cloud cloudprovider.Interface
ClusterName string
VolumeInformer coreinformers.PersistentVolumeInformer
ClaimInformer coreinformers.PersistentVolumeClaimInformer
ClassInformer storageinformers.StorageClassInformer
PodInformer coreinformers.PodInformer
NodeInformer coreinformers.NodeInformer
EventRecorder record.EventRecorder
EnableDynamicProvisioning bool
}
// NewController creates a new PersistentVolume controller
func NewController(p ControllerParameters) (*PersistentVolumeController, error) {
eventRecorder := p.EventRecorder
if eventRecorder == nil {
broadcaster := record.NewBroadcaster()
broadcaster.StartLogging(klog.Infof)
broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: p.KubeClient.CoreV1().Events("")})
eventRecorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "persistentvolume-controller"})
}
controller := &PersistentVolumeController{
volumes: newPersistentVolumeOrderedIndex(),
claims: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
kubeClient: p.KubeClient,
eventRecorder: eventRecorder,
runningOperations: goroutinemap.NewGoRoutineMap(true /* exponentialBackOffOnError */),
cloud: p.Cloud,
enableDynamicProvisioning: p.EnableDynamicProvisioning,
clusterName: p.ClusterName,
createProvisionedPVRetryCount: createProvisionedPVRetryCount,
createProvisionedPVInterval: createProvisionedPVInterval,
claimQueue: workqueue.NewNamed("claims"),
volumeQueue: workqueue.NewNamed("volumes"),
resyncPeriod: p.SyncPeriod,
}
// Prober is nil because PV is not aware of Flexvolume.
if err := controller.volumePluginMgr.InitPlugins(p.VolumePlugins, nil /* prober */, controller); err != nil {
return nil, fmt.Errorf("Could not initialize volume plugins for PersistentVolume Controller: %v", err)
}
p.VolumeInformer.Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) { controller.enqueueWork(controller.volumeQueue, obj) },
UpdateFunc: func(oldObj, newObj interface{}) { controller.enqueueWork(controller.volumeQueue, newObj) },
DeleteFunc: func(obj interface{}) { controller.enqueueWork(controller.volumeQueue, obj) },
},
)
controller.volumeLister = p.VolumeInformer.Lister()
controller.volumeListerSynced = p.VolumeInformer.Informer().HasSynced
p.ClaimInformer.Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) { controller.enqueueWork(controller.claimQueue, obj) },
UpdateFunc: func(oldObj, newObj interface{}) { controller.enqueueWork(controller.claimQueue, newObj) },
DeleteFunc: func(obj interface{}) { controller.enqueueWork(controller.claimQueue, obj) },
},
)
controller.claimLister = p.ClaimInformer.Lister()
controller.claimListerSynced = p.ClaimInformer.Informer().HasSynced
controller.classLister = p.ClassInformer.Lister()
controller.classListerSynced = p.ClassInformer.Informer().HasSynced
controller.podLister = p.PodInformer.Lister()
controller.podListerSynced = p.PodInformer.Informer().HasSynced
controller.NodeLister = p.NodeInformer.Lister()
controller.NodeListerSynced = p.NodeInformer.Informer().HasSynced
return controller, nil
}
// initializeCaches fills all controller caches with initial data from etcd in
// order to have the caches already filled when first addClaim/addVolume to
// perform initial synchronization of the controller.
func (ctrl *PersistentVolumeController) initializeCaches(volumeLister corelisters.PersistentVolumeLister, claimLister corelisters.PersistentVolumeClaimLister) {
volumeList, err := volumeLister.List(labels.Everything())
if err != nil {
klog.Errorf("PersistentVolumeController can't initialize caches: %v", err)
return
}
for _, volume := range volumeList {
volumeClone := volume.DeepCopy()
if _, err = ctrl.storeVolumeUpdate(volumeClone); err != nil {
klog.Errorf("error updating volume cache: %v", err)
}
}
claimList, err := claimLister.List(labels.Everything())
if err != nil {
klog.Errorf("PersistentVolumeController can't initialize caches: %v", err)
return
}
for _, claim := range claimList {
if _, err = ctrl.storeClaimUpdate(claim.DeepCopy()); err != nil {
klog.Errorf("error updating claim cache: %v", err)
}
}
klog.V(4).Infof("controller initialized")
}
// enqueueWork adds volume or claim to given work queue.
func (ctrl *PersistentVolumeController) enqueueWork(queue workqueue.Interface, obj interface{}) {
// Beware of "xxx deleted" events
if unknown, ok := obj.(cache.DeletedFinalStateUnknown); ok && unknown.Obj != nil {
obj = unknown.Obj
}
objName, err := controller.KeyFunc(obj)
if err != nil {
klog.Errorf("failed to get key from object: %v", err)
return
}
klog.V(5).Infof("enqueued %q for sync", objName)
queue.Add(objName)
}
func (ctrl *PersistentVolumeController) storeVolumeUpdate(volume interface{}) (bool, error) {
return storeObjectUpdate(ctrl.volumes.store, volume, "volume")
}
func (ctrl *PersistentVolumeController) storeClaimUpdate(claim interface{}) (bool, error) {
return storeObjectUpdate(ctrl.claims, claim, "claim")
}
// updateVolume runs in worker thread and handles "volume added",
// "volume updated" and "periodic sync" events.
func (ctrl *PersistentVolumeController) updateVolume(volume *v1.PersistentVolume) {
// Store the new volume version in the cache and do not process it if this
// is an old version.
new, err := ctrl.storeVolumeUpdate(volume)
if err != nil {
klog.Errorf("%v", err)
}
if !new {
return
}
err = ctrl.syncVolume(volume)
if err != nil {
if errors.IsConflict(err) {
// Version conflict error happens quite often and the controller
// recovers from it easily.
klog.V(3).Infof("could not sync volume %q: %+v", volume.Name, err)
} else {
klog.Errorf("could not sync volume %q: %+v", volume.Name, err)
}
}
}
// deleteVolume runs in worker thread and handles "volume deleted" event.
func (ctrl *PersistentVolumeController) deleteVolume(volume *v1.PersistentVolume) {
_ = ctrl.volumes.store.Delete(volume)
klog.V(4).Infof("volume %q deleted", volume.Name)
if volume.Spec.ClaimRef == nil {
return
}
// sync the claim when its volume is deleted. Explicitly syncing the
// claim here in response to volume deletion prevents the claim from
// waiting until the next sync period for its Lost status.
claimKey := claimrefToClaimKey(volume.Spec.ClaimRef)
klog.V(5).Infof("deleteVolume[%s]: scheduling sync of claim %q", volume.Name, claimKey)
ctrl.claimQueue.Add(claimKey)
}
// updateClaim runs in worker thread and handles "claim added",
// "claim updated" and "periodic sync" events.
func (ctrl *PersistentVolumeController) updateClaim(claim *v1.PersistentVolumeClaim) {
// Store the new claim version in the cache and do not process it if this is
// an old version.
new, err := ctrl.storeClaimUpdate(claim)
if err != nil {
klog.Errorf("%v", err)
}
if !new {
return
}
err = ctrl.syncClaim(claim)
if err != nil {
if errors.IsConflict(err) {
// Version conflict error happens quite often and the controller
// recovers from it easily.
klog.V(3).Infof("could not sync claim %q: %+v", claimToClaimKey(claim), err)
} else {
klog.Errorf("could not sync volume %q: %+v", claimToClaimKey(claim), err)
}
}
}
// deleteClaim runs in worker thread and handles "claim deleted" event.
func (ctrl *PersistentVolumeController) deleteClaim(claim *v1.PersistentVolumeClaim) {
_ = ctrl.claims.Delete(claim)
klog.V(4).Infof("claim %q deleted", claimToClaimKey(claim))
volumeName := claim.Spec.VolumeName
if volumeName == "" {
klog.V(5).Infof("deleteClaim[%q]: volume not bound", claimToClaimKey(claim))
return
}
// sync the volume when its claim is deleted. Explicitly sync'ing the
// volume here in response to claim deletion prevents the volume from
// waiting until the next sync period for its Release.
klog.V(5).Infof("deleteClaim[%q]: scheduling sync of volume %s", claimToClaimKey(claim), volumeName)
ctrl.volumeQueue.Add(volumeName)
}
// Run starts all of this controller's control loops
func (ctrl *PersistentVolumeController) Run(stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
defer ctrl.claimQueue.ShutDown()
defer ctrl.volumeQueue.ShutDown()
klog.Infof("Starting persistent volume controller")
defer klog.Infof("Shutting down persistent volume controller")
if !controller.WaitForCacheSync("persistent volume", stopCh, ctrl.volumeListerSynced, ctrl.claimListerSynced, ctrl.classListerSynced, ctrl.podListerSynced, ctrl.NodeListerSynced) {
return
}
ctrl.initializeCaches(ctrl.volumeLister, ctrl.claimLister)
go wait.Until(ctrl.resync, ctrl.resyncPeriod, stopCh)
go wait.Until(ctrl.volumeWorker, time.Second, stopCh)
go wait.Until(ctrl.claimWorker, time.Second, stopCh)
metrics.Register(ctrl.volumes.store, ctrl.claims)
<-stopCh
}
// volumeWorker processes items from volumeQueue. It must run only once,
// syncVolume is not assured to be reentrant.
func (ctrl *PersistentVolumeController) volumeWorker() {
workFunc := func() bool {
keyObj, quit := ctrl.volumeQueue.Get()
if quit {
return true
}
defer ctrl.volumeQueue.Done(keyObj)
key := keyObj.(string)
klog.V(5).Infof("volumeWorker[%s]", key)
_, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
klog.V(4).Infof("error getting name of volume %q to get volume from informer: %v", key, err)
return false
}
volume, err := ctrl.volumeLister.Get(name)
if err == nil {
// The volume still exists in informer cache, the event must have
// been add/update/sync
ctrl.updateVolume(volume)
return false
}
if !errors.IsNotFound(err) {
klog.V(2).Infof("error getting volume %q from informer: %v", key, err)
return false
}
// The volume is not in informer cache, the event must have been
// "delete"
volumeObj, found, err := ctrl.volumes.store.GetByKey(key)
if err != nil {
klog.V(2).Infof("error getting volume %q from cache: %v", key, err)
return false
}
if !found {
// The controller has already processed the delete event and
// deleted the volume from its cache
klog.V(2).Infof("deletion of volume %q was already processed", key)
return false
}
volume, ok := volumeObj.(*v1.PersistentVolume)
if !ok {
klog.Errorf("expected volume, got %+v", volumeObj)
return false
}
ctrl.deleteVolume(volume)
return false
}
for {
if quit := workFunc(); quit {
klog.Infof("volume worker queue shutting down")
return
}
}
}
// claimWorker processes items from claimQueue. It must run only once,
// syncClaim is not reentrant.
func (ctrl *PersistentVolumeController) claimWorker() {
workFunc := func() bool {
keyObj, quit := ctrl.claimQueue.Get()
if quit {
return true
}
defer ctrl.claimQueue.Done(keyObj)
key := keyObj.(string)
klog.V(5).Infof("claimWorker[%s]", key)
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
klog.V(4).Infof("error getting namespace & name of claim %q to get claim from informer: %v", key, err)
return false
}
claim, err := ctrl.claimLister.PersistentVolumeClaims(namespace).Get(name)
if err == nil {
// The claim still exists in informer cache, the event must have
// been add/update/sync
ctrl.updateClaim(claim)
return false
}
if !errors.IsNotFound(err) {
klog.V(2).Infof("error getting claim %q from informer: %v", key, err)
return false
}
// The claim is not in informer cache, the event must have been "delete"
claimObj, found, err := ctrl.claims.GetByKey(key)
if err != nil {
klog.V(2).Infof("error getting claim %q from cache: %v", key, err)
return false
}
if !found {
// The controller has already processed the delete event and
// deleted the claim from its cache
klog.V(2).Infof("deletion of claim %q was already processed", key)
return false
}
claim, ok := claimObj.(*v1.PersistentVolumeClaim)
if !ok {
klog.Errorf("expected claim, got %+v", claimObj)
return false
}
ctrl.deleteClaim(claim)
return false
}
for {
if quit := workFunc(); quit {
klog.Infof("claim worker queue shutting down")
return
}
}
}
// resync supplements short resync period of shared informers - we don't want
// all consumers of PV/PVC shared informer to have a short resync period,
// therefore we do our own.
func (ctrl *PersistentVolumeController) resync() {
klog.V(4).Infof("resyncing PV controller")
pvcs, err := ctrl.claimLister.List(labels.NewSelector())
if err != nil {
klog.Warningf("cannot list claims: %s", err)
return
}
for _, pvc := range pvcs {
ctrl.enqueueWork(ctrl.claimQueue, pvc)
}
pvs, err := ctrl.volumeLister.List(labels.NewSelector())
if err != nil {
klog.Warningf("cannot list persistent volumes: %s", err)
return
}
for _, pv := range pvs {
ctrl.enqueueWork(ctrl.volumeQueue, pv)
}
}
// setClaimProvisioner saves
// claim.Annotations[annStorageProvisioner] = class.Provisioner
func (ctrl *PersistentVolumeController) setClaimProvisioner(claim *v1.PersistentVolumeClaim, provisionerName string) (*v1.PersistentVolumeClaim, error) {
if val, ok := claim.Annotations[annStorageProvisioner]; ok && val == provisionerName {
// annotation is already set, nothing to do
return claim, nil
}
// The volume from method args can be pointing to watcher cache. We must not
// modify these, therefore create a copy.
claimClone := claim.DeepCopy()
metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, annStorageProvisioner, provisionerName)
newClaim, err := ctrl.kubeClient.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(claimClone)
if err != nil {
return newClaim, err
}
_, err = ctrl.storeClaimUpdate(newClaim)
if err != nil {
return newClaim, err
}
return newClaim, nil
}
// Stateless functions
func getClaimStatusForLogging(claim *v1.PersistentVolumeClaim) string {
bound := metav1.HasAnnotation(claim.ObjectMeta, annBindCompleted)
boundByController := metav1.HasAnnotation(claim.ObjectMeta, annBoundByController)
return fmt.Sprintf("phase: %s, bound to: %q, bindCompleted: %v, boundByController: %v", claim.Status.Phase, claim.Spec.VolumeName, bound, boundByController)
}
func getVolumeStatusForLogging(volume *v1.PersistentVolume) string {
boundByController := metav1.HasAnnotation(volume.ObjectMeta, annBoundByController)
claimName := ""
if volume.Spec.ClaimRef != nil {
claimName = fmt.Sprintf("%s/%s (uid: %s)", volume.Spec.ClaimRef.Namespace, volume.Spec.ClaimRef.Name, volume.Spec.ClaimRef.UID)
}
return fmt.Sprintf("phase: %s, bound to: %q, boundByController: %v", volume.Status.Phase, claimName, boundByController)
}
// storeObjectUpdate updates given cache with a new object version from Informer
// callback (i.e. with events from etcd) or with an object modified by the
// controller itself. Returns "true", if the cache was updated, false if the
// object is an old version and should be ignored.
func storeObjectUpdate(store cache.Store, obj interface{}, className string) (bool, error) {
objName, err := controller.KeyFunc(obj)
if err != nil {
return false, fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)
}
oldObj, found, err := store.Get(obj)
if err != nil {
return false, fmt.Errorf("Error finding %s %q in controller cache: %v", className, objName, err)
}
objAccessor, err := meta.Accessor(obj)
if err != nil {
return false, err
}
if !found {
// This is a new object
klog.V(4).Infof("storeObjectUpdate: adding %s %q, version %s", className, objName, objAccessor.GetResourceVersion())
if err = store.Add(obj); err != nil {
return false, fmt.Errorf("Error adding %s %q to controller cache: %v", className, objName, err)
}
return true, nil
}
oldObjAccessor, err := meta.Accessor(oldObj)
if err != nil {
return false, err
}
objResourceVersion, err := strconv.ParseInt(objAccessor.GetResourceVersion(), 10, 64)
if err != nil {
return false, fmt.Errorf("Error parsing ResourceVersion %q of %s %q: %s", objAccessor.GetResourceVersion(), className, objName, err)
}
oldObjResourceVersion, err := strconv.ParseInt(oldObjAccessor.GetResourceVersion(), 10, 64)
if err != nil {
return false, fmt.Errorf("Error parsing old ResourceVersion %q of %s %q: %s", oldObjAccessor.GetResourceVersion(), className, objName, err)
}
// Throw away only older version, let the same version pass - we do want to
// get periodic sync events.
if oldObjResourceVersion > objResourceVersion {
klog.V(4).Infof("storeObjectUpdate: ignoring %s %q version %s", className, objName, objAccessor.GetResourceVersion())
return false, nil
}
klog.V(4).Infof("storeObjectUpdate updating %s %q with version %s", className, objName, objAccessor.GetResourceVersion())
if err = store.Update(obj); err != nil {
return false, fmt.Errorf("Error updating %s %q in controller cache: %v", className, objName, err)
}
return true, nil
}

View File

@ -0,0 +1,444 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"fmt"
"strconv"
"sync"
"k8s.io/klog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/tools/cache"
)
// AssumeCache is a cache on top of the informer that allows for updating
// objects outside of informer events and also restoring the informer
// cache's version of the object. Objects are assumed to be
// Kubernetes API objects that implement meta.Interface
type AssumeCache interface {
// Assume updates the object in-memory only
Assume(obj interface{}) error
// Restore the informer cache's version of the object
Restore(objName string)
// Get the object by name
Get(objName string) (interface{}, error)
// Get the API object by name
GetAPIObj(objName string) (interface{}, error)
// List all the objects in the cache
List(indexObj interface{}) []interface{}
}
type errWrongType struct {
typeName string
object interface{}
}
func (e *errWrongType) Error() string {
return fmt.Sprintf("could not convert object to type %v: %+v", e.typeName, e.object)
}
type errNotFound struct {
typeName string
objectName string
}
func (e *errNotFound) Error() string {
return fmt.Sprintf("could not find %v %q", e.typeName, e.objectName)
}
type errObjectName struct {
detailedErr error
}
func (e *errObjectName) Error() string {
return fmt.Sprintf("failed to get object name: %v", e.detailedErr)
}
// assumeCache stores two pointers to represent a single object:
// * The pointer to the informer object.
// * The pointer to the latest object, which could be the same as
// the informer object, or an in-memory object.
//
// An informer update always overrides the latest object pointer.
//
// Assume() only updates the latest object pointer.
// Restore() sets the latest object pointer back to the informer object.
// Get/List() always returns the latest object pointer.
type assumeCache struct {
// Synchronizes updates to store
rwMutex sync.RWMutex
// describes the object stored
description string
// Stores objInfo pointers
store cache.Indexer
// Index function for object
indexFunc cache.IndexFunc
indexName string
}
type objInfo struct {
// name of the object
name string
// Latest version of object could be cached-only or from informer
latestObj interface{}
// Latest object from informer
apiObj interface{}
}
func objInfoKeyFunc(obj interface{}) (string, error) {
objInfo, ok := obj.(*objInfo)
if !ok {
return "", &errWrongType{"objInfo", obj}
}
return objInfo.name, nil
}
func (c *assumeCache) objInfoIndexFunc(obj interface{}) ([]string, error) {
objInfo, ok := obj.(*objInfo)
if !ok {
return []string{""}, &errWrongType{"objInfo", obj}
}
return c.indexFunc(objInfo.latestObj)
}
func NewAssumeCache(informer cache.SharedIndexInformer, description, indexName string, indexFunc cache.IndexFunc) *assumeCache {
c := &assumeCache{
description: description,
indexFunc: indexFunc,
indexName: indexName,
}
c.store = cache.NewIndexer(objInfoKeyFunc, cache.Indexers{indexName: c.objInfoIndexFunc})
// Unit tests don't use informers
if informer != nil {
informer.AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: c.add,
UpdateFunc: c.update,
DeleteFunc: c.delete,
},
)
}
return c
}
func (c *assumeCache) add(obj interface{}) {
if obj == nil {
return
}
name, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("add failed: %v", &errObjectName{err})
return
}
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
if objInfo, _ := c.getObjInfo(name); objInfo != nil {
newVersion, err := c.getObjVersion(name, obj)
if err != nil {
klog.Errorf("add: couldn't get object version: %v", err)
return
}
storedVersion, err := c.getObjVersion(name, objInfo.latestObj)
if err != nil {
klog.Errorf("add: couldn't get stored object version: %v", err)
return
}
// Only update object if version is newer.
// This is so we don't override assumed objects due to informer resync.
if newVersion <= storedVersion {
klog.V(10).Infof("Skip adding %v %v to assume cache because version %v is not newer than %v", c.description, name, newVersion, storedVersion)
return
}
}
objInfo := &objInfo{name: name, latestObj: obj, apiObj: obj}
c.store.Update(objInfo)
klog.V(10).Infof("Adding %v %v to assume cache: %+v ", c.description, name, obj)
}
func (c *assumeCache) update(oldObj interface{}, newObj interface{}) {
c.add(newObj)
}
func (c *assumeCache) delete(obj interface{}) {
if obj == nil {
return
}
name, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("delete failed: %v", &errObjectName{err})
return
}
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
objInfo := &objInfo{name: name}
err = c.store.Delete(objInfo)
if err != nil {
klog.Errorf("delete: failed to delete %v %v: %v", c.description, name, err)
}
}
func (c *assumeCache) getObjVersion(name string, obj interface{}) (int64, error) {
objAccessor, err := meta.Accessor(obj)
if err != nil {
return -1, err
}
objResourceVersion, err := strconv.ParseInt(objAccessor.GetResourceVersion(), 10, 64)
if err != nil {
return -1, fmt.Errorf("error parsing ResourceVersion %q for %v %q: %s", objAccessor.GetResourceVersion(), c.description, name, err)
}
return objResourceVersion, nil
}
func (c *assumeCache) getObjInfo(name string) (*objInfo, error) {
obj, ok, err := c.store.GetByKey(name)
if err != nil {
return nil, err
}
if !ok {
return nil, &errNotFound{c.description, name}
}
objInfo, ok := obj.(*objInfo)
if !ok {
return nil, &errWrongType{"objInfo", obj}
}
return objInfo, nil
}
func (c *assumeCache) Get(objName string) (interface{}, error) {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
objInfo, err := c.getObjInfo(objName)
if err != nil {
return nil, err
}
return objInfo.latestObj, nil
}
func (c *assumeCache) GetAPIObj(objName string) (interface{}, error) {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
objInfo, err := c.getObjInfo(objName)
if err != nil {
return nil, err
}
return objInfo.apiObj, nil
}
func (c *assumeCache) List(indexObj interface{}) []interface{} {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
allObjs := []interface{}{}
objs, err := c.store.Index(c.indexName, &objInfo{latestObj: indexObj})
if err != nil {
klog.Errorf("list index error: %v", err)
return nil
}
for _, obj := range objs {
objInfo, ok := obj.(*objInfo)
if !ok {
klog.Errorf("list error: %v", &errWrongType{"objInfo", obj})
continue
}
allObjs = append(allObjs, objInfo.latestObj)
}
return allObjs
}
func (c *assumeCache) Assume(obj interface{}) error {
name, err := cache.MetaNamespaceKeyFunc(obj)
if err != nil {
return &errObjectName{err}
}
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
objInfo, err := c.getObjInfo(name)
if err != nil {
return err
}
newVersion, err := c.getObjVersion(name, obj)
if err != nil {
return err
}
storedVersion, err := c.getObjVersion(name, objInfo.latestObj)
if err != nil {
return err
}
if newVersion < storedVersion {
return fmt.Errorf("%v %q is out of sync (stored: %d, assume: %d)", c.description, name, storedVersion, newVersion)
}
// Only update the cached object
objInfo.latestObj = obj
klog.V(4).Infof("Assumed %v %q, version %v", c.description, name, newVersion)
return nil
}
func (c *assumeCache) Restore(objName string) {
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
objInfo, err := c.getObjInfo(objName)
if err != nil {
// This could be expected if object got deleted
klog.V(5).Infof("Restore %v %q warning: %v", c.description, objName, err)
} else {
objInfo.latestObj = objInfo.apiObj
klog.V(4).Infof("Restored %v %q", c.description, objName)
}
}
// PVAssumeCache is a AssumeCache for PersistentVolume objects
type PVAssumeCache interface {
AssumeCache
GetPV(pvName string) (*v1.PersistentVolume, error)
GetAPIPV(pvName string) (*v1.PersistentVolume, error)
ListPVs(storageClassName string) []*v1.PersistentVolume
}
type pvAssumeCache struct {
*assumeCache
}
func pvStorageClassIndexFunc(obj interface{}) ([]string, error) {
if pv, ok := obj.(*v1.PersistentVolume); ok {
return []string{pv.Spec.StorageClassName}, nil
}
return []string{""}, fmt.Errorf("object is not a v1.PersistentVolume: %v", obj)
}
func NewPVAssumeCache(informer cache.SharedIndexInformer) PVAssumeCache {
return &pvAssumeCache{assumeCache: NewAssumeCache(informer, "v1.PersistentVolume", "storageclass", pvStorageClassIndexFunc)}
}
func (c *pvAssumeCache) GetPV(pvName string) (*v1.PersistentVolume, error) {
obj, err := c.Get(pvName)
if err != nil {
return nil, err
}
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
return nil, &errWrongType{"v1.PersistentVolume", obj}
}
return pv, nil
}
func (c *pvAssumeCache) GetAPIPV(pvName string) (*v1.PersistentVolume, error) {
obj, err := c.GetAPIObj(pvName)
if err != nil {
return nil, err
}
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
return nil, &errWrongType{"v1.PersistentVolume", obj}
}
return pv, nil
}
func (c *pvAssumeCache) ListPVs(storageClassName string) []*v1.PersistentVolume {
objs := c.List(&v1.PersistentVolume{
Spec: v1.PersistentVolumeSpec{
StorageClassName: storageClassName,
},
})
pvs := []*v1.PersistentVolume{}
for _, obj := range objs {
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
klog.Errorf("ListPVs: %v", &errWrongType{"v1.PersistentVolume", obj})
}
pvs = append(pvs, pv)
}
return pvs
}
// PVCAssumeCache is a AssumeCache for PersistentVolumeClaim objects
type PVCAssumeCache interface {
AssumeCache
// GetPVC returns the PVC from the cache with given pvcKey.
// pvcKey is the result of MetaNamespaceKeyFunc on PVC obj
GetPVC(pvcKey string) (*v1.PersistentVolumeClaim, error)
GetAPIPVC(pvcKey string) (*v1.PersistentVolumeClaim, error)
}
type pvcAssumeCache struct {
*assumeCache
}
func NewPVCAssumeCache(informer cache.SharedIndexInformer) PVCAssumeCache {
return &pvcAssumeCache{assumeCache: NewAssumeCache(informer, "v1.PersistentVolumeClaim", "namespace", cache.MetaNamespaceIndexFunc)}
}
func (c *pvcAssumeCache) GetPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
obj, err := c.Get(pvcKey)
if err != nil {
return nil, err
}
pvc, ok := obj.(*v1.PersistentVolumeClaim)
if !ok {
return nil, &errWrongType{"v1.PersistentVolumeClaim", obj}
}
return pvc, nil
}
func (c *pvcAssumeCache) GetAPIPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
obj, err := c.GetAPIObj(pvcKey)
if err != nil {
return nil, err
}
pvc, ok := obj.(*v1.PersistentVolumeClaim)
if !ok {
return nil, &errWrongType{"v1.PersistentVolumeClaim", obj}
}
return pvc, nil
}

View File

@ -0,0 +1,60 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"github.com/prometheus/client_golang/prometheus"
)
// VolumeSchedulerSubsystem - subsystem name used by scheduler
const VolumeSchedulerSubsystem = "scheduler_volume"
var (
VolumeBindingRequestSchedulerBinderCache = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "binder_cache_requests_total",
Help: "Total number for request volume binding cache",
},
[]string{"operation"},
)
VolumeSchedulingStageLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "scheduling_duration_seconds",
Help: "Volume scheduling stage latency",
Buckets: prometheus.ExponentialBuckets(1000, 2, 15),
},
[]string{"operation"},
)
VolumeSchedulingStageFailed = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "scheduling_stage_error_total",
Help: "Volume scheduling stage error count",
},
[]string{"operation"},
)
)
// RegisterVolumeSchedulingMetrics is used for scheduler, because the volume binding cache is a library
// used by scheduler process.
func RegisterVolumeSchedulingMetrics() {
prometheus.MustRegister(VolumeBindingRequestSchedulerBinderCache)
prometheus.MustRegister(VolumeSchedulingStageLatency)
prometheus.MustRegister(VolumeSchedulingStageFailed)
}

View File

@ -0,0 +1,792 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"fmt"
"sort"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/storage/etcd"
coreinformers "k8s.io/client-go/informers/core/v1"
storageinformers "k8s.io/client-go/informers/storage/v1"
clientset "k8s.io/client-go/kubernetes"
storagelisters "k8s.io/client-go/listers/storage/v1"
"k8s.io/klog"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
volumeutil "k8s.io/kubernetes/pkg/volume/util"
)
// SchedulerVolumeBinder is used by the scheduler to handle PVC/PV binding
// and dynamic provisioning. The binding decisions are integrated into the pod scheduling
// workflow so that the PV NodeAffinity is also considered along with the pod's other
// scheduling requirements.
//
// This integrates into the existing default scheduler workflow as follows:
// 1. The scheduler takes a Pod off the scheduler queue and processes it serially:
// a. Invokes all predicate functions, parallelized across nodes. FindPodVolumes() is invoked here.
// b. Invokes all priority functions. Future/TBD
// c. Selects the best node for the Pod.
// d. Cache the node selection for the Pod. AssumePodVolumes() is invoked here.
// i. If PVC binding is required, cache in-memory only:
// * For manual binding: update PV objects for prebinding to the corresponding PVCs.
// * For dynamic provisioning: update PVC object with a selected node from c)
// * For the pod, which PVCs and PVs need API updates.
// ii. Afterwards, the main scheduler caches the Pod->Node binding in the scheduler's pod cache,
// This is handled in the scheduler and not here.
// e. Asynchronously bind volumes and pod in a separate goroutine
// i. BindPodVolumes() is called first. It makes all the necessary API updates and waits for
// PV controller to fully bind and provision the PVCs. If binding fails, the Pod is sent
// back through the scheduler.
// ii. After BindPodVolumes() is complete, then the scheduler does the final Pod->Node binding.
// 2. Once all the assume operations are done in d), the scheduler processes the next Pod in the scheduler queue
// while the actual binding operation occurs in the background.
type SchedulerVolumeBinder interface {
// FindPodVolumes checks if all of a Pod's PVCs can be satisfied by the node.
//
// If a PVC is bound, it checks if the PV's NodeAffinity matches the Node.
// Otherwise, it tries to find an available PV to bind to the PVC.
//
// It returns true if all of the Pod's PVCs have matching PVs or can be dynamic provisioned,
// and returns true if bound volumes satisfy the PV NodeAffinity.
//
// This function is called by the volume binding scheduler predicate and can be called in parallel
FindPodVolumes(pod *v1.Pod, node *v1.Node) (unboundVolumesSatisified, boundVolumesSatisfied bool, err error)
// AssumePodVolumes will:
// 1. Take the PV matches for unbound PVCs and update the PV cache assuming
// that the PV is prebound to the PVC.
// 2. Take the PVCs that need provisioning and update the PVC cache with related
// annotations set.
//
// It returns true if all volumes are fully bound
//
// This function will modify assumedPod with the node name.
// This function is called serially.
AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (allFullyBound bool, err error)
// BindPodVolumes will:
// 1. Initiate the volume binding by making the API call to prebind the PV
// to its matching PVC.
// 2. Trigger the volume provisioning by making the API call to set related
// annotations on the PVC
// 3. Wait for PVCs to be completely bound by the PV controller
//
// This function can be called in parallel.
BindPodVolumes(assumedPod *v1.Pod) error
// GetBindingsCache returns the cache used (if any) to store volume binding decisions.
GetBindingsCache() PodBindingCache
}
type volumeBinder struct {
kubeClient clientset.Interface
classLister storagelisters.StorageClassLister
nodeInformer coreinformers.NodeInformer
pvcCache PVCAssumeCache
pvCache PVAssumeCache
// Stores binding decisions that were made in FindPodVolumes for use in AssumePodVolumes.
// AssumePodVolumes modifies the bindings again for use in BindPodVolumes.
podBindingCache PodBindingCache
// Amount of time to wait for the bind operation to succeed
bindTimeout time.Duration
}
// NewVolumeBinder sets up all the caches needed for the scheduler to make volume binding decisions.
func NewVolumeBinder(
kubeClient clientset.Interface,
nodeInformer coreinformers.NodeInformer,
pvcInformer coreinformers.PersistentVolumeClaimInformer,
pvInformer coreinformers.PersistentVolumeInformer,
storageClassInformer storageinformers.StorageClassInformer,
bindTimeout time.Duration) SchedulerVolumeBinder {
b := &volumeBinder{
kubeClient: kubeClient,
classLister: storageClassInformer.Lister(),
nodeInformer: nodeInformer,
pvcCache: NewPVCAssumeCache(pvcInformer.Informer()),
pvCache: NewPVAssumeCache(pvInformer.Informer()),
podBindingCache: NewPodBindingCache(),
bindTimeout: bindTimeout,
}
return b
}
func (b *volumeBinder) GetBindingsCache() PodBindingCache {
return b.podBindingCache
}
func podHasClaims(pod *v1.Pod) bool {
for _, vol := range pod.Spec.Volumes {
if vol.PersistentVolumeClaim != nil {
return true
}
}
return false
}
// FindPodVolumes caches the matching PVs and PVCs to provision per node in podBindingCache.
// This method intentionally takes in a *v1.Node object instead of using volumebinder.nodeInformer.
// That's necessary because some operations will need to pass in to the predicate fake node objects.
func (b *volumeBinder) FindPodVolumes(pod *v1.Pod, node *v1.Node) (unboundVolumesSatisfied, boundVolumesSatisfied bool, err error) {
podName := getPodName(pod)
// Warning: Below log needs high verbosity as it can be printed several times (#60933).
klog.V(5).Infof("FindPodVolumes for pod %q, node %q", podName, node.Name)
// Initialize to true for pods that don't have volumes
unboundVolumesSatisfied = true
boundVolumesSatisfied = true
start := time.Now()
defer func() {
VolumeSchedulingStageLatency.WithLabelValues("predicate").Observe(time.Since(start).Seconds())
if err != nil {
VolumeSchedulingStageFailed.WithLabelValues("predicate").Inc()
}
}()
if !podHasClaims(pod) {
// Fast path
return unboundVolumesSatisfied, boundVolumesSatisfied, nil
}
var (
matchedClaims []*bindingInfo
provisionedClaims []*v1.PersistentVolumeClaim
)
defer func() {
// We recreate bindings for each new schedule loop.
if len(matchedClaims) == 0 && len(provisionedClaims) == 0 {
// Clear cache if no claims to bind or provision for this node.
b.podBindingCache.ClearBindings(pod, node.Name)
return
}
// Although we do not distinguish nil from empty in this function, for
// easier testing, we normalize empty to nil.
if len(matchedClaims) == 0 {
matchedClaims = nil
}
if len(provisionedClaims) == 0 {
provisionedClaims = nil
}
// Mark cache with all matched and provisioned claims for this node
b.podBindingCache.UpdateBindings(pod, node.Name, matchedClaims, provisionedClaims)
}()
// The pod's volumes need to be processed in one call to avoid the race condition where
// volumes can get bound/provisioned in between calls.
boundClaims, claimsToBind, unboundClaimsImmediate, err := b.getPodVolumes(pod)
if err != nil {
return false, false, err
}
// Immediate claims should be bound
if len(unboundClaimsImmediate) > 0 {
return false, false, fmt.Errorf("pod has unbound immediate PersistentVolumeClaims")
}
// Check PV node affinity on bound volumes
if len(boundClaims) > 0 {
boundVolumesSatisfied, err = b.checkBoundClaims(boundClaims, node, podName)
if err != nil {
return false, false, err
}
}
// Find matching volumes and node for unbound claims
if len(claimsToBind) > 0 {
var (
claimsToFindMatching []*v1.PersistentVolumeClaim
claimsToProvision []*v1.PersistentVolumeClaim
)
// Filter out claims to provision
for _, claim := range claimsToBind {
if selectedNode, ok := claim.Annotations[annSelectedNode]; ok {
if selectedNode != node.Name {
// Fast path, skip unmatched node
return false, boundVolumesSatisfied, nil
}
claimsToProvision = append(claimsToProvision, claim)
} else {
claimsToFindMatching = append(claimsToFindMatching, claim)
}
}
// Find matching volumes
if len(claimsToFindMatching) > 0 {
var unboundClaims []*v1.PersistentVolumeClaim
unboundVolumesSatisfied, matchedClaims, unboundClaims, err = b.findMatchingVolumes(pod, claimsToFindMatching, node)
if err != nil {
return false, false, err
}
claimsToProvision = append(claimsToProvision, unboundClaims...)
}
// Check for claims to provision
if len(claimsToProvision) > 0 {
unboundVolumesSatisfied, provisionedClaims, err = b.checkVolumeProvisions(pod, claimsToProvision, node)
if err != nil {
return false, false, err
}
}
}
return unboundVolumesSatisfied, boundVolumesSatisfied, nil
}
// AssumePodVolumes will take the cached matching PVs and PVCs to provision
// in podBindingCache for the chosen node, and:
// 1. Update the pvCache with the new prebound PV.
// 2. Update the pvcCache with the new PVCs with annotations set
// 3. Update podBindingCache again with cached API updates for PVs and PVCs.
func (b *volumeBinder) AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (allFullyBound bool, err error) {
podName := getPodName(assumedPod)
klog.V(4).Infof("AssumePodVolumes for pod %q, node %q", podName, nodeName)
start := time.Now()
defer func() {
VolumeSchedulingStageLatency.WithLabelValues("assume").Observe(time.Since(start).Seconds())
if err != nil {
VolumeSchedulingStageFailed.WithLabelValues("assume").Inc()
}
}()
if allBound := b.arePodVolumesBound(assumedPod); allBound {
klog.V(4).Infof("AssumePodVolumes for pod %q, node %q: all PVCs bound and nothing to do", podName, nodeName)
return true, nil
}
assumedPod.Spec.NodeName = nodeName
claimsToBind := b.podBindingCache.GetBindings(assumedPod, nodeName)
claimsToProvision := b.podBindingCache.GetProvisionedPVCs(assumedPod, nodeName)
// Assume PV
newBindings := []*bindingInfo{}
for _, binding := range claimsToBind {
newPV, dirty, err := GetBindVolumeToClaim(binding.pv, binding.pvc)
klog.V(5).Infof("AssumePodVolumes: GetBindVolumeToClaim for pod %q, PV %q, PVC %q. newPV %p, dirty %v, err: %v",
podName,
binding.pv.Name,
binding.pvc.Name,
newPV,
dirty,
err)
if err != nil {
b.revertAssumedPVs(newBindings)
return false, err
}
// TODO: can we assume everytime?
if dirty {
err = b.pvCache.Assume(newPV)
if err != nil {
b.revertAssumedPVs(newBindings)
return false, err
}
}
newBindings = append(newBindings, &bindingInfo{pv: newPV, pvc: binding.pvc})
}
// Assume PVCs
newProvisionedPVCs := []*v1.PersistentVolumeClaim{}
for _, claim := range claimsToProvision {
// The claims from method args can be pointing to watcher cache. We must not
// modify these, therefore create a copy.
claimClone := claim.DeepCopy()
metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, annSelectedNode, nodeName)
err = b.pvcCache.Assume(claimClone)
if err != nil {
b.revertAssumedPVs(newBindings)
b.revertAssumedPVCs(newProvisionedPVCs)
return
}
newProvisionedPVCs = append(newProvisionedPVCs, claimClone)
}
// Update cache with the assumed pvcs and pvs
// Even if length is zero, update the cache with an empty slice to indicate that no
// operations are needed
b.podBindingCache.UpdateBindings(assumedPod, nodeName, newBindings, newProvisionedPVCs)
return
}
// BindPodVolumes gets the cached bindings and PVCs to provision in podBindingCache,
// makes the API update for those PVs/PVCs, and waits for the PVCs to be completely bound
// by the PV controller.
func (b *volumeBinder) BindPodVolumes(assumedPod *v1.Pod) (err error) {
podName := getPodName(assumedPod)
klog.V(4).Infof("BindPodVolumes for pod %q, node %q", podName, assumedPod.Spec.NodeName)
start := time.Now()
defer func() {
VolumeSchedulingStageLatency.WithLabelValues("bind").Observe(time.Since(start).Seconds())
if err != nil {
VolumeSchedulingStageFailed.WithLabelValues("bind").Inc()
}
}()
bindings := b.podBindingCache.GetBindings(assumedPod, assumedPod.Spec.NodeName)
claimsToProvision := b.podBindingCache.GetProvisionedPVCs(assumedPod, assumedPod.Spec.NodeName)
// Start API operations
err = b.bindAPIUpdate(podName, bindings, claimsToProvision)
if err != nil {
return err
}
return wait.Poll(time.Second, b.bindTimeout, func() (bool, error) {
b, err := b.checkBindings(assumedPod, bindings, claimsToProvision)
return b, err
})
}
func getPodName(pod *v1.Pod) string {
return pod.Namespace + "/" + pod.Name
}
func getPVCName(pvc *v1.PersistentVolumeClaim) string {
return pvc.Namespace + "/" + pvc.Name
}
// bindAPIUpdate gets the cached bindings and PVCs to provision in podBindingCache
// and makes the API update for those PVs/PVCs.
func (b *volumeBinder) bindAPIUpdate(podName string, bindings []*bindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) error {
if bindings == nil {
return fmt.Errorf("failed to get cached bindings for pod %q", podName)
}
if claimsToProvision == nil {
return fmt.Errorf("failed to get cached claims to provision for pod %q", podName)
}
lastProcessedBinding := 0
lastProcessedProvisioning := 0
defer func() {
// only revert assumed cached updates for volumes we haven't successfully bound
if lastProcessedBinding < len(bindings) {
b.revertAssumedPVs(bindings[lastProcessedBinding:])
}
// only revert assumed cached updates for claims we haven't updated,
if lastProcessedProvisioning < len(claimsToProvision) {
b.revertAssumedPVCs(claimsToProvision[lastProcessedProvisioning:])
}
}()
var (
binding *bindingInfo
i int
claim *v1.PersistentVolumeClaim
)
// Do the actual prebinding. Let the PV controller take care of the rest
// There is no API rollback if the actual binding fails
for _, binding = range bindings {
klog.V(5).Infof("bindAPIUpdate: Pod %q, binding PV %q to PVC %q", podName, binding.pv.Name, binding.pvc.Name)
// TODO: does it hurt if we make an api call and nothing needs to be updated?
claimKey := claimToClaimKey(binding.pvc)
klog.V(2).Infof("claim %q bound to volume %q", claimKey, binding.pv.Name)
if newPV, err := b.kubeClient.CoreV1().PersistentVolumes().Update(binding.pv); err != nil {
klog.V(4).Infof("updating PersistentVolume[%s]: binding to %q failed: %v", binding.pv.Name, claimKey, err)
return err
} else {
klog.V(4).Infof("updating PersistentVolume[%s]: bound to %q", binding.pv.Name, claimKey)
// Save updated object from apiserver for later checking.
binding.pv = newPV
}
lastProcessedBinding++
}
// Update claims objects to trigger volume provisioning. Let the PV controller take care of the rest
// PV controller is expect to signal back by removing related annotations if actual provisioning fails
for i, claim = range claimsToProvision {
klog.V(5).Infof("bindAPIUpdate: Pod %q, PVC %q", podName, getPVCName(claim))
if newClaim, err := b.kubeClient.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(claim); err != nil {
return err
} else {
// Save updated object from apiserver for later checking.
claimsToProvision[i] = newClaim
}
lastProcessedProvisioning++
}
return nil
}
var (
versioner = etcd.APIObjectVersioner{}
)
// checkBindings runs through all the PVCs in the Pod and checks:
// * if the PVC is fully bound
// * if there are any conditions that require binding to fail and be retried
//
// It returns true when all of the Pod's PVCs are fully bound, and error if
// binding (and scheduling) needs to be retried
// Note that it checks on API objects not PV/PVC cache, this is because
// PV/PVC cache can be assumed again in main scheduler loop, we must check
// latest state in API server which are shared with PV controller and
// provisioners
func (b *volumeBinder) checkBindings(pod *v1.Pod, bindings []*bindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) (bool, error) {
podName := getPodName(pod)
if bindings == nil {
return false, fmt.Errorf("failed to get cached bindings for pod %q", podName)
}
if claimsToProvision == nil {
return false, fmt.Errorf("failed to get cached claims to provision for pod %q", podName)
}
node, err := b.nodeInformer.Lister().Get(pod.Spec.NodeName)
if err != nil {
return false, fmt.Errorf("failed to get node %q: %v", pod.Spec.NodeName, err)
}
// Check for any conditions that might require scheduling retry
// When pod is removed from scheduling queue because of deletion or any
// other reasons, binding operation should be cancelled. There is no need
// to check PV/PVC bindings any more.
// We check pod binding cache here which will be cleared when pod is
// removed from scheduling queue.
if b.podBindingCache.GetDecisions(pod) == nil {
return false, fmt.Errorf("pod %q does not exist any more", podName)
}
for _, binding := range bindings {
pv, err := b.pvCache.GetAPIPV(binding.pv.Name)
if err != nil {
return false, fmt.Errorf("failed to check binding: %v", err)
}
pvc, err := b.pvcCache.GetAPIPVC(getPVCName(binding.pvc))
if err != nil {
return false, fmt.Errorf("failed to check binding: %v", err)
}
// Because we updated PV in apiserver, skip if API object is older
// and wait for new API object propagated from apiserver.
if versioner.CompareResourceVersion(binding.pv, pv) > 0 {
return false, nil
}
// Check PV's node affinity (the node might not have the proper label)
if err := volumeutil.CheckNodeAffinity(pv, node.Labels); err != nil {
return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %v", pv.Name, node.Name, err)
}
// Check if pv.ClaimRef got dropped by unbindVolume()
if pv.Spec.ClaimRef == nil || pv.Spec.ClaimRef.UID == "" {
return false, fmt.Errorf("ClaimRef got reset for pv %q", pv.Name)
}
// Check if pvc is fully bound
if !b.isPVCFullyBound(pvc) {
return false, nil
}
}
for _, claim := range claimsToProvision {
pvc, err := b.pvcCache.GetAPIPVC(getPVCName(claim))
if err != nil {
return false, fmt.Errorf("failed to check provisioning pvc: %v", err)
}
// Because we updated PVC in apiserver, skip if API object is older
// and wait for new API object propagated from apiserver.
if versioner.CompareResourceVersion(claim, pvc) > 0 {
return false, nil
}
// Check if selectedNode annotation is still set
if pvc.Annotations == nil {
return false, fmt.Errorf("selectedNode annotation reset for PVC %q", pvc.Name)
}
selectedNode := pvc.Annotations[annSelectedNode]
if selectedNode != pod.Spec.NodeName {
return false, fmt.Errorf("selectedNode annotation value %q not set to scheduled node %q", selectedNode, pod.Spec.NodeName)
}
// If the PVC is bound to a PV, check its node affinity
if pvc.Spec.VolumeName != "" {
pv, err := b.pvCache.GetAPIPV(pvc.Spec.VolumeName)
if err != nil {
if _, ok := err.(*errNotFound); ok {
// We tolerate NotFound error here, because PV is possibly
// not found because of API delay, we can check next time.
// And if PV does not exist because it's deleted, PVC will
// be unbound eventually.
return false, nil
} else {
return false, fmt.Errorf("failed to get pv %q from cache: %v", pvc.Spec.VolumeName, err)
}
}
if err := volumeutil.CheckNodeAffinity(pv, node.Labels); err != nil {
return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %v", pv.Name, node.Name, err)
}
}
// Check if pvc is fully bound
if !b.isPVCFullyBound(pvc) {
return false, nil
}
}
// All pvs and pvcs that we operated on are bound
klog.V(4).Infof("All PVCs for pod %q are bound", podName)
return true, nil
}
func (b *volumeBinder) isVolumeBound(namespace string, vol *v1.Volume) (bool, *v1.PersistentVolumeClaim, error) {
if vol.PersistentVolumeClaim == nil {
return true, nil, nil
}
pvcName := vol.PersistentVolumeClaim.ClaimName
return b.isPVCBound(namespace, pvcName)
}
func (b *volumeBinder) isPVCBound(namespace, pvcName string) (bool, *v1.PersistentVolumeClaim, error) {
claim := &v1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: pvcName,
Namespace: namespace,
},
}
pvcKey := getPVCName(claim)
pvc, err := b.pvcCache.GetPVC(pvcKey)
if err != nil || pvc == nil {
return false, nil, fmt.Errorf("error getting PVC %q: %v", pvcKey, err)
}
fullyBound := b.isPVCFullyBound(pvc)
if fullyBound {
klog.V(5).Infof("PVC %q is fully bound to PV %q", pvcKey, pvc.Spec.VolumeName)
} else {
if pvc.Spec.VolumeName != "" {
klog.V(5).Infof("PVC %q is not fully bound to PV %q", pvcKey, pvc.Spec.VolumeName)
} else {
klog.V(5).Infof("PVC %q is not bound", pvcKey)
}
}
return fullyBound, pvc, nil
}
func (b *volumeBinder) isPVCFullyBound(pvc *v1.PersistentVolumeClaim) bool {
return pvc.Spec.VolumeName != "" && metav1.HasAnnotation(pvc.ObjectMeta, annBindCompleted)
}
// arePodVolumesBound returns true if all volumes are fully bound
func (b *volumeBinder) arePodVolumesBound(pod *v1.Pod) bool {
for _, vol := range pod.Spec.Volumes {
if isBound, _, _ := b.isVolumeBound(pod.Namespace, &vol); !isBound {
// Pod has at least one PVC that needs binding
return false
}
}
return true
}
// getPodVolumes returns a pod's PVCs separated into bound, unbound with delayed binding (including provisioning)
// and unbound with immediate binding (including prebound)
func (b *volumeBinder) getPodVolumes(pod *v1.Pod) (boundClaims []*v1.PersistentVolumeClaim, unboundClaims []*v1.PersistentVolumeClaim, unboundClaimsImmediate []*v1.PersistentVolumeClaim, err error) {
boundClaims = []*v1.PersistentVolumeClaim{}
unboundClaimsImmediate = []*v1.PersistentVolumeClaim{}
unboundClaims = []*v1.PersistentVolumeClaim{}
for _, vol := range pod.Spec.Volumes {
volumeBound, pvc, err := b.isVolumeBound(pod.Namespace, &vol)
if err != nil {
return nil, nil, nil, err
}
if pvc == nil {
continue
}
if volumeBound {
boundClaims = append(boundClaims, pvc)
} else {
delayBindingMode, err := IsDelayBindingMode(pvc, b.classLister)
if err != nil {
return nil, nil, nil, err
}
// Prebound PVCs are treated as unbound immediate binding
if delayBindingMode && pvc.Spec.VolumeName == "" {
// Scheduler path
unboundClaims = append(unboundClaims, pvc)
} else {
// !delayBindingMode || pvc.Spec.VolumeName != ""
// Immediate binding should have already been bound
unboundClaimsImmediate = append(unboundClaimsImmediate, pvc)
}
}
}
return boundClaims, unboundClaims, unboundClaimsImmediate, nil
}
func (b *volumeBinder) checkBoundClaims(claims []*v1.PersistentVolumeClaim, node *v1.Node, podName string) (bool, error) {
for _, pvc := range claims {
pvName := pvc.Spec.VolumeName
pv, err := b.pvCache.GetPV(pvName)
if err != nil {
return false, err
}
err = volumeutil.CheckNodeAffinity(pv, node.Labels)
if err != nil {
klog.V(4).Infof("PersistentVolume %q, Node %q mismatch for Pod %q: %v", pvName, node.Name, podName, err)
return false, nil
}
klog.V(5).Infof("PersistentVolume %q, Node %q matches for Pod %q", pvName, node.Name, podName)
}
klog.V(4).Infof("All bound volumes for Pod %q match with Node %q", podName, node.Name)
return true, nil
}
// findMatchingVolumes tries to find matching volumes for given claims,
// and return unbound claims for further provision.
func (b *volumeBinder) findMatchingVolumes(pod *v1.Pod, claimsToBind []*v1.PersistentVolumeClaim, node *v1.Node) (foundMatches bool, matchedClaims []*bindingInfo, unboundClaims []*v1.PersistentVolumeClaim, err error) {
podName := getPodName(pod)
// Sort all the claims by increasing size request to get the smallest fits
sort.Sort(byPVCSize(claimsToBind))
chosenPVs := map[string]*v1.PersistentVolume{}
foundMatches = true
matchedClaims = []*bindingInfo{}
for _, pvc := range claimsToBind {
// Get storage class name from each PVC
storageClassName := ""
storageClass := pvc.Spec.StorageClassName
if storageClass != nil {
storageClassName = *storageClass
}
allPVs := b.pvCache.ListPVs(storageClassName)
pvcName := getPVCName(pvc)
// Find a matching PV
pv, err := findMatchingVolume(pvc, allPVs, node, chosenPVs, true)
if err != nil {
return false, nil, nil, err
}
if pv == nil {
klog.V(4).Infof("No matching volumes for Pod %q, PVC %q on node %q", podName, pvcName, node.Name)
unboundClaims = append(unboundClaims, pvc)
foundMatches = false
continue
}
// matching PV needs to be excluded so we don't select it again
chosenPVs[pv.Name] = pv
matchedClaims = append(matchedClaims, &bindingInfo{pv: pv, pvc: pvc})
klog.V(5).Infof("Found matching PV %q for PVC %q on node %q for pod %q", pv.Name, pvcName, node.Name, podName)
}
if foundMatches {
klog.V(4).Infof("Found matching volumes for pod %q on node %q", podName, node.Name)
}
return
}
// checkVolumeProvisions checks given unbound claims (the claims have gone through func
// findMatchingVolumes, and do not have matching volumes for binding), and return true
// if all of the claims are eligible for dynamic provision.
func (b *volumeBinder) checkVolumeProvisions(pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied bool, provisionedClaims []*v1.PersistentVolumeClaim, err error) {
podName := getPodName(pod)
provisionedClaims = []*v1.PersistentVolumeClaim{}
for _, claim := range claimsToProvision {
pvcName := getPVCName(claim)
className := v1helper.GetPersistentVolumeClaimClass(claim)
if className == "" {
return false, nil, fmt.Errorf("no class for claim %q", pvcName)
}
class, err := b.classLister.Get(className)
if err != nil {
return false, nil, fmt.Errorf("failed to find storage class %q", className)
}
provisioner := class.Provisioner
if provisioner == "" || provisioner == notSupportedProvisioner {
klog.V(4).Infof("storage class %q of claim %q does not support dynamic provisioning", className, pvcName)
return false, nil, nil
}
// Check if the node can satisfy the topology requirement in the class
if !v1helper.MatchTopologySelectorTerms(class.AllowedTopologies, labels.Set(node.Labels)) {
klog.V(4).Infof("Node %q cannot satisfy provisioning topology requirements of claim %q", node.Name, pvcName)
return false, nil, nil
}
// TODO: Check if capacity of the node domain in the storage class
// can satisfy resource requirement of given claim
provisionedClaims = append(provisionedClaims, claim)
}
klog.V(4).Infof("Provisioning for claims of pod %q that has no matching volumes on node %q ...", podName, node.Name)
return true, provisionedClaims, nil
}
func (b *volumeBinder) revertAssumedPVs(bindings []*bindingInfo) {
for _, bindingInfo := range bindings {
b.pvCache.Restore(bindingInfo.pv.Name)
}
}
func (b *volumeBinder) revertAssumedPVCs(claims []*v1.PersistentVolumeClaim) {
for _, claim := range claims {
b.pvcCache.Restore(getPVCName(claim))
}
}
type bindingInfo struct {
// Claim that needs to be bound
pvc *v1.PersistentVolumeClaim
// Proposed PV to bind to this claim
pv *v1.PersistentVolume
}
type byPVCSize []*v1.PersistentVolumeClaim
func (a byPVCSize) Len() int {
return len(a)
}
func (a byPVCSize) Swap(i, j int) {
a[i], a[j] = a[j], a[i]
}
func (a byPVCSize) Less(i, j int) bool {
iSize := a[i].Spec.Resources.Requests[v1.ResourceStorage]
jSize := a[j].Spec.Resources.Requests[v1.ResourceStorage]
// return true if iSize is less than jSize
return iSize.Cmp(jSize) == -1
}

View File

@ -0,0 +1,165 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"sync"
"k8s.io/api/core/v1"
)
// podBindingCache stores PV binding decisions per pod per node.
// Pod entries are removed when the Pod is deleted or updated to
// no longer be schedulable.
type PodBindingCache interface {
// UpdateBindings will update the cache with the given bindings for the
// pod and node.
UpdateBindings(pod *v1.Pod, node string, bindings []*bindingInfo, provisionings []*v1.PersistentVolumeClaim)
// ClearBindings will clear the cached bindings for the given pod and node.
ClearBindings(pod *v1.Pod, node string)
// GetBindings will return the cached bindings for the given pod and node.
// A nil return value means that the entry was not found. An empty slice
// means that no binding operations are needed.
GetBindings(pod *v1.Pod, node string) []*bindingInfo
// A nil return value means that the entry was not found. An empty slice
// means that no provisioning operations are needed.
GetProvisionedPVCs(pod *v1.Pod, node string) []*v1.PersistentVolumeClaim
// GetDecisions will return all cached decisions for the given pod.
GetDecisions(pod *v1.Pod) nodeDecisions
// DeleteBindings will remove all cached bindings and provisionings for the given pod.
// TODO: separate the func if it is needed to delete bindings/provisionings individually
DeleteBindings(pod *v1.Pod)
}
type podBindingCache struct {
// synchronizes bindingDecisions
rwMutex sync.RWMutex
// Key = pod name
// Value = nodeDecisions
bindingDecisions map[string]nodeDecisions
}
// Key = nodeName
// Value = bindings & provisioned PVCs of the node
type nodeDecisions map[string]nodeDecision
// A decision includes bindingInfo and provisioned PVCs of the node
type nodeDecision struct {
bindings []*bindingInfo
provisionings []*v1.PersistentVolumeClaim
}
func NewPodBindingCache() PodBindingCache {
return &podBindingCache{bindingDecisions: map[string]nodeDecisions{}}
}
func (c *podBindingCache) GetDecisions(pod *v1.Pod) nodeDecisions {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
return nil
}
return decisions
}
func (c *podBindingCache) DeleteBindings(pod *v1.Pod) {
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
podName := getPodName(pod)
if _, ok := c.bindingDecisions[podName]; ok {
delete(c.bindingDecisions, podName)
VolumeBindingRequestSchedulerBinderCache.WithLabelValues("delete").Inc()
}
}
func (c *podBindingCache) UpdateBindings(pod *v1.Pod, node string, bindings []*bindingInfo, pvcs []*v1.PersistentVolumeClaim) {
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
decisions = nodeDecisions{}
c.bindingDecisions[podName] = decisions
}
decision, ok := decisions[node]
if !ok {
decision = nodeDecision{
bindings: bindings,
provisionings: pvcs,
}
VolumeBindingRequestSchedulerBinderCache.WithLabelValues("add").Inc()
} else {
decision.bindings = bindings
decision.provisionings = pvcs
}
decisions[node] = decision
}
func (c *podBindingCache) GetBindings(pod *v1.Pod, node string) []*bindingInfo {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
return nil
}
decision, ok := decisions[node]
if !ok {
return nil
}
return decision.bindings
}
func (c *podBindingCache) GetProvisionedPVCs(pod *v1.Pod, node string) []*v1.PersistentVolumeClaim {
c.rwMutex.RLock()
defer c.rwMutex.RUnlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
return nil
}
decision, ok := decisions[node]
if !ok {
return nil
}
return decision.provisionings
}
func (c *podBindingCache) ClearBindings(pod *v1.Pod, node string) {
c.rwMutex.Lock()
defer c.rwMutex.Unlock()
podName := getPodName(pod)
decisions, ok := c.bindingDecisions[podName]
if !ok {
return
}
delete(decisions, node)
}

View File

@ -0,0 +1,60 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import "k8s.io/api/core/v1"
type FakeVolumeBinderConfig struct {
AllBound bool
FindUnboundSatsified bool
FindBoundSatsified bool
FindErr error
AssumeErr error
BindErr error
}
// NewVolumeBinder sets up all the caches needed for the scheduler to make
// topology-aware volume binding decisions.
func NewFakeVolumeBinder(config *FakeVolumeBinderConfig) *FakeVolumeBinder {
return &FakeVolumeBinder{
config: config,
}
}
type FakeVolumeBinder struct {
config *FakeVolumeBinderConfig
AssumeCalled bool
BindCalled bool
}
func (b *FakeVolumeBinder) FindPodVolumes(pod *v1.Pod, node *v1.Node) (unboundVolumesSatisfied, boundVolumesSatsified bool, err error) {
return b.config.FindUnboundSatsified, b.config.FindBoundSatsified, b.config.FindErr
}
func (b *FakeVolumeBinder) AssumePodVolumes(assumedPod *v1.Pod, nodeName string) (bool, error) {
b.AssumeCalled = true
return b.config.AllBound, b.config.AssumeErr
}
func (b *FakeVolumeBinder) BindPodVolumes(assumedPod *v1.Pod) error {
b.BindCalled = true
return b.config.BindErr
}
func (b *FakeVolumeBinder) GetBindingsCache() PodBindingCache {
return nil
}

View File

@ -0,0 +1,103 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"fmt"
"k8s.io/api/core/v1"
storage "k8s.io/api/storage/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/scheme"
storagelisters "k8s.io/client-go/listers/storage/v1"
"k8s.io/client-go/tools/reference"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
)
// IsDelayBindingMode checks if claim is in delay binding mode.
func IsDelayBindingMode(claim *v1.PersistentVolumeClaim, classLister storagelisters.StorageClassLister) (bool, error) {
className := v1helper.GetPersistentVolumeClaimClass(claim)
if className == "" {
return false, nil
}
class, err := classLister.Get(className)
if err != nil {
return false, nil
}
if class.VolumeBindingMode == nil {
return false, fmt.Errorf("VolumeBindingMode not set for StorageClass %q", className)
}
return *class.VolumeBindingMode == storage.VolumeBindingWaitForFirstConsumer, nil
}
// GetBindVolumeToClaim returns a new volume which is bound to given claim. In
// addition, it returns a bool which indicates whether we made modification on
// original volume.
func GetBindVolumeToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) (*v1.PersistentVolume, bool, error) {
dirty := false
// Check if the volume was already bound (either by user or by controller)
shouldSetBoundByController := false
if !IsVolumeBoundToClaim(volume, claim) {
shouldSetBoundByController = true
}
// The volume from method args can be pointing to watcher cache. We must not
// modify these, therefore create a copy.
volumeClone := volume.DeepCopy()
// Bind the volume to the claim if it is not bound yet
if volume.Spec.ClaimRef == nil ||
volume.Spec.ClaimRef.Name != claim.Name ||
volume.Spec.ClaimRef.Namespace != claim.Namespace ||
volume.Spec.ClaimRef.UID != claim.UID {
claimRef, err := reference.GetReference(scheme.Scheme, claim)
if err != nil {
return nil, false, fmt.Errorf("Unexpected error getting claim reference: %v", err)
}
volumeClone.Spec.ClaimRef = claimRef
dirty = true
}
// Set annBoundByController if it is not set yet
if shouldSetBoundByController && !metav1.HasAnnotation(volumeClone.ObjectMeta, annBoundByController) {
metav1.SetMetaDataAnnotation(&volumeClone.ObjectMeta, annBoundByController, "yes")
dirty = true
}
return volumeClone, dirty, nil
}
// IsVolumeBoundToClaim returns true, if given volume is pre-bound or bound
// to specific claim. Both claim.Name and claim.Namespace must be equal.
// If claim.UID is present in volume.Spec.ClaimRef, it must be equal too.
func IsVolumeBoundToClaim(volume *v1.PersistentVolume, claim *v1.PersistentVolumeClaim) bool {
if volume.Spec.ClaimRef == nil {
return false
}
if claim.Name != volume.Spec.ClaimRef.Name || claim.Namespace != volume.Spec.ClaimRef.Namespace {
return false
}
if volume.Spec.ClaimRef.UID != "" && claim.UID != volume.Spec.ClaimRef.UID {
return false
}
return true
}

View File

@ -0,0 +1,138 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package persistentvolume
import (
"fmt"
"net"
authenticationv1 "k8s.io/api/authentication/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
cloudprovider "k8s.io/cloud-provider"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/util/mount"
vol "k8s.io/kubernetes/pkg/volume"
"k8s.io/kubernetes/pkg/volume/util/subpath"
)
// VolumeHost interface implementation for PersistentVolumeController.
var _ vol.VolumeHost = &PersistentVolumeController{}
func (ctrl *PersistentVolumeController) GetPluginDir(pluginName string) string {
return ""
}
func (ctrl *PersistentVolumeController) GetVolumeDevicePluginDir(pluginName string) string {
return ""
}
func (ctrl *PersistentVolumeController) GetPodsDir() string {
return ""
}
func (ctrl *PersistentVolumeController) GetPodVolumeDir(podUID types.UID, pluginName string, volumeName string) string {
return ""
}
func (ctrl *PersistentVolumeController) GetPodPluginDir(podUID types.UID, pluginName string) string {
return ""
}
func (ctrl *PersistentVolumeController) GetPodVolumeDeviceDir(ppodUID types.UID, pluginName string) string {
return ""
}
func (ctrl *PersistentVolumeController) GetKubeClient() clientset.Interface {
return ctrl.kubeClient
}
func (ctrl *PersistentVolumeController) NewWrapperMounter(volName string, spec vol.Spec, pod *v1.Pod, opts vol.VolumeOptions) (vol.Mounter, error) {
return nil, fmt.Errorf("PersistentVolumeController.NewWrapperMounter is not implemented")
}
func (ctrl *PersistentVolumeController) NewWrapperUnmounter(volName string, spec vol.Spec, podUID types.UID) (vol.Unmounter, error) {
return nil, fmt.Errorf("PersistentVolumeController.NewWrapperMounter is not implemented")
}
func (ctrl *PersistentVolumeController) GetCloudProvider() cloudprovider.Interface {
return ctrl.cloud
}
func (ctrl *PersistentVolumeController) GetMounter(pluginName string) mount.Interface {
return nil
}
func (ctrl *PersistentVolumeController) GetHostName() string {
return ""
}
func (ctrl *PersistentVolumeController) GetHostIP() (net.IP, error) {
return nil, fmt.Errorf("PersistentVolumeController.GetHostIP() is not implemented")
}
func (ctrl *PersistentVolumeController) GetNodeAllocatable() (v1.ResourceList, error) {
return v1.ResourceList{}, nil
}
func (ctrl *PersistentVolumeController) GetSecretFunc() func(namespace, name string) (*v1.Secret, error) {
return func(_, _ string) (*v1.Secret, error) {
return nil, fmt.Errorf("GetSecret unsupported in PersistentVolumeController")
}
}
func (ctrl *PersistentVolumeController) GetConfigMapFunc() func(namespace, name string) (*v1.ConfigMap, error) {
return func(_, _ string) (*v1.ConfigMap, error) {
return nil, fmt.Errorf("GetConfigMap unsupported in PersistentVolumeController")
}
}
func (ctrl *PersistentVolumeController) GetServiceAccountTokenFunc() func(_, _ string, _ *authenticationv1.TokenRequest) (*authenticationv1.TokenRequest, error) {
return func(_, _ string, _ *authenticationv1.TokenRequest) (*authenticationv1.TokenRequest, error) {
return nil, fmt.Errorf("GetServiceAccountToken unsupported in PersistentVolumeController")
}
}
func (ctrl *PersistentVolumeController) DeleteServiceAccountTokenFunc() func(types.UID) {
return func(types.UID) {
klog.Errorf("DeleteServiceAccountToken unsupported in PersistentVolumeController")
}
}
func (adc *PersistentVolumeController) GetExec(pluginName string) mount.Exec {
return mount.NewOsExec()
}
func (ctrl *PersistentVolumeController) GetNodeLabels() (map[string]string, error) {
return nil, fmt.Errorf("GetNodeLabels() unsupported in PersistentVolumeController")
}
func (ctrl *PersistentVolumeController) GetNodeName() types.NodeName {
return ""
}
func (ctrl *PersistentVolumeController) GetEventRecorder() record.EventRecorder {
return ctrl.eventRecorder
}
func (ctrl *PersistentVolumeController) GetSubpather() subpath.Interface {
// No volume plugin needs Subpaths in PV controller.
return nil
}