rebase: bump k8s.io/kubernetes from 1.26.2 to 1.27.2

Bumps [k8s.io/kubernetes](https://github.com/kubernetes/kubernetes) from 1.26.2 to 1.27.2.
- [Release notes](https://github.com/kubernetes/kubernetes/releases)
- [Commits](https://github.com/kubernetes/kubernetes/compare/v1.26.2...v1.27.2)

---
updated-dependencies:
- dependency-name: k8s.io/kubernetes
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
dependabot[bot]
2023-05-29 21:03:29 +00:00
committed by mergify[bot]
parent 0e79135419
commit 07b05616a0
1072 changed files with 208716 additions and 198880 deletions

21
vendor/k8s.io/apiserver/pkg/storage/OWNERS generated vendored Normal file
@@ -0,0 +1,21 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- lavalamp
- liggitt
- wojtek-t
reviewers:
- lavalamp
- smarterclayton
- wojtek-t
- deads2k
- caesarxuchao
- mikedanese
- liggitt
- ncdc
- ingvagabund
- enj
- stevekuznetsov
emeritus_approvers:
- xiang90
- timothysc

130
vendor/k8s.io/apiserver/pkg/storage/api_object_versioner.go generated vendored Normal file
@@ -0,0 +1,130 @@
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package storage
import (
"fmt"
"strconv"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/validation/field"
)
// APIObjectVersioner implements versioning and extracting etcd node information
// for objects that have an embedded ObjectMeta or ListMeta field.
type APIObjectVersioner struct{}
// UpdateObject implements Versioner
func (a APIObjectVersioner) UpdateObject(obj runtime.Object, resourceVersion uint64) error {
accessor, err := meta.Accessor(obj)
if err != nil {
return err
}
versionString := ""
if resourceVersion != 0 {
versionString = strconv.FormatUint(resourceVersion, 10)
}
accessor.SetResourceVersion(versionString)
return nil
}
// UpdateList implements Versioner
func (a APIObjectVersioner) UpdateList(obj runtime.Object, resourceVersion uint64, nextKey string, count *int64) error {
if resourceVersion == 0 {
return fmt.Errorf("illegal resource version from storage: %d", resourceVersion)
}
listAccessor, err := meta.ListAccessor(obj)
if err != nil || listAccessor == nil {
return err
}
versionString := strconv.FormatUint(resourceVersion, 10)
listAccessor.SetResourceVersion(versionString)
listAccessor.SetContinue(nextKey)
listAccessor.SetRemainingItemCount(count)
return nil
}
// PrepareObjectForStorage clears resourceVersion and selfLink prior to writing to etcd.
func (a APIObjectVersioner) PrepareObjectForStorage(obj runtime.Object) error {
accessor, err := meta.Accessor(obj)
if err != nil {
return err
}
accessor.SetResourceVersion("")
accessor.SetSelfLink("")
return nil
}
// ObjectResourceVersion implements Versioner
func (a APIObjectVersioner) ObjectResourceVersion(obj runtime.Object) (uint64, error) {
accessor, err := meta.Accessor(obj)
if err != nil {
return 0, err
}
version := accessor.GetResourceVersion()
if len(version) == 0 {
return 0, nil
}
return strconv.ParseUint(version, 10, 64)
}
// ParseResourceVersion takes a resource version argument and converts it to
// the etcd version we should pass to helper.Watch(). Because resourceVersion is
// an opaque value, the default watch behavior for non-zero watch is to watch
// the next value (if you pass "1", you will see updates from "2" onwards).
func (a APIObjectVersioner) ParseResourceVersion(resourceVersion string) (uint64, error) {
if resourceVersion == "" || resourceVersion == "0" {
return 0, nil
}
version, err := strconv.ParseUint(resourceVersion, 10, 64)
if err != nil {
return 0, NewInvalidError(field.ErrorList{
// Validation errors are supposed to return version-specific field
// paths, but this is probably close enough.
field.Invalid(field.NewPath("resourceVersion"), resourceVersion, err.Error()),
})
}
return version, nil
}
// Versioner implements Versioner
var _ Versioner = APIObjectVersioner{}
// CompareResourceVersion compares etcd resource versions. Outside this API they are all strings,
// but etcd resource versions are special, they're actually ints, so we can easily compare them.
func (a APIObjectVersioner) CompareResourceVersion(lhs, rhs runtime.Object) int {
lhsVersion, err := a.ObjectResourceVersion(lhs)
if err != nil {
// coder error
panic(err)
}
rhsVersion, err := a.ObjectResourceVersion(rhs)
if err != nil {
// coder error
panic(err)
}
if lhsVersion == rhsVersion {
return 0
}
if lhsVersion < rhsVersion {
return -1
}
return 1
}
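
As an aside (not part of the vendored diff): a minimal sketch of how a client of the storage package typically exercises this Versioner. The Pod type and the literal revision 42 are illustrative assumptions, not taken from the change above.

package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apiserver/pkg/storage"
)

func main() {
    versioner := storage.APIObjectVersioner{}
    pod := &corev1.Pod{}

    // Stamp the object with etcd revision 42; its resourceVersion becomes "42".
    if err := versioner.UpdateObject(pod, 42); err != nil {
        panic(err)
    }
    fmt.Println(pod.ResourceVersion) // "42"

    // Read the version back as a uint64.
    rv, _ := versioner.ObjectResourceVersion(pod)
    fmt.Println(rv) // 42

    // "" and "0" both parse to 0, i.e. "no particular starting point".
    zero, _ := versioner.ParseResourceVersion("0")
    fmt.Println(zero) // 0
}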

1519
vendor/k8s.io/apiserver/pkg/storage/cacher/cacher.go generated vendored Normal file

File diff suppressed because it is too large

408
vendor/k8s.io/apiserver/pkg/storage/cacher/caching_object.go generated vendored Normal file
@@ -0,0 +1,408 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"bytes"
"fmt"
"io"
"reflect"
"runtime/debug"
"sync"
"sync/atomic"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
)
var _ runtime.CacheableObject = &cachingObject{}
// metaRuntimeInterface implements runtime.Object and
// metav1.Object interfaces.
type metaRuntimeInterface interface {
runtime.Object
metav1.Object
}
// serializationResult captures a result of serialization.
type serializationResult struct {
// once should be used to ensure serialization is computed once.
once sync.Once
// raw is serialized object.
raw []byte
// err is error from serialization.
err error
}
// serializationsCache is a type for caching serialization results.
type serializationsCache map[runtime.Identifier]*serializationResult
// cachingObject is an object that is able to cache its serializations
// so that each of those is computed exactly once.
//
// cachingObject implements the metav1.Object interface (accessors for
// all metadata fields).
type cachingObject struct {
lock sync.RWMutex
// deepCopied defines whether the object below has already been
// deep copied. The operation is performed lazily on the first
// setXxx operation.
//
// The lazy deep-copy strategy is useful, as effectively the only
// case when we are setting some fields is ResourceVersion for
// DELETE events, so in all other cases we can effectively avoid
// performing any deep copies.
deepCopied bool
// Object for which serializations are cached.
object metaRuntimeInterface
// serializations is a cache containing the object's serializations.
// The value stored in atomic.Value is of type serializationsCache.
// The atomic.Value type is used to allow fast-path.
serializations atomic.Value
}
// newCachingObject performs a deep copy of the given object and wraps it
// into a cachingObject.
// An error is returned if it's not possible to cast the object to
// metav1.Object type.
func newCachingObject(object runtime.Object) (*cachingObject, error) {
if obj, ok := object.(metaRuntimeInterface); ok {
result := &cachingObject{
object: obj,
deepCopied: false,
}
result.serializations.Store(make(serializationsCache))
return result, nil
}
return nil, fmt.Errorf("can't cast object to metav1.Object: %#v", object)
}
func (o *cachingObject) getSerializationResult(id runtime.Identifier) *serializationResult {
// Fast-path for getting from cache.
serializations := o.serializations.Load().(serializationsCache)
if result, exists := serializations[id]; exists {
return result
}
// Slow-path (that may require insert).
o.lock.Lock()
defer o.lock.Unlock()
serializations = o.serializations.Load().(serializationsCache)
// Check if in the meantime it wasn't inserted.
if result, exists := serializations[id]; exists {
return result
}
// Insert an entry for <id>. This requires copy of existing map.
newSerializations := make(serializationsCache)
for k, v := range serializations {
newSerializations[k] = v
}
result := &serializationResult{}
newSerializations[id] = result
o.serializations.Store(newSerializations)
return result
}
// CacheEncode implements runtime.CacheableObject interface.
// It serializes the object and writes the result to the given io.Writer, trying
// first to use the already cached result and falling back to the given encode
// function in case of a cache miss.
// It assumes that for a given identifier, the encode function always encodes
// each input object into the same output format.
func (o *cachingObject) CacheEncode(id runtime.Identifier, encode func(runtime.Object, io.Writer) error, w io.Writer) error {
result := o.getSerializationResult(id)
result.once.Do(func() {
buffer := bytes.NewBuffer(nil)
// TODO(wojtek-t): This is currently making a copy to avoid races
// in cases where encoding is making subtle object modifications,
// e.g. #82497
// Figure out if we can somehow avoid this under some conditions.
result.err = encode(o.GetObject(), buffer)
result.raw = buffer.Bytes()
})
// Once invoked, fields of serialization will not change.
if result.err != nil {
return result.err
}
_, err := w.Write(result.raw)
return err
}
// GetObject implements runtime.CacheableObject interface.
// It returns deep-copy of the wrapped object to return ownership of it
// to the caller according to the contract of the interface.
func (o *cachingObject) GetObject() runtime.Object {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.DeepCopyObject().(metaRuntimeInterface)
}
// GetObjectKind implements runtime.Object interface.
func (o *cachingObject) GetObjectKind() schema.ObjectKind {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetObjectKind()
}
// DeepCopyObject implements runtime.Object interface.
func (o *cachingObject) DeepCopyObject() runtime.Object {
// DeepCopyObject on cachingObject is not expected to be called anywhere.
// However, to be on the safe-side, we implement it, though given the
// cache is only an optimization we ignore copying it.
result := &cachingObject{
deepCopied: true,
}
result.serializations.Store(make(serializationsCache))
o.lock.RLock()
defer o.lock.RUnlock()
result.object = o.object.DeepCopyObject().(metaRuntimeInterface)
return result
}
var (
invalidationCacheTimestampLock sync.Mutex
invalidationCacheTimestamp time.Time
)
// shouldLogCacheInvalidation allows for logging cache-invalidation
// at most once per second (to avoid spamming logs in case of issues).
func shouldLogCacheInvalidation(now time.Time) bool {
invalidationCacheTimestampLock.Lock()
defer invalidationCacheTimestampLock.Unlock()
if invalidationCacheTimestamp.Add(time.Second).Before(now) {
invalidationCacheTimestamp = now
return true
}
return false
}
func (o *cachingObject) invalidateCacheLocked() {
if cache, ok := o.serializations.Load().(serializationsCache); ok && len(cache) == 0 {
return
}
// We don't expect cache invalidation to happen - so we want
// to log the stacktrace to allow debugging if that will happen.
// OTOH, we don't want to spam logs with it.
// So we try to log it at most once per second.
if shouldLogCacheInvalidation(time.Now()) {
klog.Warningf("Unexpected cache invalidation for %#v\n%s", o.object, string(debug.Stack()))
}
o.serializations.Store(make(serializationsCache))
}
// The following functions implement metav1.Object interface:
// - getters simply delegate to the underlying object
// - setters check if the operation isn't a noop and, if so,
// invalidate the cache and delegate to the underlying object
func (o *cachingObject) conditionalSet(isNoop func() bool, set func()) {
if fastPath := func() bool {
o.lock.RLock()
defer o.lock.RUnlock()
return isNoop()
}(); fastPath {
return
}
o.lock.Lock()
defer o.lock.Unlock()
if isNoop() {
return
}
if !o.deepCopied {
o.object = o.object.DeepCopyObject().(metaRuntimeInterface)
o.deepCopied = true
}
o.invalidateCacheLocked()
set()
}
func (o *cachingObject) GetNamespace() string {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetNamespace()
}
func (o *cachingObject) SetNamespace(namespace string) {
o.conditionalSet(
func() bool { return o.object.GetNamespace() == namespace },
func() { o.object.SetNamespace(namespace) },
)
}
func (o *cachingObject) GetName() string {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetName()
}
func (o *cachingObject) SetName(name string) {
o.conditionalSet(
func() bool { return o.object.GetName() == name },
func() { o.object.SetName(name) },
)
}
func (o *cachingObject) GetGenerateName() string {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetGenerateName()
}
func (o *cachingObject) SetGenerateName(name string) {
o.conditionalSet(
func() bool { return o.object.GetGenerateName() == name },
func() { o.object.SetGenerateName(name) },
)
}
func (o *cachingObject) GetUID() types.UID {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetUID()
}
func (o *cachingObject) SetUID(uid types.UID) {
o.conditionalSet(
func() bool { return o.object.GetUID() == uid },
func() { o.object.SetUID(uid) },
)
}
func (o *cachingObject) GetResourceVersion() string {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetResourceVersion()
}
func (o *cachingObject) SetResourceVersion(version string) {
o.conditionalSet(
func() bool { return o.object.GetResourceVersion() == version },
func() { o.object.SetResourceVersion(version) },
)
}
func (o *cachingObject) GetGeneration() int64 {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetGeneration()
}
func (o *cachingObject) SetGeneration(generation int64) {
o.conditionalSet(
func() bool { return o.object.GetGeneration() == generation },
func() { o.object.SetGeneration(generation) },
)
}
func (o *cachingObject) GetSelfLink() string {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetSelfLink()
}
func (o *cachingObject) SetSelfLink(selfLink string) {
o.conditionalSet(
func() bool { return o.object.GetSelfLink() == selfLink },
func() { o.object.SetSelfLink(selfLink) },
)
}
func (o *cachingObject) GetCreationTimestamp() metav1.Time {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetCreationTimestamp()
}
func (o *cachingObject) SetCreationTimestamp(timestamp metav1.Time) {
o.conditionalSet(
func() bool { return o.object.GetCreationTimestamp() == timestamp },
func() { o.object.SetCreationTimestamp(timestamp) },
)
}
func (o *cachingObject) GetDeletionTimestamp() *metav1.Time {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetDeletionTimestamp()
}
func (o *cachingObject) SetDeletionTimestamp(timestamp *metav1.Time) {
o.conditionalSet(
func() bool { return o.object.GetDeletionTimestamp() == timestamp },
func() { o.object.SetDeletionTimestamp(timestamp) },
)
}
func (o *cachingObject) GetDeletionGracePeriodSeconds() *int64 {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetDeletionGracePeriodSeconds()
}
func (o *cachingObject) SetDeletionGracePeriodSeconds(gracePeriodSeconds *int64) {
o.conditionalSet(
func() bool { return o.object.GetDeletionGracePeriodSeconds() == gracePeriodSeconds },
func() { o.object.SetDeletionGracePeriodSeconds(gracePeriodSeconds) },
)
}
func (o *cachingObject) GetLabels() map[string]string {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetLabels()
}
func (o *cachingObject) SetLabels(labels map[string]string) {
o.conditionalSet(
func() bool { return reflect.DeepEqual(o.object.GetLabels(), labels) },
func() { o.object.SetLabels(labels) },
)
}
func (o *cachingObject) GetAnnotations() map[string]string {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetAnnotations()
}
func (o *cachingObject) SetAnnotations(annotations map[string]string) {
o.conditionalSet(
func() bool { return reflect.DeepEqual(o.object.GetAnnotations(), annotations) },
func() { o.object.SetAnnotations(annotations) },
)
}
func (o *cachingObject) GetFinalizers() []string {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetFinalizers()
}
func (o *cachingObject) SetFinalizers(finalizers []string) {
o.conditionalSet(
func() bool { return reflect.DeepEqual(o.object.GetFinalizers(), finalizers) },
func() { o.object.SetFinalizers(finalizers) },
)
}
func (o *cachingObject) GetOwnerReferences() []metav1.OwnerReference {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetOwnerReferences()
}
func (o *cachingObject) SetOwnerReferences(references []metav1.OwnerReference) {
o.conditionalSet(
func() bool { return reflect.DeepEqual(o.object.GetOwnerReferences(), references) },
func() { o.object.SetOwnerReferences(references) },
)
}
func (o *cachingObject) GetManagedFields() []metav1.ManagedFieldsEntry {
o.lock.RLock()
defer o.lock.RUnlock()
return o.object.GetManagedFields()
}
func (o *cachingObject) SetManagedFields(managedFields []metav1.ManagedFieldsEntry) {
o.conditionalSet(
func() bool { return reflect.DeepEqual(o.object.GetManagedFields(), managedFields) },
func() { o.object.SetManagedFields(managedFields) },
)
}
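
As an aside (not part of the vendored diff): the caching idea above boils down to a copy-on-write map stored in an atomic.Value (lock-free readers, mutex-guarded writers) plus one sync.Once per identifier, so each serialization is computed at most once. A self-contained sketch of that pattern under hypothetical names (result, cache), not the vendored types:

package main

import (
    "bytes"
    "fmt"
    "sync"
    "sync/atomic"
)

// result memoizes one serialization, like serializationResult above.
type result struct {
    once sync.Once
    raw  []byte
    err  error
}

type cache struct {
    mu    sync.Mutex
    store atomic.Value // holds map[string]*result
}

func newCache() *cache {
    c := &cache{}
    c.store.Store(map[string]*result{})
    return c
}

// get returns the (possibly still empty) result for id, inserting it with a
// copy-on-write of the map when needed, mirroring getSerializationResult.
func (c *cache) get(id string) *result {
    if r, ok := c.store.Load().(map[string]*result)[id]; ok {
        return r // fast path, no locking
    }
    c.mu.Lock()
    defer c.mu.Unlock()
    m := c.store.Load().(map[string]*result)
    if r, ok := m[id]; ok {
        return r // a concurrent writer inserted it in the meantime
    }
    next := make(map[string]*result, len(m)+1)
    for k, v := range m {
        next[k] = v
    }
    r := &result{}
    next[id] = r
    c.store.Store(next)
    return r
}

func main() {
    c := newCache()
    r := c.get("json")
    r.once.Do(func() { // the expensive encode runs at most once per identifier
        buf := &bytes.Buffer{}
        buf.WriteString(`{"kind":"Pod"}`)
        r.raw = buf.Bytes()
    })
    fmt.Println(string(r.raw))
}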

8
vendor/k8s.io/apiserver/pkg/storage/cacher/metrics/OWNERS generated vendored Normal file
@@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-instrumentation-approvers
reviewers:
- sig-instrumentation-reviewers
labels:
- sig/instrumentation

174
vendor/k8s.io/apiserver/pkg/storage/cacher/metrics/metrics.go generated vendored Normal file
@@ -0,0 +1,174 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
const (
namespace = "apiserver"
subsystem = "watch_cache"
)
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level (https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with
* the metric stability policy.
*/
var (
listCacheCount = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Name: "cache_list_total",
Help: "Number of LIST requests served from watch cache",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource_prefix", "index"},
)
listCacheNumFetched = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Name: "cache_list_fetched_objects_total",
Help: "Number of objects read from watch cache in the course of serving a LIST request",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource_prefix", "index"},
)
listCacheNumReturned = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Name: "cache_list_returned_objects_total",
Help: "Number of objects returned for a LIST request from watch cache",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource_prefix"},
)
InitCounter = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Name: "init_events_total",
Help: "Counter of init events processed in watch cache broken by resource type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
EventsCounter = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "events_dispatched_total",
Help: "Counter of events dispatched in watch cache broken by resource type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
TerminatedWatchersCounter = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Name: "terminated_watchers_total",
Help: "Counter of watchers closed due to unresponsiveness broken by resource type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
watchCacheCapacityIncreaseTotal = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Subsystem: subsystem,
Name: "capacity_increase_total",
Help: "Total number of watch cache capacity increase events broken by resource type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
watchCacheCapacityDecreaseTotal = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Subsystem: subsystem,
Name: "capacity_decrease_total",
Help: "Total number of watch cache capacity decrease events broken by resource type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
WatchCacheCapacity = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Subsystem: subsystem,
Name: "capacity",
Help: "Total capacity of watch cache broken by resource type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
WatchCacheInitializations = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "initializations_total",
Help: "Counter of watch cache initializations broken by resource type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
)
var registerMetrics sync.Once
// Register all metrics.
func Register() {
// Register the metrics.
registerMetrics.Do(func() {
legacyregistry.MustRegister(listCacheCount)
legacyregistry.MustRegister(listCacheNumFetched)
legacyregistry.MustRegister(listCacheNumReturned)
legacyregistry.MustRegister(InitCounter)
legacyregistry.MustRegister(EventsCounter)
legacyregistry.MustRegister(TerminatedWatchersCounter)
legacyregistry.MustRegister(watchCacheCapacityIncreaseTotal)
legacyregistry.MustRegister(watchCacheCapacityDecreaseTotal)
legacyregistry.MustRegister(WatchCacheCapacity)
legacyregistry.MustRegister(WatchCacheInitializations)
})
}
// RecordListCacheMetrics notes various metrics of the cost to serve a LIST request
func RecordListCacheMetrics(resourcePrefix, indexName string, numFetched, numReturned int) {
listCacheCount.WithLabelValues(resourcePrefix, indexName).Inc()
listCacheNumFetched.WithLabelValues(resourcePrefix, indexName).Add(float64(numFetched))
listCacheNumReturned.WithLabelValues(resourcePrefix).Add(float64(numReturned))
}
// RecordsWatchCacheCapacityChange records watchCache capacity resize (increase or decrease) operations.
func RecordsWatchCacheCapacityChange(objType string, old, new int) {
WatchCacheCapacity.WithLabelValues(objType).Set(float64(new))
if old < new {
watchCacheCapacityIncreaseTotal.WithLabelValues(objType).Inc()
return
}
watchCacheCapacityDecreaseTotal.WithLabelValues(objType).Inc()
}
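
As an aside (not part of the vendored diff): a minimal sketch of how a caller inside the apiserver might use these helpers. The resource prefix, index name and counts are illustrative assumptions.

package main

import (
    "k8s.io/apiserver/pkg/storage/cacher/metrics"
)

func main() {
    // Register the watch-cache metrics with the legacy registry
    // (idempotent thanks to the sync.Once above).
    metrics.Register()

    // A LIST for pods was served from the cache: 500 objects were read via
    // the "spec.nodeName" index and 20 were returned to the client.
    metrics.RecordListCacheMetrics("/registry/pods", "spec.nodeName", 500, 20)

    // The pods watch cache was resized from 100 to 200 slots.
    metrics.RecordsWatchCacheCapacityChange("pods", 100, 200)
}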

96
vendor/k8s.io/apiserver/pkg/storage/cacher/ready.go generated vendored Normal file
@@ -0,0 +1,96 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"fmt"
"sync"
)
type status int
const (
Pending status = iota
Ready
Stopped
)
// ready is a three-state condition variable that blocks until it is Ready, provided it is not Stopped.
// Its initial state is Pending.
type ready struct {
state status
c *sync.Cond
}
func newReady() *ready {
return &ready{
c: sync.NewCond(&sync.RWMutex{}),
state: Pending,
}
}
// wait blocks until it is Ready or Stopped; it returns an error if it is Stopped.
func (r *ready) wait() error {
r.c.L.Lock()
defer r.c.L.Unlock()
for r.state == Pending {
r.c.Wait()
}
switch r.state {
case Ready:
return nil
case Stopped:
return fmt.Errorf("apiserver cacher is stopped")
default:
return fmt.Errorf("unexpected apiserver cache state: %v", r.state)
}
}
// check returns true only if it is Ready.
func (r *ready) check() bool {
// TODO: Make check() function more sophisticated, in particular
// allow it to behave as "waitWithTimeout".
rwMutex := r.c.L.(*sync.RWMutex)
rwMutex.RLock()
defer rwMutex.RUnlock()
return r.state == Ready
}
// set sets the state to Pending (false) or Ready (true); it has no effect if the state is Stopped.
func (r *ready) set(ok bool) {
r.c.L.Lock()
defer r.c.L.Unlock()
if r.state == Stopped {
return
}
if ok {
r.state = Ready
} else {
r.state = Pending
}
r.c.Broadcast()
}
// stop sets the state to Stopped. This state is irreversible.
func (r *ready) stop() {
r.c.L.Lock()
defer r.c.L.Unlock()
if r.state != Stopped {
r.state = Stopped
r.c.Broadcast()
}
}
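
As an aside (not part of the vendored diff): ready and its methods are unexported, so the sketch below assumes it lives inside the cacher package; exampleReady is a hypothetical helper illustrating the Pending -> Ready -> Stopped life cycle.

package cacher

import "fmt"

func exampleReady() {
    r := newReady()

    // A consumer blocks in wait() until some other goroutine flips the state.
    done := make(chan error, 1)
    go func() { done <- r.wait() }()

    r.set(true)            // Pending -> Ready; wakes the waiter with a nil error.
    fmt.Println(<-done)    // <nil>

    r.stop()               // -> Stopped, irreversible.
    fmt.Println(r.wait())  // "apiserver cacher is stopped"
    fmt.Println(r.check()) // false
}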

102
vendor/k8s.io/apiserver/pkg/storage/cacher/time_budget.go generated vendored Normal file
@@ -0,0 +1,102 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"sync"
"time"
"k8s.io/utils/clock"
)
const (
refreshPerSecond = 50 * time.Millisecond
maxBudget = 100 * time.Millisecond
)
// timeBudget implements a budget of time that you can use and is
// periodically being refreshed. The pattern to use it is:
//
// budget := newTimeBudget(...)
// ...
// timeout := budget.takeAvailable()
// // Now you can spend at most timeout on doing stuff
// ...
// // If you didn't use all timeout, return what you didn't use
// budget.returnUnused(<unused part of timeout>)
//
// NOTE: It's not recommended to use it concurrently from multiple threads -
// if the first user takes the whole timeout, the second one will get a 0 timeout
// even though the first one may return something later.
type timeBudget interface {
takeAvailable() time.Duration
returnUnused(unused time.Duration)
}
type timeBudgetImpl struct {
sync.Mutex
clock clock.Clock
budget time.Duration
maxBudget time.Duration
refresh time.Duration
// last stores the last access time
last time.Time
}
func newTimeBudget() timeBudget {
result := &timeBudgetImpl{
clock: clock.RealClock{},
budget: time.Duration(0),
refresh: refreshPerSecond,
maxBudget: maxBudget,
}
result.last = result.clock.Now()
return result
}
func (t *timeBudgetImpl) takeAvailable() time.Duration {
t.Lock()
defer t.Unlock()
// budget accumulated since last access
now := t.clock.Now()
acc := now.Sub(t.last).Seconds() * t.refresh.Seconds()
if acc < 0 {
acc = 0
}
// update current budget and store the current time
if t.budget = t.budget + time.Duration(acc*1e9); t.budget > t.maxBudget {
t.budget = t.maxBudget
}
t.last = now
result := t.budget
t.budget = time.Duration(0)
return result
}
func (t *timeBudgetImpl) returnUnused(unused time.Duration) {
t.Lock()
defer t.Unlock()
if unused < 0 {
// We used more than allowed.
return
}
// add the unused time directly to the budget
// takeAvailable() will take into account the elapsed time
if t.budget = t.budget + unused; t.budget > t.maxBudget {
t.budget = t.maxBudget
}
}
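
As an aside (not part of the vendored diff): timeBudget and newTimeBudget are unexported, so this usage sketch assumes it sits inside the cacher package; doWork is a hypothetical callback.

package cacher

import "time"

func exampleBudget(doWork func(timeout time.Duration) (unused time.Duration)) {
    budget := newTimeBudget()

    // Take whatever has accumulated since the last call (capped at maxBudget, 100ms).
    timeout := budget.takeAvailable()

    // Spend at most that long doing work...
    unused := doWork(timeout)

    // ...and hand back the part that was not used so later callers can spend it.
    budget.returnUnused(unused)
}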

60
vendor/k8s.io/apiserver/pkg/storage/cacher/util.go generated vendored Normal file
@@ -0,0 +1,60 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"strings"
)
// hasPathPrefix returns true if the string matches pathPrefix exactly, or if it is prefixed with pathPrefix at a path segment boundary
func hasPathPrefix(s, pathPrefix string) bool {
// Short circuit if s doesn't contain the prefix at all
if !strings.HasPrefix(s, pathPrefix) {
return false
}
pathPrefixLength := len(pathPrefix)
if len(s) == pathPrefixLength {
// Exact match
return true
}
if strings.HasSuffix(pathPrefix, "/") {
// pathPrefix already ensured a path segment boundary
return true
}
if s[pathPrefixLength:pathPrefixLength+1] == "/" {
// The next character in s is a path segment boundary
// Check this instead of normalizing pathPrefix to avoid allocating on every call
return true
}
return false
}
func max(a, b int) int {
if a > b {
return a
}
return b
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
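
As an aside (not part of the vendored diff): hasPathPrefix is unexported, so this sketch assumes the cacher package; the key literals are illustrative.

package cacher

import "fmt"

func examplePrefix() {
    fmt.Println(hasPathPrefix("/registry/pods", "/registry/pods"))        // true: exact match
    fmt.Println(hasPathPrefix("/registry/pods/ns1/p1", "/registry/pods")) // true: next byte is "/"
    fmt.Println(hasPathPrefix("/registry/podsecurity", "/registry/pods")) // false: not a segment boundary
}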

688
vendor/k8s.io/apiserver/pkg/storage/cacher/watch_cache.go generated vendored Normal file
@@ -0,0 +1,688 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"context"
"fmt"
"math"
"sort"
"sync"
"time"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/cacher/metrics"
"k8s.io/client-go/tools/cache"
"k8s.io/component-base/tracing"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
)
const (
// blockTimeout determines how long we're willing to block the request
// to wait for a given resource version to be propagated to cache,
// before terminating request and returning Timeout error with retry
// after suggestion.
blockTimeout = 3 * time.Second
// resourceVersionTooHighRetrySeconds is the number of seconds before an operation should be retried by the client
// after receiving a 'too high resource version' error.
resourceVersionTooHighRetrySeconds = 1
// eventFreshDuration is time duration of events we want to keep.
// We set it to `defaultBookmarkFrequency` plus epsilon to maximize
// chances that last bookmark was sent within kept history, at the
// same time, minimizing the needed memory usage.
eventFreshDuration = 75 * time.Second
// defaultLowerBoundCapacity is a default value for event cache capacity's lower bound.
// TODO: Figure out to what value we can decrease it.
defaultLowerBoundCapacity = 100
// defaultUpperBoundCapacity should be able to keep eventFreshDuration of history.
defaultUpperBoundCapacity = 100 * 1024
)
// watchCacheEvent is a single "watch event" that is sent to users of
// watchCache. In addition to a typical "watch.Event" it contains
// the previous value of the object to enable proper filtering in the
// upper layers.
type watchCacheEvent struct {
Type watch.EventType
Object runtime.Object
ObjLabels labels.Set
ObjFields fields.Set
PrevObject runtime.Object
PrevObjLabels labels.Set
PrevObjFields fields.Set
Key string
ResourceVersion uint64
RecordTime time.Time
}
// Computing a key of an object is generally non-trivial (it performs
// e.g. validation underneath). Similarly computing object fields and
// labels. To avoid computing them multiple times (to serve the event
// in different List/Watch requests), in the underlying store we are
// keeping structs (key, object, labels, fields).
type storeElement struct {
Key string
Object runtime.Object
Labels labels.Set
Fields fields.Set
}
func storeElementKey(obj interface{}) (string, error) {
elem, ok := obj.(*storeElement)
if !ok {
return "", fmt.Errorf("not a storeElement: %v", obj)
}
return elem.Key, nil
}
func storeElementObject(obj interface{}) (runtime.Object, error) {
elem, ok := obj.(*storeElement)
if !ok {
return nil, fmt.Errorf("not a storeElement: %v", obj)
}
return elem.Object, nil
}
func storeElementIndexFunc(objIndexFunc cache.IndexFunc) cache.IndexFunc {
return func(obj interface{}) (strings []string, e error) {
seo, err := storeElementObject(obj)
if err != nil {
return nil, err
}
return objIndexFunc(seo)
}
}
func storeElementIndexers(indexers *cache.Indexers) cache.Indexers {
if indexers == nil {
return cache.Indexers{}
}
ret := cache.Indexers{}
for indexName, indexFunc := range *indexers {
ret[indexName] = storeElementIndexFunc(indexFunc)
}
return ret
}
// watchCache implements a Store interface.
// However, it depends on the elements implementing runtime.Object interface.
//
// watchCache is a "sliding window" (with a limited capacity) of objects
// observed from a watch.
type watchCache struct {
sync.RWMutex
// Condition on which lists are waiting for the fresh enough
// resource version.
cond *sync.Cond
// Maximum size of history window.
capacity int
// upper bound of capacity since event cache has a dynamic size.
upperBoundCapacity int
// lower bound of capacity since event cache has a dynamic size.
lowerBoundCapacity int
// keyFunc is used to get a key in the underlying storage for a given object.
keyFunc func(runtime.Object) (string, error)
// getAttrsFunc is used to get labels and fields of an object.
getAttrsFunc func(runtime.Object) (labels.Set, fields.Set, error)
// cache is used as a cyclic buffer - its first element (with the smallest
// resourceVersion) is defined by startIndex, its last element is defined
// by endIndex (if cache is full it will be startIndex + capacity).
// Both startIndex and endIndex can be greater than buffer capacity -
// you should always apply modulo capacity to get an index in cache array.
cache []*watchCacheEvent
startIndex int
endIndex int
// store will effectively support LIST operation from the "end of cache
// history" i.e. from the moment just after the newest cached watched event.
// It is necessary to effectively allow clients to start watching at now.
// NOTE: We assume that <store> is thread-safe.
store cache.Indexer
// ResourceVersion up to which the watchCache is propagated.
resourceVersion uint64
// ResourceVersion of the last list result (populated via Replace() method).
listResourceVersion uint64
// This handler is run at the end of every successful Replace() method.
onReplace func()
// This handler is run at the end of every Add/Update/Delete method
// and additionally gets the previous value of the object.
eventHandler func(*watchCacheEvent)
// for testing timeouts.
clock clock.Clock
// An underlying storage.Versioner.
versioner storage.Versioner
// cacher's group resource
groupResource schema.GroupResource
// For testing cache interval invalidation.
indexValidator indexValidator
}
func newWatchCache(
keyFunc func(runtime.Object) (string, error),
eventHandler func(*watchCacheEvent),
getAttrsFunc func(runtime.Object) (labels.Set, fields.Set, error),
versioner storage.Versioner,
indexers *cache.Indexers,
clock clock.Clock,
groupResource schema.GroupResource) *watchCache {
wc := &watchCache{
capacity: defaultLowerBoundCapacity,
keyFunc: keyFunc,
getAttrsFunc: getAttrsFunc,
cache: make([]*watchCacheEvent, defaultLowerBoundCapacity),
lowerBoundCapacity: defaultLowerBoundCapacity,
upperBoundCapacity: defaultUpperBoundCapacity,
startIndex: 0,
endIndex: 0,
store: cache.NewIndexer(storeElementKey, storeElementIndexers(indexers)),
resourceVersion: 0,
listResourceVersion: 0,
eventHandler: eventHandler,
clock: clock,
versioner: versioner,
groupResource: groupResource,
}
metrics.WatchCacheCapacity.WithLabelValues(groupResource.String()).Set(float64(wc.capacity))
wc.cond = sync.NewCond(wc.RLocker())
wc.indexValidator = wc.isIndexValidLocked
return wc
}
// Add takes runtime.Object as an argument.
func (w *watchCache) Add(obj interface{}) error {
object, resourceVersion, err := w.objectToVersionedRuntimeObject(obj)
if err != nil {
return err
}
event := watch.Event{Type: watch.Added, Object: object}
f := func(elem *storeElement) error { return w.store.Add(elem) }
return w.processEvent(event, resourceVersion, f)
}
// Update takes runtime.Object as an argument.
func (w *watchCache) Update(obj interface{}) error {
object, resourceVersion, err := w.objectToVersionedRuntimeObject(obj)
if err != nil {
return err
}
event := watch.Event{Type: watch.Modified, Object: object}
f := func(elem *storeElement) error { return w.store.Update(elem) }
return w.processEvent(event, resourceVersion, f)
}
// Delete takes runtime.Object as an argument.
func (w *watchCache) Delete(obj interface{}) error {
object, resourceVersion, err := w.objectToVersionedRuntimeObject(obj)
if err != nil {
return err
}
event := watch.Event{Type: watch.Deleted, Object: object}
f := func(elem *storeElement) error { return w.store.Delete(elem) }
return w.processEvent(event, resourceVersion, f)
}
func (w *watchCache) objectToVersionedRuntimeObject(obj interface{}) (runtime.Object, uint64, error) {
object, ok := obj.(runtime.Object)
if !ok {
return nil, 0, fmt.Errorf("obj does not implement runtime.Object interface: %v", obj)
}
resourceVersion, err := w.versioner.ObjectResourceVersion(object)
if err != nil {
return nil, 0, err
}
return object, resourceVersion, nil
}
// processEvent is safe as long as there is at most one call to it in flight
// at any point in time.
func (w *watchCache) processEvent(event watch.Event, resourceVersion uint64, updateFunc func(*storeElement) error) error {
key, err := w.keyFunc(event.Object)
if err != nil {
return fmt.Errorf("couldn't compute key: %v", err)
}
elem := &storeElement{Key: key, Object: event.Object}
elem.Labels, elem.Fields, err = w.getAttrsFunc(event.Object)
if err != nil {
return err
}
wcEvent := &watchCacheEvent{
Type: event.Type,
Object: elem.Object,
ObjLabels: elem.Labels,
ObjFields: elem.Fields,
Key: key,
ResourceVersion: resourceVersion,
RecordTime: w.clock.Now(),
}
if err := func() error {
// TODO: We should consider moving this lock below after the watchCacheEvent
// is created. In such situation, the only problematic scenario is Replace()
// happening after getting object from store and before acquiring a lock.
// Maybe introduce another lock for this purpose.
w.Lock()
defer w.Unlock()
previous, exists, err := w.store.Get(elem)
if err != nil {
return err
}
if exists {
previousElem := previous.(*storeElement)
wcEvent.PrevObject = previousElem.Object
wcEvent.PrevObjLabels = previousElem.Labels
wcEvent.PrevObjFields = previousElem.Fields
}
w.updateCache(wcEvent)
w.resourceVersion = resourceVersion
defer w.cond.Broadcast()
return updateFunc(elem)
}(); err != nil {
return err
}
// Avoid calling event handler under lock.
// This is safe as long as there is at most one call to Add/Update/Delete and
// UpdateResourceVersion in flight at any point in time, which is true now,
// because reflector calls them synchronously from its main thread.
if w.eventHandler != nil {
w.eventHandler(wcEvent)
}
return nil
}
// Assumes that lock is already held for write.
func (w *watchCache) updateCache(event *watchCacheEvent) {
w.resizeCacheLocked(event.RecordTime)
if w.isCacheFullLocked() {
// Cache is full - remove the oldest element.
w.startIndex++
}
w.cache[w.endIndex%w.capacity] = event
w.endIndex++
}
// resizeCacheLocked resizes the cache if necessary:
// - increases capacity by 2x if cache is full and all cached events occurred within the last eventFreshDuration.
// - decreases capacity by 2x when the most recent quarter of events occurred outside of eventFreshDuration (protects watchCache from flapping).
func (w *watchCache) resizeCacheLocked(eventTime time.Time) {
if w.isCacheFullLocked() && eventTime.Sub(w.cache[w.startIndex%w.capacity].RecordTime) < eventFreshDuration {
capacity := min(w.capacity*2, w.upperBoundCapacity)
if capacity > w.capacity {
w.doCacheResizeLocked(capacity)
}
return
}
if w.isCacheFullLocked() && eventTime.Sub(w.cache[(w.endIndex-w.capacity/4)%w.capacity].RecordTime) > eventFreshDuration {
capacity := max(w.capacity/2, w.lowerBoundCapacity)
if capacity < w.capacity {
w.doCacheResizeLocked(capacity)
}
return
}
}
// isCacheFullLocked reports whether the event cache is full.
// Assumes that lock is already held for write.
func (w *watchCache) isCacheFullLocked() bool {
return w.endIndex == w.startIndex+w.capacity
}
// doCacheResizeLocked resizes watchCache's event array to a different capacity.
// Assumes that lock is already held for write.
func (w *watchCache) doCacheResizeLocked(capacity int) {
newCache := make([]*watchCacheEvent, capacity)
if capacity < w.capacity {
// adjust startIndex if the cache capacity shrinks.
w.startIndex = w.endIndex - capacity
}
for i := w.startIndex; i < w.endIndex; i++ {
newCache[i%capacity] = w.cache[i%w.capacity]
}
w.cache = newCache
metrics.RecordsWatchCacheCapacityChange(w.groupResource.String(), w.capacity, capacity)
w.capacity = capacity
}
func (w *watchCache) UpdateResourceVersion(resourceVersion string) {
rv, err := w.versioner.ParseResourceVersion(resourceVersion)
if err != nil {
klog.Errorf("Couldn't parse resourceVersion: %v", err)
return
}
func() {
w.Lock()
defer w.Unlock()
w.resourceVersion = rv
}()
// Avoid calling event handler under lock.
// This is safe as long as there is at most one call to Add/Update/Delete and
// UpdateResourceVersion in flight at any point in time, which is true now,
// because reflector calls them synchronously from its main thread.
if w.eventHandler != nil {
wcEvent := &watchCacheEvent{
Type: watch.Bookmark,
ResourceVersion: rv,
}
w.eventHandler(wcEvent)
}
}
// List returns list of pointers to <storeElement> objects.
func (w *watchCache) List() []interface{} {
return w.store.List()
}
// waitUntilFreshAndBlock waits until cache is at least as fresh as given <resourceVersion>.
// NOTE: This function acquires a lock and doesn't release it.
// You HAVE TO explicitly call w.RUnlock() after this function.
func (w *watchCache) waitUntilFreshAndBlock(ctx context.Context, resourceVersion uint64) error {
startTime := w.clock.Now()
// In case resourceVersion is 0, we accept arbitrarily stale result.
// As a result, the condition in the below for loop will never be
// satisfied (w.resourceVersion is never negative), so this call will
// never hit the w.cond.Wait().
// As a result - we can optimize the code by not firing the wakeup
// function (and avoid starting a goroutine), especially given that
// resourceVersion=0 is the most common case.
if resourceVersion > 0 {
go func() {
// Wake us up when the time limit has expired. The docs
// promise that time.After (well, NewTimer, which it calls)
// will wait *at least* the duration given. Since this go
// routine starts sometime after we record the start time, and
// it will wake up the loop below sometime after the broadcast,
// we don't need to worry about waking it up before the time
// has expired accidentally.
<-w.clock.After(blockTimeout)
w.cond.Broadcast()
}()
}
w.RLock()
span := tracing.SpanFromContext(ctx)
span.AddEvent("watchCache locked acquired")
for w.resourceVersion < resourceVersion {
if w.clock.Since(startTime) >= blockTimeout {
// Request that the client retry after 'resourceVersionTooHighRetrySeconds' seconds.
return storage.NewTooLargeResourceVersionError(resourceVersion, w.resourceVersion, resourceVersionTooHighRetrySeconds)
}
w.cond.Wait()
}
span.AddEvent("watchCache fresh enough")
return nil
}
// WaitUntilFreshAndList returns list of pointers to `storeElement` objects along
// with their ResourceVersion and the name of the index, if any, that was used.
func (w *watchCache) WaitUntilFreshAndList(ctx context.Context, resourceVersion uint64, matchValues []storage.MatchValue) ([]interface{}, uint64, string, error) {
err := w.waitUntilFreshAndBlock(ctx, resourceVersion)
defer w.RUnlock()
if err != nil {
return nil, 0, "", err
}
// This isn't the place where we do "final filtering" - only some "prefiltering" is happening here. So the only
// requirement here is to NOT miss anything that should be returned. We can return as many non-matching items as we
// want - they will be filtered out later. The fact that we return fewer items is only a further performance improvement.
// TODO: if multiple indexes match, return the one with the fewest items, so as to do as much filtering as possible.
for _, matchValue := range matchValues {
if result, err := w.store.ByIndex(matchValue.IndexName, matchValue.Value); err == nil {
return result, w.resourceVersion, matchValue.IndexName, nil
}
}
return w.store.List(), w.resourceVersion, "", nil
}
// WaitUntilFreshAndGet returns a pointer to a <storeElement> object.
func (w *watchCache) WaitUntilFreshAndGet(ctx context.Context, resourceVersion uint64, key string) (interface{}, bool, uint64, error) {
err := w.waitUntilFreshAndBlock(ctx, resourceVersion)
defer w.RUnlock()
if err != nil {
return nil, false, 0, err
}
value, exists, err := w.store.GetByKey(key)
return value, exists, w.resourceVersion, err
}
func (w *watchCache) ListKeys() []string {
return w.store.ListKeys()
}
// Get takes runtime.Object as a parameter. However, it returns
// pointer to <storeElement>.
func (w *watchCache) Get(obj interface{}) (interface{}, bool, error) {
object, ok := obj.(runtime.Object)
if !ok {
return nil, false, fmt.Errorf("obj does not implement runtime.Object interface: %v", obj)
}
key, err := w.keyFunc(object)
if err != nil {
return nil, false, fmt.Errorf("couldn't compute key: %v", err)
}
return w.store.Get(&storeElement{Key: key, Object: object})
}
// GetByKey returns pointer to <storeElement>.
func (w *watchCache) GetByKey(key string) (interface{}, bool, error) {
return w.store.GetByKey(key)
}
// Replace takes slice of runtime.Object as a parameter.
func (w *watchCache) Replace(objs []interface{}, resourceVersion string) error {
version, err := w.versioner.ParseResourceVersion(resourceVersion)
if err != nil {
return err
}
toReplace := make([]interface{}, 0, len(objs))
for _, obj := range objs {
object, ok := obj.(runtime.Object)
if !ok {
return fmt.Errorf("didn't get runtime.Object for replace: %#v", obj)
}
key, err := w.keyFunc(object)
if err != nil {
return fmt.Errorf("couldn't compute key: %v", err)
}
objLabels, objFields, err := w.getAttrsFunc(object)
if err != nil {
return err
}
toReplace = append(toReplace, &storeElement{
Key: key,
Object: object,
Labels: objLabels,
Fields: objFields,
})
}
w.Lock()
defer w.Unlock()
w.startIndex = 0
w.endIndex = 0
if err := w.store.Replace(toReplace, resourceVersion); err != nil {
return err
}
w.listResourceVersion = version
w.resourceVersion = version
if w.onReplace != nil {
w.onReplace()
}
w.cond.Broadcast()
klog.V(3).Infof("Replace watchCache (rev: %v) ", resourceVersion)
return nil
}
func (w *watchCache) SetOnReplace(onReplace func()) {
w.Lock()
defer w.Unlock()
w.onReplace = onReplace
}
func (w *watchCache) Resync() error {
// Nothing to do
return nil
}
func (w *watchCache) currentCapacity() int {
w.Lock()
defer w.Unlock()
return w.capacity
}
const (
// minWatchChanSize is the min size of channels used by the watch.
// We keep that set to 10 for "backward compatibility" until we
// convince ourselves based on some metrics that decreasing is safe.
minWatchChanSize = 10
// maxWatchChanSizeWithIndexAndTrigger is the max size of the channel
// used by the watch using the index and trigger selector.
maxWatchChanSizeWithIndexAndTrigger = 10
// maxWatchChanSizeWithIndexWithoutTrigger is the max size of the channel
// used by the watch using the index but without triggering selector.
// We keep that set to 1000 for "backward compatibility", until we
// convince ourselves based on some metrics that decreasing is safe.
maxWatchChanSizeWithIndexWithoutTrigger = 1000
// maxWatchChanSizeWithoutIndex is the max size of the channel
// used by the watch not using the index.
// TODO(wojtek-t): Figure out if the value shouldn't be higher.
maxWatchChanSizeWithoutIndex = 100
)
func (w *watchCache) suggestedWatchChannelSize(indexExists, triggerUsed bool) int {
// To estimate the channel size we use a heuristic that a channel
// should roughly be able to keep one second of history.
// We don't have exact data, but given we store updates from
// the last <eventFreshDuration>, we approach it by dividing the
// capacity by the length of the history window.
chanSize := int(math.Ceil(float64(w.currentCapacity()) / eventFreshDuration.Seconds()))
// Finally we adjust the size to avoid ending with too low or
// too large values.
if chanSize < minWatchChanSize {
chanSize = minWatchChanSize
}
var maxChanSize int
switch {
case indexExists && triggerUsed:
maxChanSize = maxWatchChanSizeWithIndexAndTrigger
case indexExists && !triggerUsed:
maxChanSize = maxWatchChanSizeWithIndexWithoutTrigger
case !indexExists:
maxChanSize = maxWatchChanSizeWithoutIndex
}
if chanSize > maxChanSize {
chanSize = maxChanSize
}
return chanSize
}
// isIndexValidLocked checks if a given index is still valid.
// This assumes that the lock is held.
func (w *watchCache) isIndexValidLocked(index int) bool {
return index >= w.startIndex
}
// getAllEventsSinceLocked returns a watchCacheInterval that can be used to
// retrieve events since a certain resourceVersion. This function assumes it is
// called under the watchCache lock.
func (w *watchCache) getAllEventsSinceLocked(resourceVersion uint64) (*watchCacheInterval, error) {
size := w.endIndex - w.startIndex
var oldest uint64
switch {
case w.listResourceVersion > 0 && w.startIndex == 0:
// If no event was removed from the buffer since last relist, the oldest watch
// event we can deliver is one greater than the resource version of the list.
oldest = w.listResourceVersion + 1
case size > 0:
// If the previous condition is not satisfied: either some event was already
// removed from the buffer or we've never completed a list (the latter can
// only happen in unit tests that populate the buffer without performing
// list/replace operations), the oldest watch event we can deliver is the first
// one in the buffer.
oldest = w.cache[w.startIndex%w.capacity].ResourceVersion
default:
return nil, fmt.Errorf("watch cache isn't correctly initialized")
}
if resourceVersion == 0 {
// resourceVersion = 0 means that we don't require any specific starting point
// and we would like to start watching from ~now.
// However, to keep backward compatibility, we additionally need to return the
// current state and only then start watching from that point.
//
// TODO: In v2 api, we should stop returning the current state - #13969.
ci, err := newCacheIntervalFromStore(w.resourceVersion, w.store, w.getAttrsFunc)
if err != nil {
return nil, err
}
return ci, nil
}
if resourceVersion < oldest-1 {
return nil, errors.NewResourceExpired(fmt.Sprintf("too old resource version: %d (%d)", resourceVersion, oldest-1))
}
// Binary search the smallest index at which resourceVersion is greater than the given one.
f := func(i int) bool {
return w.cache[(w.startIndex+i)%w.capacity].ResourceVersion > resourceVersion
}
first := sort.Search(size, f)
indexerFunc := func(i int) *watchCacheEvent {
return w.cache[i%w.capacity]
}
ci := newCacheInterval(w.startIndex+first, w.endIndex, indexerFunc, w.indexValidator, &w.RWMutex)
return ci, nil
}
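
As an aside (not part of the vendored diff): a standalone recomputation of the suggestedWatchChannelSize heuristic above, assuming a current capacity of 1024 events and no usable index.

package main

import (
    "fmt"
    "math"
    "time"
)

func main() {
    const (
        eventFreshDuration           = 75 * time.Second
        minWatchChanSize             = 10
        maxWatchChanSizeWithoutIndex = 100
    )
    capacity := 1024

    // Aim for roughly one second of history: spread the capacity over the 75s window.
    chanSize := int(math.Ceil(float64(capacity) / eventFreshDuration.Seconds())) // ceil(13.65) = 14

    // Clamp to the configured bounds.
    if chanSize < minWatchChanSize {
        chanSize = minWatchChanSize
    }
    if chanSize > maxWatchChanSizeWithoutIndex {
        chanSize = maxWatchChanSizeWithoutIndex
    }
    fmt.Println(chanSize) // 14
}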

226
vendor/k8s.io/apiserver/pkg/storage/cacher/watch_cache_interval.go generated vendored Normal file
@@ -0,0 +1,226 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"fmt"
"sync"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/tools/cache"
)
// watchCacheInterval serves as an abstraction over a source
// of watchCacheEvents. It maintains a window of events over
// an underlying source and these events can be served using
// the exposed Next() API. The main intent for doing things
// this way is to introduce an upper bound of memory usage
// for starting a watch and reduce the maximum possible time
// interval for which the lock would be held while events are
// copied over.
//
// The source of events for the interval is typically either
// the watchCache circular buffer, if events being retrieved
// need to be for resource versions > 0 or the underlying
// implementation of Store, if resource version = 0.
//
// Furthermore, an interval can be either valid or invalid at
// any given point of time. The notion of validity makes sense
// only in cases where the window of events in the underlying
// source can change over time - i.e. for watchCache circular
// buffer. When the circular buffer is full and an event needs
// to be popped off, watchCache::startIndex is incremented. In
// this case, an interval tracking that popped event is valid
// only if it has already been copied to its internal buffer.
// However, for efficiency we perform that lazily and we mark
// an interval as invalid iff we need to copy events from the
// watchCache and we end up needing events that have already
// been popped off. This translates to the following condition:
//
// watchCacheInterval::startIndex >= watchCache::startIndex.
//
// When this condition becomes false, the interval is no longer
// valid and should not be used to retrieve and serve elements
// from the underlying source.
type watchCacheInterval struct {
// startIndex denotes the starting point of the interval
// being considered. The value is the index in the actual
// source of watchCacheEvents. If the source of events is
// the watchCache, then this must be used modulo capacity.
startIndex int
// endIndex denotes the ending point of the interval being
// considered. The value is the index in the actual source
// of events. If the source of the events is the watchCache,
// then this should be used modulo capacity.
endIndex int
// indexer is meant to inject behaviour for how an event must
// be retrieved from the underlying source given an index.
indexer indexerFunc
// indexValidator is used to check if a given index is still
// valid. If it is deemed that the index is not
// valid, then this interval can no longer be used to serve
// events. Use of indexValidator is warranted only in cases
// where the window of events in the underlying source can
// change over time. Furthermore, an interval is invalid if
// its startIndex no longer coincides with the startIndex of
// underlying source.
indexValidator indexValidator
// buffer holds watchCacheEvents that this interval returns on
// a call to Next(). This exists mainly to reduce acquiring the
// lock on each invocation of Next().
buffer *watchCacheIntervalBuffer
// lock effectively protects access to the underlying source
// of events through indexer and indexValidator.
//
// Given that indexer and indexValidator only read state, if
// possible, Locker obtained through RLocker() is provided.
lock sync.Locker
}
type attrFunc func(runtime.Object) (labels.Set, fields.Set, error)
type indexerFunc func(int) *watchCacheEvent
type indexValidator func(int) bool
func newCacheInterval(startIndex, endIndex int, indexer indexerFunc, indexValidator indexValidator, locker sync.Locker) *watchCacheInterval {
return &watchCacheInterval{
startIndex: startIndex,
endIndex: endIndex,
indexer: indexer,
indexValidator: indexValidator,
buffer: &watchCacheIntervalBuffer{buffer: make([]*watchCacheEvent, bufferSize)},
lock: locker,
}
}
// newCacheIntervalFromStore is meant to handle the case of rv=0, such that the events
// returned by Next() need to be events from a List() done on the underlying store of
// the watch cache.
func newCacheIntervalFromStore(resourceVersion uint64, store cache.Indexer, getAttrsFunc attrFunc) (*watchCacheInterval, error) {
buffer := &watchCacheIntervalBuffer{}
allItems := store.List()
buffer.buffer = make([]*watchCacheEvent, len(allItems))
for i, item := range allItems {
elem, ok := item.(*storeElement)
if !ok {
return nil, fmt.Errorf("not a storeElement: %v", item)
}
objLabels, objFields, err := getAttrsFunc(elem.Object)
if err != nil {
return nil, err
}
buffer.buffer[i] = &watchCacheEvent{
Type: watch.Added,
Object: elem.Object,
ObjLabels: objLabels,
ObjFields: objFields,
Key: elem.Key,
ResourceVersion: resourceVersion,
}
buffer.endIndex++
}
ci := &watchCacheInterval{
startIndex: 0,
// Simulate that we already have all the events we're looking for.
endIndex: 0,
buffer: buffer,
}
return ci, nil
}
// Next returns the next item in the cache interval provided the cache
// interval is still valid. An error is returned if the interval is
// invalidated.
func (wci *watchCacheInterval) Next() (*watchCacheEvent, error) {
// if there are items in the buffer to return, return from
// the buffer.
if event, exists := wci.buffer.next(); exists {
return event, nil
}
// check if there are still other events in this interval
// that can be processed.
if wci.startIndex >= wci.endIndex {
return nil, nil
}
wci.lock.Lock()
defer wci.lock.Unlock()
if valid := wci.indexValidator(wci.startIndex); !valid {
return nil, fmt.Errorf("cache interval invalidated, interval startIndex: %d", wci.startIndex)
}
wci.fillBuffer()
if event, exists := wci.buffer.next(); exists {
return event, nil
}
return nil, nil
}
func (wci *watchCacheInterval) fillBuffer() {
wci.buffer.startIndex = 0
wci.buffer.endIndex = 0
for wci.startIndex < wci.endIndex && !wci.buffer.isFull() {
event := wci.indexer(wci.startIndex)
if event == nil {
break
}
wci.buffer.buffer[wci.buffer.endIndex] = event
wci.buffer.endIndex++
wci.startIndex++
}
}
const bufferSize = 100
// watchCacheIntervalBuffer is used to reduce acquiring
// the lock on each invocation of watchCacheInterval.Next().
type watchCacheIntervalBuffer struct {
// buffer is used to hold watchCacheEvents that
// the interval returns on a call to Next().
buffer []*watchCacheEvent
// The first element of the buffer is at startIndex;
// endIndex points one past the last element.
startIndex int
endIndex int
}
// next returns the next event present in the interval buffer provided
// it is not empty.
func (wcib *watchCacheIntervalBuffer) next() (*watchCacheEvent, bool) {
if wcib.isEmpty() {
return nil, false
}
next := wcib.buffer[wcib.startIndex]
wcib.startIndex++
return next, true
}
func (wcib *watchCacheIntervalBuffer) isFull() bool {
return wcib.endIndex >= bufferSize
}
func (wcib *watchCacheIntervalBuffer) isEmpty() bool {
return wcib.startIndex == wcib.endIndex
}
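A hedged, in-package sketch (not part of the vendored file) of how a consumer typically drives the interval's Next() contract described above: a nil event with a nil error means the interval is exhausted, while a non-nil error means the interval was invalidated and the caller must fall back to a fresh list and watch. The helper name drainInterval is hypothetical.

// drainInterval is a hypothetical helper illustrating the Next() contract.
func drainInterval(wci *watchCacheInterval) ([]*watchCacheEvent, error) {
    var events []*watchCacheEvent
    for {
        event, err := wci.Next()
        if err != nil {
            // The circular buffer advanced past our startIndex; the interval is invalid.
            return nil, err
        }
        if event == nil {
            // No more events in this interval.
            return events, nil
        }
        events = append(events, event)
    }
}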

93
vendor/k8s.io/apiserver/pkg/storage/continue.go generated vendored Normal file
View File

@ -0,0 +1,93 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package storage
import (
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"path"
"strings"
)
var (
ErrInvalidStartRV = errors.New("continue key is not valid: incorrect encoded start resourceVersion (version meta.k8s.io/v1)")
ErrEmptyStartKey = errors.New("continue key is not valid: encoded start key empty (version meta.k8s.io/v1)")
ErrGenericInvalidKey = errors.New("continue key is not valid")
ErrUnrecognizedEncodedVersion = errors.New("continue key is not valid: server does not recognize this encoded version")
)
// continueToken is a simple structured object for encoding the state of a continue token.
// TODO: if we change the version of the encoded form, we can't start encoding the new version
// until all other servers are upgraded (i.e. we need to support rolling schema)
// This is a public API struct and cannot change.
type continueToken struct {
APIVersion string `json:"v"`
ResourceVersion int64 `json:"rv"`
StartKey string `json:"start"`
}
// DecodeContinue transforms an encoded continue token into a start key and resource version.
// TODO: return a typed error that instructs clients that they must relist
func DecodeContinue(continueValue, keyPrefix string) (fromKey string, rv int64, err error) {
data, err := base64.RawURLEncoding.DecodeString(continueValue)
if err != nil {
return "", 0, fmt.Errorf("%w: %v", ErrGenericInvalidKey, err)
}
var c continueToken
if err := json.Unmarshal(data, &c); err != nil {
return "", 0, fmt.Errorf("%w: %v", ErrGenericInvalidKey, err)
}
switch c.APIVersion {
case "meta.k8s.io/v1":
if c.ResourceVersion == 0 {
return "", 0, ErrInvalidStartRV
}
if len(c.StartKey) == 0 {
return "", 0, ErrEmptyStartKey
}
// defend against path traversal attacks by clients - path.Clean will ensure that startKey cannot
// be at a higher level of the hierarchy, and so when we append the key prefix we will end up with
// continue start key that is fully qualified and cannot range over anything less specific than
// keyPrefix.
key := c.StartKey
if !strings.HasPrefix(key, "/") {
key = "/" + key
}
cleaned := path.Clean(key)
if cleaned != key {
return "", 0, fmt.Errorf("%w: %v", ErrGenericInvalidKey, c.StartKey)
}
return keyPrefix + cleaned[1:], c.ResourceVersion, nil
default:
return "", 0, fmt.Errorf("%w %v", ErrUnrecognizedEncodedVersion, c.APIVersion)
}
}
// EncodeContinue returns a string representing the encoded continuation of the current query.
func EncodeContinue(key, keyPrefix string, resourceVersion int64) (string, error) {
nextKey := strings.TrimPrefix(key, keyPrefix)
if nextKey == key {
return "", fmt.Errorf("unable to encode next field: the key and key prefix do not match")
}
out, err := json.Marshal(&continueToken{APIVersion: "meta.k8s.io/v1", ResourceVersion: resourceVersion, StartKey: nextKey})
if err != nil {
return "", err
}
return base64.RawURLEncoding.EncodeToString(out), nil
}
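A minimal round-trip sketch of the two helpers above. The key prefix and object key are made-up illustrative values; the snippet is not part of the vendored package.

package main

import (
    "fmt"

    "k8s.io/apiserver/pkg/storage"
)

func main() {
    const keyPrefix = "/registry/pods/"

    // Encode a token pointing at the next key to read, pinned to resourceVersion 42.
    token, err := storage.EncodeContinue(keyPrefix+"default/pod-100", keyPrefix, 42)
    if err != nil {
        panic(err)
    }

    // Decoding with the same prefix recovers a fully qualified start key and the rv.
    fromKey, rv, err := storage.DecodeContinue(token, keyPrefix)
    if err != nil {
        panic(err)
    }
    fmt.Println(fromKey, rv) // /registry/pods/default/pod-100 42
}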

18
vendor/k8s.io/apiserver/pkg/storage/doc.go generated vendored Normal file
View File

@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Interfaces for database-related operations.
package storage // import "k8s.io/apiserver/pkg/storage"

195
vendor/k8s.io/apiserver/pkg/storage/errors.go generated vendored Normal file
View File

@ -0,0 +1,195 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package storage
import (
"fmt"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/validation/field"
)
const (
ErrCodeKeyNotFound int = iota + 1
ErrCodeKeyExists
ErrCodeResourceVersionConflicts
ErrCodeInvalidObj
ErrCodeUnreachable
)
var errCodeToMessage = map[int]string{
ErrCodeKeyNotFound: "key not found",
ErrCodeKeyExists: "key exists",
ErrCodeResourceVersionConflicts: "resource version conflicts",
ErrCodeInvalidObj: "invalid object",
ErrCodeUnreachable: "server unreachable",
}
func NewKeyNotFoundError(key string, rv int64) *StorageError {
return &StorageError{
Code: ErrCodeKeyNotFound,
Key: key,
ResourceVersion: rv,
}
}
func NewKeyExistsError(key string, rv int64) *StorageError {
return &StorageError{
Code: ErrCodeKeyExists,
Key: key,
ResourceVersion: rv,
}
}
func NewResourceVersionConflictsError(key string, rv int64) *StorageError {
return &StorageError{
Code: ErrCodeResourceVersionConflicts,
Key: key,
ResourceVersion: rv,
}
}
func NewUnreachableError(key string, rv int64) *StorageError {
return &StorageError{
Code: ErrCodeUnreachable,
Key: key,
ResourceVersion: rv,
}
}
func NewInvalidObjError(key, msg string) *StorageError {
return &StorageError{
Code: ErrCodeInvalidObj,
Key: key,
AdditionalErrorMsg: msg,
}
}
type StorageError struct {
Code int
Key string
ResourceVersion int64
AdditionalErrorMsg string
}
func (e *StorageError) Error() string {
return fmt.Sprintf("StorageError: %s, Code: %d, Key: %s, ResourceVersion: %d, AdditionalErrorMsg: %s",
errCodeToMessage[e.Code], e.Code, e.Key, e.ResourceVersion, e.AdditionalErrorMsg)
}
// IsNotFound returns true if and only if err is "key" not found error.
func IsNotFound(err error) bool {
return isErrCode(err, ErrCodeKeyNotFound)
}
// IsExist returns true if and only if err is "key" already exists error.
func IsExist(err error) bool {
return isErrCode(err, ErrCodeKeyExists)
}
// IsUnreachable returns true if and only if err indicates the server could not be reached.
func IsUnreachable(err error) bool {
return isErrCode(err, ErrCodeUnreachable)
}
// IsConflict returns true if and only if err is a write conflict.
func IsConflict(err error) bool {
return isErrCode(err, ErrCodeResourceVersionConflicts)
}
// IsInvalidObj returns true if and only if err is invalid error
func IsInvalidObj(err error) bool {
return isErrCode(err, ErrCodeInvalidObj)
}
func isErrCode(err error, code int) bool {
if err == nil {
return false
}
if e, ok := err.(*StorageError); ok {
return e.Code == code
}
return false
}
// InvalidError is generated when an error caused by invalid API object occurs
// in the storage package.
type InvalidError struct {
Errs field.ErrorList
}
func (e InvalidError) Error() string {
return e.Errs.ToAggregate().Error()
}
// IsInvalidError returns true if and only if err is an InvalidError.
func IsInvalidError(err error) bool {
_, ok := err.(InvalidError)
return ok
}
func NewInvalidError(errors field.ErrorList) InvalidError {
return InvalidError{errors}
}
// InternalError is generated when an error occurs in the storage package, i.e.,
// not from the underlying storage backend (e.g., etcd).
type InternalError struct {
Reason string
}
func (e InternalError) Error() string {
return e.Reason
}
// IsInternalError returns true if and only if err is an InternalError.
func IsInternalError(err error) bool {
_, ok := err.(InternalError)
return ok
}
func NewInternalError(reason string) InternalError {
return InternalError{reason}
}
func NewInternalErrorf(format string, a ...interface{}) InternalError {
return InternalError{fmt.Sprintf(format, a...)}
}
var tooLargeResourceVersionCauseMsg = "Too large resource version"
// NewTooLargeResourceVersionError returns a timeout error with the given retrySeconds for a request for
// a minimum resource version that is larger than the largest currently available resource version for a requested resource.
func NewTooLargeResourceVersionError(minimumResourceVersion, currentRevision uint64, retrySeconds int) error {
err := errors.NewTimeoutError(fmt.Sprintf("Too large resource version: %d, current: %d", minimumResourceVersion, currentRevision), retrySeconds)
err.ErrStatus.Details.Causes = []metav1.StatusCause{
{
Type: metav1.CauseTypeResourceVersionTooLarge,
Message: tooLargeResourceVersionCauseMsg,
},
}
return err
}
// IsTooLargeResourceVersion returns true if the error is a TooLargeResourceVersion error.
func IsTooLargeResourceVersion(err error) bool {
if !errors.IsTimeout(err) {
return false
}
return errors.HasStatusCause(err, metav1.CauseTypeResourceVersionTooLarge)
}
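A hedged usage sketch for the classification helpers above: callers branch on IsNotFound/IsExist/IsConflict/IsUnreachable rather than inspecting StorageError.Code directly. The surrounding helper function is illustrative only.

package example

import "k8s.io/apiserver/pkg/storage"

// describeStorageError is a hypothetical helper mapping storage errors to messages.
func describeStorageError(err error) string {
    switch {
    case storage.IsNotFound(err):
        return "object does not exist"
    case storage.IsExist(err):
        return "object already exists"
    case storage.IsConflict(err):
        return "resource version conflict; retry the update"
    case storage.IsUnreachable(err):
        return "storage backend unreachable"
    case storage.IsInvalidObj(err):
        return "invalid object in storage"
    default:
        return err.Error()
    }
}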

18
vendor/k8s.io/apiserver/pkg/storage/errors/doc.go generated vendored Normal file
View File

@ -0,0 +1,18 @@
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package storage provides conversion of storage errors to API errors.
package storage // import "k8s.io/apiserver/pkg/storage/errors"

116
vendor/k8s.io/apiserver/pkg/storage/errors/storage.go generated vendored Normal file
View File

@ -0,0 +1,116 @@
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package storage
import (
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apiserver/pkg/storage"
)
// InterpretListError converts a generic error on a retrieval
// operation into the appropriate API error.
func InterpretListError(err error, qualifiedResource schema.GroupResource) error {
switch {
case storage.IsNotFound(err):
return errors.NewNotFound(qualifiedResource, "")
case storage.IsUnreachable(err):
return errors.NewServerTimeout(qualifiedResource, "list", 2) // TODO: make configurable or handled at a higher level
case storage.IsInternalError(err):
return errors.NewInternalError(err)
default:
return err
}
}
// InterpretGetError converts a generic error on a retrieval
// operation into the appropriate API error.
func InterpretGetError(err error, qualifiedResource schema.GroupResource, name string) error {
switch {
case storage.IsNotFound(err):
return errors.NewNotFound(qualifiedResource, name)
case storage.IsUnreachable(err):
return errors.NewServerTimeout(qualifiedResource, "get", 2) // TODO: make configurable or handled at a higher level
case storage.IsInternalError(err):
return errors.NewInternalError(err)
default:
return err
}
}
// InterpretCreateError converts a generic error on a create
// operation into the appropriate API error.
func InterpretCreateError(err error, qualifiedResource schema.GroupResource, name string) error {
switch {
case storage.IsExist(err):
return errors.NewAlreadyExists(qualifiedResource, name)
case storage.IsUnreachable(err):
return errors.NewServerTimeout(qualifiedResource, "create", 2) // TODO: make configurable or handled at a higher level
case storage.IsInternalError(err):
return errors.NewInternalError(err)
default:
return err
}
}
// InterpretUpdateError converts a generic error on an update
// operation into the appropriate API error.
func InterpretUpdateError(err error, qualifiedResource schema.GroupResource, name string) error {
switch {
case storage.IsConflict(err), storage.IsExist(err), storage.IsInvalidObj(err):
return errors.NewConflict(qualifiedResource, name, err)
case storage.IsUnreachable(err):
return errors.NewServerTimeout(qualifiedResource, "update", 2) // TODO: make configurable or handled at a higher level
case storage.IsNotFound(err):
return errors.NewNotFound(qualifiedResource, name)
case storage.IsInternalError(err):
return errors.NewInternalError(err)
default:
return err
}
}
// InterpretDeleteError converts a generic error on a delete
// operation into the appropriate API error.
func InterpretDeleteError(err error, qualifiedResource schema.GroupResource, name string) error {
switch {
case storage.IsNotFound(err):
return errors.NewNotFound(qualifiedResource, name)
case storage.IsUnreachable(err):
return errors.NewServerTimeout(qualifiedResource, "delete", 2) // TODO: make configurable or handled at a higher level
case storage.IsConflict(err), storage.IsExist(err), storage.IsInvalidObj(err):
return errors.NewConflict(qualifiedResource, name, err)
case storage.IsInternalError(err):
return errors.NewInternalError(err)
default:
return err
}
}
// InterpretWatchError converts a generic error on a watch
// operation into the appropriate API error.
func InterpretWatchError(err error, resource schema.GroupResource, name string) error {
switch {
case storage.IsInvalidError(err):
invalidError, _ := err.(storage.InvalidError)
return errors.NewInvalid(schema.GroupKind{Group: resource.Group, Kind: resource.Resource}, name, invalidError.Errs)
case storage.IsInternalError(err):
return errors.NewInternalError(err)
default:
return err
}
}
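A hedged sketch of how a registry-style caller might use the Interpret* helpers above. The import alias storeerr, the pods GroupResource, and getFromStorage are illustrative assumptions, not part of the vendored code.

package example

import (
    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/apimachinery/pkg/runtime/schema"
    storeerr "k8s.io/apiserver/pkg/storage/errors"
)

var podsResource = schema.GroupResource{Group: "", Resource: "pods"}

// getFromStorage stands in for a raw storage read.
func getFromStorage(name string) (runtime.Object, error) { return nil, nil }

// Get translates storage-level failures into API errors, e.g. a StorageError
// with ErrCodeKeyNotFound becomes a NotFound API error.
func Get(name string) (runtime.Object, error) {
    obj, err := getFromStorage(name)
    if err != nil {
        return nil, storeerr.InterpretGetError(err, podsResource, name)
    }
    return obj, nil
}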

4
vendor/k8s.io/apiserver/pkg/storage/etcd3/OWNERS generated vendored Normal file
View File

@ -0,0 +1,4 @@
# See the OWNERS docs at https://go.k8s.io/owners
reviewers:
- wojtek-t

162
vendor/k8s.io/apiserver/pkg/storage/etcd3/compact.go generated vendored Normal file
View File

@ -0,0 +1,162 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"context"
"strconv"
"sync"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"k8s.io/klog/v2"
)
const (
compactRevKey = "compact_rev_key"
)
var (
endpointsMapMu sync.Mutex
endpointsMap map[string]struct{}
)
func init() {
endpointsMap = make(map[string]struct{})
}
// StartCompactor starts a compactor in the background to compact old versions of keys that are no longer needed.
// By default, we keep the most recent 5 minutes of data and compact versions more than 5 minutes old.
// This should be enough for slow watchers and to tolerate bursts.
// TODO: We might keep a longer history (12h) in the future once storage API can take advantage of past version of keys.
func StartCompactor(ctx context.Context, client *clientv3.Client, compactInterval time.Duration) {
endpointsMapMu.Lock()
defer endpointsMapMu.Unlock()
// In one process, we can have only one compactor for one cluster.
// Currently we rely on endpoints to differentiate clusters.
for _, ep := range client.Endpoints() {
if _, ok := endpointsMap[ep]; ok {
klog.V(4).Infof("compactor already exists for endpoints %v", client.Endpoints())
return
}
}
for _, ep := range client.Endpoints() {
endpointsMap[ep] = struct{}{}
}
if compactInterval != 0 {
go compactor(ctx, client, compactInterval)
}
}
// compactor periodically compacts historical versions of keys in etcd.
// It will compact keys with versions older than the given interval.
// In other words, after compaction, etcd will only contain key versions set during the last interval.
// Any API call for older versions of keys will return an error.
// Interval is the time interval between each compaction. The first compaction happens after "interval".
func compactor(ctx context.Context, client *clientv3.Client, interval time.Duration) {
// Technical definitions:
// We have a special key in etcd defined as *compactRevKey*.
// compactRevKey's value will be set to the string of last compacted revision.
// compactRevKey's version will be used as logical time for comparison. The version is referred to as the compact time.
// Initially, because the key doesn't exist, the compact time (version) is 0.
//
// Algorithm:
// - Compare to see if (local compact_time) = (remote compact_time).
// - If yes, increment both local and remote compact_time, and do a compaction.
// - If not, set local to remote compact_time.
//
// Technical details/insights:
//
// The protocol here is lease based. If one compactor CASes successfully, the others learn of it when their own
// CAS fails later and will try again in 5 minutes. If an APIServer crashed, another one would "take over" the lease.
//
// For example, in the following diagram, we have a compactor C1 doing compaction in t1, t2. Another compactor C2
// at t1' (t1 < t1' < t2) would CAS fail, set its known oldRev to rev at t1', and try again in t2' (t2' > t2).
// If C1 crashed and wouldn't compact at t2, C2 would CAS successfully at t2'.
//
//                       oldRev(t2)   curRev(t2)
//                                  +
//   oldRev        curRev           |
//     +             +              |
//     |             |              |
//     |             |    t1'       |      t2'
// +---v-------------v----^---------v------^---->
//     t0            t1             t2
//
// We have the guarantees:
// - in normal cases, the interval is 5 minutes.
// - in failover, the interval is >5m and <10m
//
// FAQ:
// - What if time is not accurate? We don't care as long as someone did the compaction. Atomicity is ensured using
// etcd API.
// - What happens under heavy load? Initially, each apiserver will do only one compaction
// every 5 minutes. This is very unlikely to affect, or be affected by, server load.
var compactTime int64
var rev int64
var err error
for {
select {
case <-time.After(interval):
case <-ctx.Done():
return
}
compactTime, rev, err = compact(ctx, client, compactTime, rev)
if err != nil {
klog.Errorf("etcd: endpoint (%v) compact failed: %v", client.Endpoints(), err)
continue
}
}
}
// compact compacts etcd store and returns current rev.
// It will return the current compact time and global revision if no error occurred.
// Note that CAS fail will not incur any error.
func compact(ctx context.Context, client *clientv3.Client, t, rev int64) (int64, int64, error) {
resp, err := client.KV.Txn(ctx).If(
clientv3.Compare(clientv3.Version(compactRevKey), "=", t),
).Then(
clientv3.OpPut(compactRevKey, strconv.FormatInt(rev, 10)), // Expect side effect: increment Version
).Else(
clientv3.OpGet(compactRevKey),
).Commit()
if err != nil {
return t, rev, err
}
curRev := resp.Header.Revision
if !resp.Succeeded {
curTime := resp.Responses[0].GetResponseRange().Kvs[0].Version
return curTime, curRev, nil
}
curTime := t + 1
if rev == 0 {
// We don't compact on bootstrap.
return curTime, curRev, nil
}
if _, err = client.Compact(ctx, rev); err != nil {
return curTime, curRev, err
}
klog.V(4).Infof("etcd: compacted rev (%d), endpoints (%v)", rev, client.Endpoints())
return curTime, curRev, nil
}
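A hedged sketch of wiring the compactor into an apiserver-style process. The five-minute interval mirrors the default described above; the surrounding function is illustrative.

package example

import (
    "context"
    "time"

    clientv3 "go.etcd.io/etcd/client/v3"
    "k8s.io/apiserver/pkg/storage/etcd3"
)

func startCompaction(ctx context.Context, client *clientv3.Client) {
    // One compactor per set of endpoints per process; repeated calls with the
    // same endpoints are no-ops thanks to the endpointsMap guard above.
    etcd3.StartCompactor(ctx, client, 5*time.Minute)
}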

72
vendor/k8s.io/apiserver/pkg/storage/etcd3/errors.go generated vendored Normal file
View File

@ -0,0 +1,72 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apiserver/pkg/storage"
etcdrpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
)
func interpretWatchError(err error) error {
switch {
case err == etcdrpc.ErrCompacted:
return errors.NewResourceExpired("The resourceVersion for the provided watch is too old.")
}
return err
}
const (
expired string = "The resourceVersion for the provided list is too old."
continueExpired string = "The provided continue parameter is too old " +
"to display a consistent list result. You can start a new list without " +
"the continue parameter."
inconsistentContinue string = "The provided continue parameter is too old " +
"to display a consistent list result. You can start a new list without " +
"the continue parameter, or use the continue token in this response to " +
"retrieve the remainder of the results. Continuing with the provided " +
"token results in an inconsistent list - objects that were created, " +
"modified, or deleted between the time the first chunk was returned " +
"and now may show up in the list."
)
func interpretListError(err error, paging bool, continueKey, keyPrefix string) error {
switch {
case err == etcdrpc.ErrCompacted:
if paging {
return handleCompactedErrorForPaging(continueKey, keyPrefix)
}
return errors.NewResourceExpired(expired)
}
return err
}
func handleCompactedErrorForPaging(continueKey, keyPrefix string) error {
// continueToken.ResourceVersion=-1 means that the apiserver can
// continue the list at the latest resource version. We don't use rv=0
// for this purpose to distinguish from a bad token that has empty rv.
newToken, err := storage.EncodeContinue(continueKey, keyPrefix, -1)
if err != nil {
utilruntime.HandleError(err)
return errors.NewResourceExpired(continueExpired)
}
statusError := errors.NewResourceExpired(inconsistentContinue)
statusError.ErrStatus.ListMeta.Continue = newToken
return statusError
}

71
vendor/k8s.io/apiserver/pkg/storage/etcd3/event.go generated vendored Normal file
View File

@ -0,0 +1,71 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"fmt"
"go.etcd.io/etcd/api/v3/mvccpb"
clientv3 "go.etcd.io/etcd/client/v3"
)
type event struct {
key string
value []byte
prevValue []byte
rev int64
isDeleted bool
isCreated bool
isProgressNotify bool
}
// parseKV converts a KeyValue retrieved from an initial sync() listing to a synthetic isCreated event.
func parseKV(kv *mvccpb.KeyValue) *event {
return &event{
key: string(kv.Key),
value: kv.Value,
prevValue: nil,
rev: kv.ModRevision,
isDeleted: false,
isCreated: true,
}
}
func parseEvent(e *clientv3.Event) (*event, error) {
if !e.IsCreate() && e.PrevKv == nil {
// If the previous value is nil, error. One example of how this is possible is if the previous value has been compacted already.
return nil, fmt.Errorf("etcd event received with PrevKv=nil (key=%q, modRevision=%d, type=%s)", string(e.Kv.Key), e.Kv.ModRevision, e.Type.String())
}
ret := &event{
key: string(e.Kv.Key),
value: e.Kv.Value,
rev: e.Kv.ModRevision,
isDeleted: e.Type == clientv3.EventTypeDelete,
isCreated: e.IsCreate(),
}
if e.PrevKv != nil {
ret.prevValue = e.PrevKv.Value
}
return ret, nil
}
func progressNotifyEvent(rev int64) *event {
return &event{
rev: rev,
isProgressNotify: true,
}
}
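A hedged, in-package sketch (hypothetical helper, not part of the vendored file) showing how an initial range read is turned into the synthetic "created" events defined above.

// eventsFromInitialList converts KeyValues from an initial listing into events.
func eventsFromInitialList(kvs []*mvccpb.KeyValue) []*event {
    events := make([]*event, 0, len(kvs))
    for _, kv := range kvs {
        // Initial state is surfaced as synthetic isCreated events.
        events = append(events, parseKV(kv))
    }
    return events
}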

View File

@ -0,0 +1,40 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"encoding/json"
"fmt"
)
// etcdHealth encodes data returned from etcd /healthz handler.
type etcdHealth struct {
// Note this has to be public so the json library can modify it.
Health string `json:"health"`
}
// EtcdHealthCheck decodes data returned from etcd /healthz handler.
func EtcdHealthCheck(data []byte) error {
obj := etcdHealth{}
if err := json.Unmarshal(data, &obj); err != nil {
return err
}
if obj.Health != "true" {
return fmt.Errorf("Unhealthy status: %s", obj.Health)
}
return nil
}
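A brief, hedged usage sketch for EtcdHealthCheck; the JSON payloads are illustrative.

package example

import "k8s.io/apiserver/pkg/storage/etcd3"

func checkEtcdHealth() {
    // A healthy etcd reports {"health":"true"}; anything else yields an error.
    _ = etcd3.EtcdHealthCheck([]byte(`{"health":"true"}`))  // nil
    _ = etcd3.EtcdHealthCheck([]byte(`{"health":"false"}`)) // error: Unhealthy status: false
}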

View File

@ -0,0 +1,108 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"context"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
endpointsrequest "k8s.io/apiserver/pkg/endpoints/request"
)
// NewETCDLatencyTracker returns an implementation of
// clientv3.KV that times the calls from the specified
// 'delegate' KV instance in order to track latency incurred.
func NewETCDLatencyTracker(delegate clientv3.KV) clientv3.KV {
return &clientV3KVLatencyTracker{KV: delegate}
}
// clientV3KVLatencyTracker decorates a clientv3.KV instance and times
// each call so we can track the latency an API request incurs in etcd
// round trips (the time it takes to send data to etcd and get the
// complete response back)
//
// If an API request involves N (N>=1) round trips to etcd, then we will sum
// up the latency incurred in each round trip.
// It uses the context associated with the request in flight, so there
// is no state shared among the requests in flight, and so there is no
// concurrency overhead.
// If the goroutine executing the request handler makes concurrent calls
// to the underlying storage layer, that is protected since the latency
// tracking function TrackStorageLatency is thread safe.
//
// NOTE: Compact is an asynchronous process and is not associated with
//
// any request, so we will not be tracking its latency.
type clientV3KVLatencyTracker struct {
clientv3.KV
}
func (c *clientV3KVLatencyTracker) Put(ctx context.Context, key, val string, opts ...clientv3.OpOption) (*clientv3.PutResponse, error) {
startedAt := time.Now()
defer func() {
endpointsrequest.TrackStorageLatency(ctx, time.Since(startedAt))
}()
return c.KV.Put(ctx, key, val, opts...)
}
func (c *clientV3KVLatencyTracker) Get(ctx context.Context, key string, opts ...clientv3.OpOption) (*clientv3.GetResponse, error) {
startedAt := time.Now()
defer func() {
endpointsrequest.TrackStorageLatency(ctx, time.Since(startedAt))
}()
return c.KV.Get(ctx, key, opts...)
}
func (c *clientV3KVLatencyTracker) Delete(ctx context.Context, key string, opts ...clientv3.OpOption) (*clientv3.DeleteResponse, error) {
startedAt := time.Now()
defer func() {
endpointsrequest.TrackStorageLatency(ctx, time.Since(startedAt))
}()
return c.KV.Delete(ctx, key, opts...)
}
func (c *clientV3KVLatencyTracker) Do(ctx context.Context, op clientv3.Op) (clientv3.OpResponse, error) {
startedAt := time.Now()
defer func() {
endpointsrequest.TrackStorageLatency(ctx, time.Since(startedAt))
}()
return c.KV.Do(ctx, op)
}
func (c *clientV3KVLatencyTracker) Txn(ctx context.Context) clientv3.Txn {
return &clientV3TxnTracker{ctx: ctx, Txn: c.KV.Txn(ctx)}
}
type clientV3TxnTracker struct {
ctx context.Context
clientv3.Txn
}
func (t *clientV3TxnTracker) Commit() (*clientv3.TxnResponse, error) {
startedAt := time.Now()
defer func() {
endpointsrequest.TrackStorageLatency(t.ctx, time.Since(startedAt))
}()
return t.Txn.Commit()
}
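A hedged sketch of decorating an etcd client's KV with the latency tracker above so that per-request storage latency is accumulated through the request context; the wrapper function is illustrative.

package example

import (
    clientv3 "go.etcd.io/etcd/client/v3"
    "k8s.io/apiserver/pkg/storage/etcd3"
)

func wrapKV(client *clientv3.Client) clientv3.KV {
    // Every Get/Put/Delete/Do/Txn issued through the returned KV is timed.
    return etcd3.NewETCDLatencyTracker(client.KV)
}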

View File

@ -0,0 +1,131 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"context"
"sync"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
)
const (
defaultLeaseReuseDurationSeconds = 60
defaultLeaseMaxObjectCount = 1000
)
// LeaseManagerConfig is configuration for creating a lease manager.
type LeaseManagerConfig struct {
// ReuseDurationSeconds specifies time in seconds that each lease is reused
ReuseDurationSeconds int64
// MaxObjectCount specifies how many objects a lease can be attached to
MaxObjectCount int64
}
// NewDefaultLeaseManagerConfig creates a LeaseManagerConfig with default values
func NewDefaultLeaseManagerConfig() LeaseManagerConfig {
return LeaseManagerConfig{
ReuseDurationSeconds: defaultLeaseReuseDurationSeconds,
MaxObjectCount: defaultLeaseMaxObjectCount,
}
}
// leaseManager is used to manage leases requested from etcd. If a new write
// needs a lease that has similar expiration time to the previous one, the old
// lease will be reused to reduce the overhead of etcd, since lease operations
// are expensive. In the implementation, we only store one previous lease,
// since all the events have the same ttl.
type leaseManager struct {
client *clientv3.Client // etcd client used to grant leases
leaseMu sync.Mutex
prevLeaseID clientv3.LeaseID
prevLeaseExpirationTime time.Time
// The period of time in seconds and percent of TTL that each lease is
// reused. The minimum of them is used to avoid unreasonably large
// numbers.
leaseReuseDurationSeconds int64
leaseReuseDurationPercent float64
leaseMaxAttachedObjectCount int64
leaseAttachedObjectCount int64
}
// newDefaultLeaseManager creates a new lease manager using default setting.
func newDefaultLeaseManager(client *clientv3.Client, config LeaseManagerConfig) *leaseManager {
if config.MaxObjectCount <= 0 {
config.MaxObjectCount = defaultLeaseMaxObjectCount
}
return newLeaseManager(client, config.ReuseDurationSeconds, 0.05, config.MaxObjectCount)
}
// newLeaseManager creates a new lease manager with the given client, lease reuse
// duration in seconds and as a percentage, and the maximum number of objects that
// may be attached to a single lease. The percentage value x means x*100%.
func newLeaseManager(client *clientv3.Client, leaseReuseDurationSeconds int64, leaseReuseDurationPercent float64, maxObjectCount int64) *leaseManager {
return &leaseManager{
client: client,
leaseReuseDurationSeconds: leaseReuseDurationSeconds,
leaseReuseDurationPercent: leaseReuseDurationPercent,
leaseMaxAttachedObjectCount: maxObjectCount,
}
}
// GetLease returns a lease based on requested ttl: if the cached previous
// lease can be reused, reuse it; otherwise request a new one from etcd.
func (l *leaseManager) GetLease(ctx context.Context, ttl int64) (clientv3.LeaseID, error) {
now := time.Now()
l.leaseMu.Lock()
defer l.leaseMu.Unlock()
// check if previous lease can be reused
reuseDurationSeconds := l.getReuseDurationSecondsLocked(ttl)
valid := now.Add(time.Duration(ttl) * time.Second).Before(l.prevLeaseExpirationTime)
sufficient := now.Add(time.Duration(ttl+reuseDurationSeconds) * time.Second).After(l.prevLeaseExpirationTime)
// We count all operations that happened in the same lease, regardless of success or failure.
// Currently each GetLease call only attaches 1 object
l.leaseAttachedObjectCount++
if valid && sufficient && l.leaseAttachedObjectCount <= l.leaseMaxAttachedObjectCount {
return l.prevLeaseID, nil
}
// request a lease with a little extra ttl from etcd
ttl += reuseDurationSeconds
lcr, err := l.client.Lease.Grant(ctx, ttl)
if err != nil {
return clientv3.LeaseID(0), err
}
// cache the new lease id
l.prevLeaseID = lcr.ID
l.prevLeaseExpirationTime = now.Add(time.Duration(ttl) * time.Second)
// refresh count
metrics.UpdateLeaseObjectCount(l.leaseAttachedObjectCount)
l.leaseAttachedObjectCount = 1
return lcr.ID, nil
}
// getReuseDurationSecondsLocked returns the reusable duration in seconds
// based on the configuration. Lock has to be acquired before calling this
// function.
func (l *leaseManager) getReuseDurationSecondsLocked(ttl int64) int64 {
reuseDurationSeconds := int64(l.leaseReuseDurationPercent * float64(ttl))
if reuseDurationSeconds > l.leaseReuseDurationSeconds {
reuseDurationSeconds = l.leaseReuseDurationSeconds
}
return reuseDurationSeconds
}
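A hedged, in-package sketch (hypothetical helper) of attaching a TTL'd write to a possibly reused lease. With the default configuration above, a 900-second TTL yields a reuse window of min(0.05*900, 60) = 45 seconds.

// putWithTTL writes key=val under a lease obtained from the lease manager.
func putWithTTL(ctx context.Context, client *clientv3.Client, lm *leaseManager, key, val string, ttl int64) error {
    leaseID, err := lm.GetLease(ctx, ttl)
    if err != nil {
        return err
    }
    // The returned lease may be shared with other recent writes of similar TTL.
    _, err = client.Put(ctx, key, val, clientv3.WithLease(leaseID))
    return err
}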

90
vendor/k8s.io/apiserver/pkg/storage/etcd3/logger.go generated vendored Normal file
View File

@ -0,0 +1,90 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"fmt"
"google.golang.org/grpc/grpclog"
"k8s.io/klog/v2"
)
func init() {
grpclog.SetLoggerV2(klogWrapper{})
}
type klogWrapper struct{}
const klogWrapperDepth = 4
func (klogWrapper) Info(args ...interface{}) {
if klogV := klog.V(5); klogV.Enabled() {
klogV.InfoSDepth(klogWrapperDepth, fmt.Sprint(args...))
}
}
func (klogWrapper) Infoln(args ...interface{}) {
if klogV := klog.V(5); klogV.Enabled() {
klogV.InfoSDepth(klogWrapperDepth, fmt.Sprintln(args...))
}
}
func (klogWrapper) Infof(format string, args ...interface{}) {
if klogV := klog.V(5); klogV.Enabled() {
klog.V(5).InfoSDepth(klogWrapperDepth, fmt.Sprintf(format, args...))
}
}
func (klogWrapper) Warning(args ...interface{}) {
klog.WarningDepth(klogWrapperDepth, args...)
}
func (klogWrapper) Warningln(args ...interface{}) {
klog.WarningDepth(klogWrapperDepth, fmt.Sprintln(args...))
}
func (klogWrapper) Warningf(format string, args ...interface{}) {
klog.WarningDepth(klogWrapperDepth, fmt.Sprintf(format, args...))
}
func (klogWrapper) Error(args ...interface{}) {
klog.ErrorDepth(klogWrapperDepth, args...)
}
func (klogWrapper) Errorln(args ...interface{}) {
klog.ErrorDepth(klogWrapperDepth, fmt.Sprintln(args...))
}
func (klogWrapper) Errorf(format string, args ...interface{}) {
klog.ErrorDepth(klogWrapperDepth, fmt.Sprintf(format, args...))
}
func (klogWrapper) Fatal(args ...interface{}) {
klog.FatalDepth(klogWrapperDepth, args...)
}
func (klogWrapper) Fatalln(args ...interface{}) {
klog.FatalDepth(klogWrapperDepth, fmt.Sprintln(args...))
}
func (klogWrapper) Fatalf(format string, args ...interface{}) {
klog.FatalDepth(klogWrapperDepth, fmt.Sprintf(format, args...))
}
func (klogWrapper) V(l int) bool {
return bool(klog.V(klog.Level(l)).Enabled())
}

View File

@ -0,0 +1,4 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- logicalhan

View File

@ -0,0 +1,179 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"time"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level (https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with
* the metric stability policy.
*/
var (
etcdRequestLatency = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Name: "etcd_request_duration_seconds",
Help: "Etcd request latency in seconds for each operation and object type.",
// Etcd request latency in seconds for each operation and object type.
// This metric is used for verifying etcd api call latencies SLO
// keep consistent with apiserver metric 'requestLatencies' in
// staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go
Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
4, 5, 6, 8, 10, 15, 20, 30, 45, 60},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
objectCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "apiserver_storage_objects",
Help: "Number of stored objects at the time of last check split by kind.",
StabilityLevel: compbasemetrics.STABLE,
},
[]string{"resource"},
)
dbTotalSize = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Subsystem: "apiserver",
Name: "storage_db_total_size_in_bytes",
Help: "Total size of the storage database file physically allocated in bytes.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"endpoint"},
)
etcdBookmarkCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "etcd_bookmark_counts",
Help: "Number of etcd bookmarks (progress notify events) split by kind.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
etcdLeaseObjectCounts = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Name: "etcd_lease_object_counts",
Help: "Number of objects attached to a single etcd lease.",
Buckets: []float64{10, 50, 100, 500, 1000, 2500, 5000},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{},
)
listStorageCount = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "apiserver_storage_list_total",
Help: "Number of LIST requests served from storage",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
listStorageNumFetched = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "apiserver_storage_list_fetched_objects_total",
Help: "Number of objects read from storage in the course of serving a LIST request",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
listStorageNumSelectorEvals = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "apiserver_storage_list_evaluated_objects_total",
Help: "Number of objects tested in the course of serving a LIST request from storage",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
listStorageNumReturned = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "apiserver_storage_list_returned_objects_total",
Help: "Number of objects returned for a LIST request from storage",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
)
var registerMetrics sync.Once
// Register all metrics.
func Register() {
// Register the metrics.
registerMetrics.Do(func() {
legacyregistry.MustRegister(etcdRequestLatency)
legacyregistry.MustRegister(objectCounts)
legacyregistry.MustRegister(dbTotalSize)
legacyregistry.MustRegister(etcdBookmarkCounts)
legacyregistry.MustRegister(etcdLeaseObjectCounts)
legacyregistry.MustRegister(listStorageCount)
legacyregistry.MustRegister(listStorageNumFetched)
legacyregistry.MustRegister(listStorageNumSelectorEvals)
legacyregistry.MustRegister(listStorageNumReturned)
})
}
// UpdateObjectCount sets the apiserver_storage_objects metric.
func UpdateObjectCount(resourcePrefix string, count int64) {
objectCounts.WithLabelValues(resourcePrefix).Set(float64(count))
}
// RecordEtcdRequestLatency sets the etcd_request_duration_seconds metrics.
func RecordEtcdRequestLatency(verb, resource string, startTime time.Time) {
etcdRequestLatency.WithLabelValues(verb, resource).Observe(sinceInSeconds(startTime))
}
// RecordEtcdBookmark updates the etcd_bookmark_counts metric.
func RecordEtcdBookmark(resource string) {
etcdBookmarkCounts.WithLabelValues(resource).Inc()
}
// Reset resets the etcd_request_duration_seconds metric.
func Reset() {
etcdRequestLatency.Reset()
}
// sinceInSeconds gets the time since the specified start in seconds.
func sinceInSeconds(start time.Time) float64 {
return time.Since(start).Seconds()
}
// UpdateEtcdDbSize sets the apiserver_storage_db_total_size_in_bytes metric.
func UpdateEtcdDbSize(ep string, size int64) {
dbTotalSize.WithLabelValues(ep).Set(float64(size))
}
// UpdateLeaseObjectCount sets the etcd_lease_object_counts metric.
func UpdateLeaseObjectCount(count int64) {
// Currently we only store one previous lease, since all the events have the same ttl.
// See pkg/storage/etcd3/lease_manager.go
etcdLeaseObjectCounts.WithLabelValues().Observe(float64(count))
}
// RecordStorageListMetrics records various metrics of the cost to serve a LIST request from storage
func RecordStorageListMetrics(resource string, numFetched, numEvald, numReturned int) {
listStorageCount.WithLabelValues(resource).Inc()
listStorageNumFetched.WithLabelValues(resource).Add(float64(numFetched))
listStorageNumSelectorEvals.WithLabelValues(resource).Add(float64(numEvald))
listStorageNumReturned.WithLabelValues(resource).Add(float64(numReturned))
}
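A hedged sketch of recording request latency with the helpers above; the verb/resource labels and the doGet callback are illustrative.

package example

import (
    "time"

    "k8s.io/apiserver/pkg/storage/etcd3/metrics"
)

func timedGet(doGet func() error) error {
    metrics.Register() // safe to call repeatedly; guarded by sync.Once
    start := time.Now()
    err := doGet()
    metrics.RecordEtcdRequestLatency("get", "pods", start)
    return err
}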

1028
vendor/k8s.io/apiserver/pkg/storage/etcd3/store.go generated vendored Normal file

File diff suppressed because it is too large

466
vendor/k8s.io/apiserver/pkg/storage/etcd3/watcher.go generated vendored Normal file
View File

@ -0,0 +1,466 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package etcd3
import (
"context"
"fmt"
"os"
"reflect"
"strconv"
"strings"
"sync"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/value"
utilflowcontrol "k8s.io/apiserver/pkg/util/flowcontrol"
clientv3 "go.etcd.io/etcd/client/v3"
"k8s.io/klog/v2"
)
const (
// We have set a buffer in order to reduce the number of context switches.
incomingBufSize = 100
outgoingBufSize = 100
)
// fatalOnDecodeError is used during testing to panic the server if watcher encounters a decoding error
var fatalOnDecodeError = false
func init() {
// check to see if we are running in a test environment
TestOnlySetFatalOnDecodeError(true)
fatalOnDecodeError, _ = strconv.ParseBool(os.Getenv("KUBE_PANIC_WATCH_DECODE_ERROR"))
}
// TestOnlySetFatalOnDecodeError should only be used for cases where decode errors are expected and need to be tested. e.g. conversion webhooks.
func TestOnlySetFatalOnDecodeError(b bool) {
fatalOnDecodeError = b
}
type watcher struct {
client *clientv3.Client
codec runtime.Codec
newFunc func() runtime.Object
objectType string
groupResource schema.GroupResource
versioner storage.Versioner
}
// watchChan implements watch.Interface.
type watchChan struct {
watcher *watcher
transformer value.Transformer
key string
initialRev int64
recursive bool
progressNotify bool
internalPred storage.SelectionPredicate
ctx context.Context
cancel context.CancelFunc
incomingEventChan chan *event
resultChan chan watch.Event
errChan chan error
}
func newWatcher(client *clientv3.Client, codec runtime.Codec, groupResource schema.GroupResource, newFunc func() runtime.Object, versioner storage.Versioner) *watcher {
res := &watcher{
client: client,
codec: codec,
groupResource: groupResource,
newFunc: newFunc,
versioner: versioner,
}
if newFunc == nil {
res.objectType = "<unknown>"
} else {
res.objectType = reflect.TypeOf(newFunc()).String()
}
return res
}
// Watch watches on a key and returns a watch.Interface that transfers relevant notifications.
// If rev is zero, it will return the existing object(s) and then start watching from
// the maximum revision+1 from returned objects.
// If rev is non-zero, it will watch events that happened after the given revision.
// If recursive is false, it watches on given key.
// If recursive is true, it watches any children and directories under the key, excluding the root key itself.
// pred must be non-nil. Only changes that match pred will be returned.
func (w *watcher) Watch(ctx context.Context, key string, rev int64, recursive, progressNotify bool, transformer value.Transformer, pred storage.SelectionPredicate) (watch.Interface, error) {
if recursive && !strings.HasSuffix(key, "/") {
key += "/"
}
wc := w.createWatchChan(ctx, key, rev, recursive, progressNotify, transformer, pred)
go wc.run()
// For etcd watch we don't have an easy way to answer whether the watch
// has already caught up. So in the initial version (given that watchcache
// is by default enabled for all resources but Events), we just deliver
// the initialization signal immediately. Improving this will be explored
// in the future.
utilflowcontrol.WatchInitialized(ctx)
return wc, nil
}
func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, recursive, progressNotify bool, transformer value.Transformer, pred storage.SelectionPredicate) *watchChan {
wc := &watchChan{
watcher: w,
transformer: transformer,
key: key,
initialRev: rev,
recursive: recursive,
progressNotify: progressNotify,
internalPred: pred,
incomingEventChan: make(chan *event, incomingBufSize),
resultChan: make(chan watch.Event, outgoingBufSize),
errChan: make(chan error, 1),
}
if pred.Empty() {
// The filter doesn't filter out any object.
wc.internalPred = storage.Everything
}
// The etcd server waits until it cannot find a leader for 3 election
// timeouts to cancel existing streams. 3 is currently a hard coded
// constant. The election timeout defaults to 1000ms. If the cluster is
// healthy, when the leader is stopped, the leadership transfer should be
// smooth. (leader transfers its leadership before stopping). If leader is
// hard killed, other servers will take an election timeout to realize
// leader lost and start campaign.
wc.ctx, wc.cancel = context.WithCancel(clientv3.WithRequireLeader(ctx))
return wc
}
func (wc *watchChan) run() {
watchClosedCh := make(chan struct{})
go wc.startWatching(watchClosedCh)
var resultChanWG sync.WaitGroup
resultChanWG.Add(1)
go wc.processEvent(&resultChanWG)
select {
case err := <-wc.errChan:
if err == context.Canceled {
break
}
errResult := transformErrorToEvent(err)
if errResult != nil {
// error result is guaranteed to be received by user before closing ResultChan.
select {
case wc.resultChan <- *errResult:
case <-wc.ctx.Done(): // user has given up all results
}
}
case <-watchClosedCh:
case <-wc.ctx.Done(): // user cancel
}
// We use wc.ctx to reap all goroutines. Under whatever condition, we should stop them all.
// It's fine to double cancel.
wc.cancel()
// we need to wait until resultChan wouldn't be used anymore
resultChanWG.Wait()
close(wc.resultChan)
}
func (wc *watchChan) Stop() {
wc.cancel()
}
func (wc *watchChan) ResultChan() <-chan watch.Event {
return wc.resultChan
}
// sync tries to retrieve existing data and send them to process.
// The revision to watch will be set to the revision in response.
// All events sent will have isCreated=true
func (wc *watchChan) sync() error {
opts := []clientv3.OpOption{}
if wc.recursive {
opts = append(opts, clientv3.WithPrefix())
}
getResp, err := wc.watcher.client.Get(wc.ctx, wc.key, opts...)
if err != nil {
return err
}
wc.initialRev = getResp.Header.Revision
for _, kv := range getResp.Kvs {
wc.sendEvent(parseKV(kv))
}
return nil
}
// logWatchChannelErr checks whether the error is about mvcc revision compaction which is regarded as warning
func logWatchChannelErr(err error) {
if !strings.Contains(err.Error(), "mvcc: required revision has been compacted") {
klog.Errorf("watch chan error: %v", err)
} else {
klog.Warningf("watch chan error: %v", err)
}
}
// startWatching does:
// - get current objects if initialRev=0; set initialRev to current rev
// - watch on given key and send events to process.
func (wc *watchChan) startWatching(watchClosedCh chan struct{}) {
if wc.initialRev == 0 {
if err := wc.sync(); err != nil {
klog.Errorf("failed to sync with latest state: %v", err)
wc.sendError(err)
return
}
}
opts := []clientv3.OpOption{clientv3.WithRev(wc.initialRev + 1), clientv3.WithPrevKV()}
if wc.recursive {
opts = append(opts, clientv3.WithPrefix())
}
if wc.progressNotify {
opts = append(opts, clientv3.WithProgressNotify())
}
wch := wc.watcher.client.Watch(wc.ctx, wc.key, opts...)
for wres := range wch {
if wres.Err() != nil {
err := wres.Err()
// If there is an error on server (e.g. compaction), the channel will return it before closed.
logWatchChannelErr(err)
wc.sendError(err)
return
}
if wres.IsProgressNotify() {
wc.sendEvent(progressNotifyEvent(wres.Header.GetRevision()))
metrics.RecordEtcdBookmark(wc.watcher.groupResource.String())
continue
}
for _, e := range wres.Events {
parsedEvent, err := parseEvent(e)
if err != nil {
logWatchChannelErr(err)
wc.sendError(err)
return
}
wc.sendEvent(parsedEvent)
}
}
// When we come to this point, it's only possible that client side ends the watch.
// e.g. cancel the context, close the client.
// If this watch chan is broken and context isn't cancelled, other goroutines will still hang.
// We should notify the main thread that this goroutine has exited.
close(watchClosedCh)
}
// processEvent processes events from etcd watcher and sends results to resultChan.
func (wc *watchChan) processEvent(wg *sync.WaitGroup) {
defer wg.Done()
for {
select {
case e := <-wc.incomingEventChan:
res := wc.transform(e)
if res == nil {
continue
}
if len(wc.resultChan) == outgoingBufSize {
klog.V(3).InfoS("Fast watcher, slow processing. Probably caused by slow dispatching events to watchers", "outgoingEvents", outgoingBufSize, "objectType", wc.watcher.objectType, "groupResource", wc.watcher.groupResource)
}
// If the user can't receive results fast enough, we also block incoming events from the watcher,
// because buffering events locally would only increase memory usage.
// The worst case would be closing the fast watcher.
select {
case wc.resultChan <- *res:
case <-wc.ctx.Done():
return
}
case <-wc.ctx.Done():
return
}
}
}
func (wc *watchChan) filter(obj runtime.Object) bool {
if wc.internalPred.Empty() {
return true
}
matched, err := wc.internalPred.Matches(obj)
return err == nil && matched
}
func (wc *watchChan) acceptAll() bool {
return wc.internalPred.Empty()
}
// transform transforms an event into a result for user if not filtered.
func (wc *watchChan) transform(e *event) (res *watch.Event) {
curObj, oldObj, err := wc.prepareObjs(e)
if err != nil {
klog.Errorf("failed to prepare current and previous objects: %v", err)
wc.sendError(err)
return nil
}
switch {
case e.isProgressNotify:
if wc.watcher.newFunc == nil {
return nil
}
object := wc.watcher.newFunc()
if err := wc.watcher.versioner.UpdateObject(object, uint64(e.rev)); err != nil {
klog.Errorf("failed to propagate object version: %v", err)
return nil
}
res = &watch.Event{
Type: watch.Bookmark,
Object: object,
}
case e.isDeleted:
if !wc.filter(oldObj) {
return nil
}
res = &watch.Event{
Type: watch.Deleted,
Object: oldObj,
}
case e.isCreated:
if !wc.filter(curObj) {
return nil
}
res = &watch.Event{
Type: watch.Added,
Object: curObj,
}
default:
if wc.acceptAll() {
res = &watch.Event{
Type: watch.Modified,
Object: curObj,
}
return res
}
curObjPasses := wc.filter(curObj)
oldObjPasses := wc.filter(oldObj)
switch {
case curObjPasses && oldObjPasses:
res = &watch.Event{
Type: watch.Modified,
Object: curObj,
}
case curObjPasses && !oldObjPasses:
res = &watch.Event{
Type: watch.Added,
Object: curObj,
}
case !curObjPasses && oldObjPasses:
res = &watch.Event{
Type: watch.Deleted,
Object: oldObj,
}
}
}
return res
}
func transformErrorToEvent(err error) *watch.Event {
err = interpretWatchError(err)
if _, ok := err.(apierrors.APIStatus); !ok {
err = apierrors.NewInternalError(err)
}
status := err.(apierrors.APIStatus).Status()
return &watch.Event{
Type: watch.Error,
Object: &status,
}
}
func (wc *watchChan) sendError(err error) {
select {
case wc.errChan <- err:
case <-wc.ctx.Done():
}
}
func (wc *watchChan) sendEvent(e *event) {
if len(wc.incomingEventChan) == incomingBufSize {
klog.V(3).InfoS("Fast watcher, slow processing. Probably caused by slow decoding, user not receiving fast, or other processing logic", "incomingEvents", incomingBufSize, "objectType", wc.watcher.objectType, "groupResource", wc.watcher.groupResource)
}
select {
case wc.incomingEventChan <- e:
case <-wc.ctx.Done():
}
}
func (wc *watchChan) prepareObjs(e *event) (curObj runtime.Object, oldObj runtime.Object, err error) {
if e.isProgressNotify {
// progressNotify events contain neither the current nor the previous object version,
return nil, nil, nil
}
if !e.isDeleted {
data, _, err := wc.transformer.TransformFromStorage(wc.ctx, e.value, authenticatedDataString(e.key))
if err != nil {
return nil, nil, err
}
curObj, err = decodeObj(wc.watcher.codec, wc.watcher.versioner, data, e.rev)
if err != nil {
return nil, nil, err
}
}
// We need to decode prevValue only if this is a deletion event or
// the underlying filter doesn't accept all objects (otherwise we
// know the filter for the previous object would return true, and
// we need the old object only to compute whether it was filtered
// out before).
if len(e.prevValue) > 0 && (e.isDeleted || !wc.acceptAll()) {
data, _, err := wc.transformer.TransformFromStorage(wc.ctx, e.prevValue, authenticatedDataString(e.key))
if err != nil {
return nil, nil, err
}
// Note that this sends the *old* object with the etcd revision for the time at
// which it gets deleted.
oldObj, err = decodeObj(wc.watcher.codec, wc.watcher.versioner, data, e.rev)
if err != nil {
return nil, nil, err
}
}
return curObj, oldObj, nil
}
func decodeObj(codec runtime.Codec, versioner storage.Versioner, data []byte, rev int64) (_ runtime.Object, err error) {
obj, err := runtime.Decode(codec, []byte(data))
if err != nil {
if fatalOnDecodeError {
// we are running in a test environment and thus an
// error here is due to a coder mistake if the defer
// does not catch it
panic(err)
}
return nil, err
}
// ensure resource version is set on the object we load from etcd
if err := versioner.UpdateObject(obj, uint64(rev)); err != nil {
return nil, fmt.Errorf("failure to version api object (%d) %#v: %v", rev, obj, err)
}
return obj, nil
}
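As a rough, self-contained sketch of the predicate logic in the default branch of transform above: the pair of filter results for the old and new object decides which event type, if any, the user sees. The function and package names below are illustrative and not part of the vendored code.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/watch"
)

// eventTypeForUpdate mirrors the default case of transform: given whether the
// previous and current object versions pass the watch predicate, it decides
// which event type, if any, the watcher should emit.
func eventTypeForUpdate(oldObjPasses, curObjPasses bool) (watch.EventType, bool) {
    switch {
    case oldObjPasses && curObjPasses:
        return watch.Modified, true
    case !oldObjPasses && curObjPasses:
        return watch.Added, true
    case oldObjPasses && !curObjPasses:
        return watch.Deleted, true
    default:
        // The object never matched the predicate; the update is invisible.
        return "", false
    }
}

func main() {
    eventType, visible := eventTypeForUpdate(false, true)
    fmt.Println(eventType, visible) // ADDED true
}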

271
vendor/k8s.io/apiserver/pkg/storage/interfaces.go generated vendored Normal file
View File

@ -0,0 +1,271 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package storage
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/watch"
)
// Versioner abstracts setting and retrieving metadata fields from database response
// onto the object or list. It is required to maintain storage invariants - updating an
// object twice with the same data except for the ResourceVersion and SelfLink must be
// a no-op. A resourceVersion of type uint64 is a 'raw' resourceVersion,
// intended to be sent directly to or from the backend. A resourceVersion of
// type string is a 'safe' resourceVersion, intended for consumption by users.
type Versioner interface {
// UpdateObject sets storage metadata into an API object. Returns an error if the object
// cannot be updated correctly. May return nil if the requested object does not need metadata
// from database.
UpdateObject(obj runtime.Object, resourceVersion uint64) error
// UpdateList sets the resource version into an API list object. Returns an error if the object
// cannot be updated correctly. May return nil if the requested object does not need metadata from
// database. continueValue is optional and indicates that more results are available if the client
// passes that value to the server in a subsequent call. remainingItemCount indicates the number
// of remaining objects if the list is partial. The remainingItemCount field is omitted during
// serialization if it is set to nil.
UpdateList(obj runtime.Object, resourceVersion uint64, continueValue string, remainingItemCount *int64) error
// PrepareObjectForStorage should set SelfLink and ResourceVersion to the empty value. Should
// return an error if the specified object cannot be updated.
PrepareObjectForStorage(obj runtime.Object) error
// ObjectResourceVersion returns the resource version (for persistence) of the specified object.
// Should return an error if the specified object does not have a persistable version.
ObjectResourceVersion(obj runtime.Object) (uint64, error)
// ParseResourceVersion takes a resource version argument and
// converts it to the numeric form used by the storage backend; for watch this is the value we should pass to helper.Watch().
// Because resourceVersion is an opaque value, the default watch
// behavior for non-zero watch is to watch the next value (if you pass
// "1", you will see updates from "2" onwards).
ParseResourceVersion(resourceVersion string) (uint64, error)
}
// ResponseMeta contains information about the database metadata that is associated with
// an object. It abstracts the actual underlying objects to prevent coupling with concrete
// database and to improve testability.
type ResponseMeta struct {
// TTL is the time to live of the node that contained the returned object. It may be
// zero or negative in some cases (objects may be expired after the requested
// expiration time due to server lag).
TTL int64
// The resource version of the node that contained the returned object.
ResourceVersion uint64
}
// IndexerFunc is a function that for a given object computes
// `<value of an index>` for a particular `<index>`.
type IndexerFunc func(obj runtime.Object) string
// IndexerFuncs is a mapping from `<index name>` to function that
// for a given object computes `<value for that index>`.
type IndexerFuncs map[string]IndexerFunc
// Everything accepts all objects.
var Everything = SelectionPredicate{
Label: labels.Everything(),
Field: fields.Everything(),
}
// MatchValue defines a pair (`<index name>`, `<value for that index>`).
type MatchValue struct {
IndexName string
Value string
}
// Pass an UpdateFunc to Interface.GuaranteedUpdate to make an update
// that is guaranteed to succeed.
// See the comment for GuaranteedUpdate for more details.
type UpdateFunc func(input runtime.Object, res ResponseMeta) (output runtime.Object, ttl *uint64, err error)
// ValidateObjectFunc is a function to act on a given object. An error may be returned
// if the hook cannot be completed. The function may NOT transform the provided
// object.
type ValidateObjectFunc func(ctx context.Context, obj runtime.Object) error
// ValidateAllObjectFunc is an "admit everything" instance of ValidateObjectFunc.
func ValidateAllObjectFunc(ctx context.Context, obj runtime.Object) error {
return nil
}
// Preconditions must be fulfilled before an operation (update, delete, etc.) is carried out.
type Preconditions struct {
// Specifies the target UID.
// +optional
UID *types.UID `json:"uid,omitempty"`
// Specifies the target ResourceVersion
// +optional
ResourceVersion *string `json:"resourceVersion,omitempty"`
}
// NewUIDPreconditions returns a Preconditions with UID set.
func NewUIDPreconditions(uid string) *Preconditions {
u := types.UID(uid)
return &Preconditions{UID: &u}
}
func (p *Preconditions) Check(key string, obj runtime.Object) error {
if p == nil {
return nil
}
objMeta, err := meta.Accessor(obj)
if err != nil {
return NewInternalErrorf(
"can't enforce preconditions %v on un-introspectable object %v, got error: %v",
*p,
obj,
err)
}
if p.UID != nil && *p.UID != objMeta.GetUID() {
err := fmt.Sprintf(
"Precondition failed: UID in precondition: %v, UID in object meta: %v",
*p.UID,
objMeta.GetUID())
return NewInvalidObjError(key, err)
}
if p.ResourceVersion != nil && *p.ResourceVersion != objMeta.GetResourceVersion() {
err := fmt.Sprintf(
"Precondition failed: ResourceVersion in precondition: %v, ResourceVersion in object meta: %v",
*p.ResourceVersion,
objMeta.GetResourceVersion())
return NewInvalidObjError(key, err)
}
return nil
}
// Interface offers a common interface for object marshaling/unmarshaling operations and
// hides all the storage-related operations behind it.
type Interface interface {
// Returns Versioner associated with this interface.
Versioner() Versioner
// Create adds a new object at a key unless it already exists. 'ttl' is time-to-live
// in seconds (0 means forever). If no error is returned and out is not nil, out will be
// set to the read value from database.
Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error
// Delete removes the specified key and returns the value that existed at that spot.
// If key didn't exist, it will return NotFound storage error.
// If 'cachedExistingObject' is non-nil, it can be used as a suggestion about the
// current version of the object to avoid read operation from storage to get it.
// However, the implementations have to retry in case suggestion is stale.
Delete(
ctx context.Context, key string, out runtime.Object, preconditions *Preconditions,
validateDeletion ValidateObjectFunc, cachedExistingObject runtime.Object) error
// Watch begins watching the specified key. Events are decoded into API objects,
// and any items selected by 'p' are sent down to returned watch.Interface.
// resourceVersion may be used to specify what version to begin watching,
// which should be the current resourceVersion, and no longer rv+1
// (e.g. reconnecting without missing any updates).
// If resource version is "0", this interface will get the current object at the given key
// and send it in an "ADDED" event, before watch starts.
Watch(ctx context.Context, key string, opts ListOptions) (watch.Interface, error)
// Get unmarshals object found at key into objPtr. On a not found error, will either
// return a zero object of the requested type, or an error, depending on 'opts.ignoreNotFound'.
// Treats empty responses and nil response nodes exactly like a not found error.
// The returned contents may be delayed, but it is guaranteed that they will
// match 'opts.ResourceVersion' according to 'opts.ResourceVersionMatch'.
Get(ctx context.Context, key string, opts GetOptions, objPtr runtime.Object) error
// GetList unmarshalls objects found at key into a *List api object (an object
// that satisfies runtime.IsList definition).
// If 'opts.Recursive' is false, 'key' is used as an exact match. If `opts.Recursive'
// is true, 'key' is used as a prefix.
// The returned contents may be delayed, but it is guaranteed that they will
// match 'opts.ResourceVersion' according to 'opts.ResourceVersionMatch'.
GetList(ctx context.Context, key string, opts ListOptions, listObj runtime.Object) error
// GuaranteedUpdate keeps calling 'tryUpdate()' to update key 'key' (of type 'destination')
// retrying the update until success if there is an index conflict.
// Note that object passed to tryUpdate may change across invocations of tryUpdate() if
// other writers are simultaneously updating it, so tryUpdate() needs to take into account
// the current contents of the object when deciding how the update object should look.
// If the key doesn't exist, it will return NotFound storage error if ignoreNotFound=false
// else `destination` will be set to the zero value of its type.
// If the eventual successful invocation of `tryUpdate` returns an output with the same serialized
// contents as the input, it won't perform any update, but instead set `destination` to an object with those
// contents.
// If 'cachedExistingObject' is non-nil, it can be used as a suggestion about the
// current version of the object to avoid read operation from storage to get it.
// However, the implementations have to retry in case suggestion is stale.
//
// Example:
//
// s := /* implementation of Interface */
// err := s.GuaranteedUpdate(
// "myKey", &MyType{}, true, preconditions,
// func(input runtime.Object, res ResponseMeta) (runtime.Object, *uint64, error) {
// // Before each invocation of the user defined function, "input" is reset to
// // current contents for "myKey" in database.
// curr := input.(*MyType) // Guaranteed to succeed.
//
// // Make the modification
// curr.Counter++
//
// // Return the modified object - return an error to stop iterating. Return
// // a uint64 to alter the TTL on the object, or nil to keep it the same value.
// return curr, nil, nil
// }, cachedExistingObject
// )
GuaranteedUpdate(
ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool,
preconditions *Preconditions, tryUpdate UpdateFunc, cachedExistingObject runtime.Object) error
// Count returns number of different entries under the key (generally being path prefix).
Count(key string) (int64, error)
}
// GetOptions provides the options that may be provided for storage get operations.
type GetOptions struct {
// IgnoreNotFound determines what is returned if the requested object is not found. If
// true, a zero object is returned. If false, an error is returned.
IgnoreNotFound bool
// ResourceVersion provides a resource version constraint to apply to the get operation
// as a "not older than" constraint: the result contains data at least as new as the provided
// ResourceVersion. The newest available data is preferred, but any data not older than this
// ResourceVersion may be served.
ResourceVersion string
}
// ListOptions provides the options that may be provided for storage list operations.
type ListOptions struct {
// ResourceVersion provides a resource version constraint to apply to the list operation
// as a "not older than" constraint: the result contains data at least as new as the provided
// ResourceVersion. The newest available data is preferred, but any data not older than this
// ResourceVersion may be served.
ResourceVersion string
// ResourceVersionMatch provides the rule for how the resource version constraint applies. If set
// to the default value "" the legacy resource version semantics apply.
ResourceVersionMatch metav1.ResourceVersionMatch
// Predicate provides the selection rules for the list operation.
Predicate SelectionPredicate
// Recursive determines whether the list or watch is defined for a single object located at the
// given key, or for the whole set of objects with the given key as a prefix.
Recursive bool
// ProgressNotify determines whether storage-originated bookmark (progress notify) events should
// be delivered to the users. The option is ignored for non-watch requests.
ProgressNotify bool
}
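A minimal sketch of how a caller might assemble ListOptions from the types above for a recursive, "not older than" list; the resource version value is illustrative.

package main

import (
    "fmt"

    "k8s.io/apiserver/pkg/storage"
)

// buildListOptions assembles ListOptions for a recursive list that tolerates
// slightly stale data: ResourceVersion acts as a "not older than" constraint
// and Everything selects all objects.
func buildListOptions(rv string) storage.ListOptions {
    return storage.ListOptions{
        ResourceVersion: rv,
        Predicate:       storage.Everything,
        Recursive:       true,
    }
}

func main() {
    opts := buildListOptions("0")
    fmt.Printf("recursive=%v rv=%q\n", opts.Recursive, opts.ResourceVersion)
}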

View File

@ -0,0 +1,159 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package storage
import (
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
)
// AttrFunc returns label and field sets and the uninitialized flag for List or Watch to match.
// In any failure to parse given object, it returns error.
type AttrFunc func(obj runtime.Object) (labels.Set, fields.Set, error)
// FieldMutationFunc allows the mutation of the field selection fields. It is mutating to
// avoid the extra allocation on this common path
type FieldMutationFunc func(obj runtime.Object, fieldSet fields.Set) error
func DefaultClusterScopedAttr(obj runtime.Object) (labels.Set, fields.Set, error) {
metadata, err := meta.Accessor(obj)
if err != nil {
return nil, nil, err
}
fieldSet := fields.Set{
"metadata.name": metadata.GetName(),
}
return labels.Set(metadata.GetLabels()), fieldSet, nil
}
func DefaultNamespaceScopedAttr(obj runtime.Object) (labels.Set, fields.Set, error) {
metadata, err := meta.Accessor(obj)
if err != nil {
return nil, nil, err
}
fieldSet := fields.Set{
"metadata.name": metadata.GetName(),
"metadata.namespace": metadata.GetNamespace(),
}
return labels.Set(metadata.GetLabels()), fieldSet, nil
}
func (f AttrFunc) WithFieldMutation(fieldMutator FieldMutationFunc) AttrFunc {
return func(obj runtime.Object) (labels.Set, fields.Set, error) {
labelSet, fieldSet, err := f(obj)
if err != nil {
return nil, nil, err
}
if err := fieldMutator(obj, fieldSet); err != nil {
return nil, nil, err
}
return labelSet, fieldSet, nil
}
}
// SelectionPredicate is used to represent the way to select objects from api storage.
type SelectionPredicate struct {
Label labels.Selector
Field fields.Selector
GetAttrs AttrFunc
IndexLabels []string
IndexFields []string
Limit int64
Continue string
AllowWatchBookmarks bool
}
// Matches returns true if the given object's labels and fields (as
// returned by s.GetAttrs) match s.Label and s.Field. An error is
// returned if s.GetAttrs fails.
func (s *SelectionPredicate) Matches(obj runtime.Object) (bool, error) {
if s.Empty() {
return true, nil
}
labels, fields, err := s.GetAttrs(obj)
if err != nil {
return false, err
}
matched := s.Label.Matches(labels)
if matched && s.Field != nil {
matched = matched && s.Field.Matches(fields)
}
return matched, nil
}
// MatchesObjectAttributes returns true if the given labels and fields
// match s.Label and s.Field.
func (s *SelectionPredicate) MatchesObjectAttributes(l labels.Set, f fields.Set) bool {
if s.Label.Empty() && s.Field.Empty() {
return true
}
matched := s.Label.Matches(l)
if matched && s.Field != nil {
matched = (matched && s.Field.Matches(f))
}
return matched
}
// MatchesSingle will return (name, true) if and only if s.Field matches on the object's
// name.
func (s *SelectionPredicate) MatchesSingle() (string, bool) {
if len(s.Continue) > 0 {
return "", false
}
// TODO: should be namespace.name
if name, ok := s.Field.RequiresExactMatch("metadata.name"); ok {
return name, true
}
return "", false
}
// Empty returns true if the predicate performs no filtering.
func (s *SelectionPredicate) Empty() bool {
return s.Label.Empty() && s.Field.Empty()
}
// For any index defined by IndexFields, if a matcher can match only (a subset)
// of objects that return <value> for a given index, a pair (<index name>, <value>)
// will be returned.
func (s *SelectionPredicate) MatcherIndex() []MatchValue {
var result []MatchValue
for _, field := range s.IndexFields {
if value, ok := s.Field.RequiresExactMatch(field); ok {
result = append(result, MatchValue{IndexName: FieldIndex(field), Value: value})
}
}
for _, label := range s.IndexLabels {
if value, ok := s.Label.RequiresExactMatch(label); ok {
result = append(result, MatchValue{IndexName: LabelIndex(label), Value: value})
}
}
return result
}
// LabelIndex adds a prefix for a label index.
func LabelIndex(label string) string {
return "l:" + label
}
// FieldIndex adds a prefix for a field index.
func FieldIndex(field string) string {
return "f:" + field
}
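A small sketch of SelectionPredicate in use, assuming fields.OneTermEqualSelector and labels.Everything from apimachinery; the object name "my-pod" is made up.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/fields"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/apiserver/pkg/storage"
)

func main() {
    // Select a single object by metadata.name and declare metadata.name as an
    // indexed field so MatcherIndex can return an index hint.
    pred := storage.SelectionPredicate{
        Label:       labels.Everything(),
        Field:       fields.OneTermEqualSelector("metadata.name", "my-pod"),
        IndexFields: []string{"metadata.name"},
    }

    if name, ok := pred.MatchesSingle(); ok {
        fmt.Println("exact match on name:", name) // my-pod
    }
    for _, mv := range pred.MatcherIndex() {
        fmt.Printf("index %s => %s\n", mv.IndexName, mv.Value) // f:metadata.name => my-pod
    }
}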

View File

@ -0,0 +1,6 @@
# See the OWNERS docs at https://go.k8s.io/owners
reviewers:
- lavalamp
- smarterclayton
- wojtek-t

View File

@ -0,0 +1,128 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package storagebackend
import (
"time"
oteltrace "go.opentelemetry.io/otel/trace"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apiserver/pkg/server/egressselector"
"k8s.io/apiserver/pkg/storage/etcd3"
"k8s.io/apiserver/pkg/storage/value"
flowcontrolrequest "k8s.io/apiserver/pkg/util/flowcontrol/request"
)
const (
StorageTypeUnset = ""
StorageTypeETCD2 = "etcd2"
StorageTypeETCD3 = "etcd3"
DefaultCompactInterval = 5 * time.Minute
DefaultDBMetricPollInterval = 30 * time.Second
DefaultHealthcheckTimeout = 2 * time.Second
DefaultReadinessTimeout = 2 * time.Second
)
// TransportConfig holds all connection related info, i.e. equal TransportConfig means equal servers we talk to.
type TransportConfig struct {
// ServerList is the list of storage servers to connect with.
ServerList []string
// TLS credentials
KeyFile string
CertFile string
TrustedCAFile string
// function to determine the egress dialer. (i.e. konnectivity server dialer)
EgressLookup egressselector.Lookup
// The TracerProvider can add tracing to the connection
TracerProvider oteltrace.TracerProvider
}
// Config is configuration for creating a storage backend.
type Config struct {
// Type defines the type of storage backend. Default ("") is "etcd3".
Type string
// Prefix is the prefix to all keys passed to storage.Interface methods.
Prefix string
// Transport holds all connection related info, i.e. equal TransportConfig means equal servers we talk to.
Transport TransportConfig
// Paging indicates whether the server implementation should allow paging (if it is
// supported). This is generally configured by feature gating, or by a specific
// resource type not wishing to allow paging, and is not intended for end users to
// set.
Paging bool
Codec runtime.Codec
// EncodeVersioner is the same groupVersioner used to build the
// storage encoder. Given a list of kinds the input object might belong
// to, the EncodeVersioner outputs the gvk the object will be
// converted to before persisted in etcd.
EncodeVersioner runtime.GroupVersioner
// Transformer allows the value to be transformed prior to persisting into etcd.
Transformer value.Transformer
// CompactionInterval is the interval at which compaction is requested by the apiserver.
// If the value is 0, no compaction will be issued.
CompactionInterval time.Duration
// CountMetricPollPeriod specifies how often the count metric should be updated.
CountMetricPollPeriod time.Duration
// DBMetricPollInterval specifies how often the storage backend metric should be updated.
DBMetricPollInterval time.Duration
// HealthcheckTimeout specifies the timeout used when checking health
HealthcheckTimeout time.Duration
// ReadycheckTimeout specifies the timeout used when checking readiness
ReadycheckTimeout time.Duration
LeaseManagerConfig etcd3.LeaseManagerConfig
// StorageObjectCountTracker is used to keep track of the total
// number of objects in the storage per resource.
StorageObjectCountTracker flowcontrolrequest.StorageObjectCountTracker
}
// ConfigForResource is a Config specialized to a particular `schema.GroupResource`
type ConfigForResource struct {
// Config is the resource-independent configuration
Config
// GroupResource is the relevant one
GroupResource schema.GroupResource
}
// ForResource specializes to the given resource
func (config *Config) ForResource(resource schema.GroupResource) *ConfigForResource {
return &ConfigForResource{
Config: *config,
GroupResource: resource,
}
}
func NewDefaultConfig(prefix string, codec runtime.Codec) *Config {
return &Config{
Paging: true,
Prefix: prefix,
Codec: codec,
CompactionInterval: DefaultCompactInterval,
DBMetricPollInterval: DefaultDBMetricPollInterval,
HealthcheckTimeout: DefaultHealthcheckTimeout,
ReadycheckTimeout: DefaultReadinessTimeout,
LeaseManagerConfig: etcd3.NewDefaultLeaseManagerConfig(),
Transport: TransportConfig{TracerProvider: oteltrace.NewNoopTracerProvider()},
}
}
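A minimal sketch of building a Config with NewDefaultConfig and specializing it with ForResource; the prefix, endpoint, and nil codec are placeholders for illustration only.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/runtime/schema"
    "k8s.io/apiserver/pkg/storage/storagebackend"
)

func main() {
    // Start from the defaults (compaction interval, health/ready timeouts,
    // lease manager config), point the transport at an etcd endpoint, then
    // specialize the config for one group/resource. A real caller passes the
    // resource's serializer codec instead of nil.
    cfg := storagebackend.NewDefaultConfig("/registry", nil)
    cfg.Transport.ServerList = []string{"https://127.0.0.1:2379"}

    forPods := cfg.ForResource(schema.GroupResource{Group: "", Resource: "pods"})
    fmt.Println(forPods.GroupResource, forPods.Prefix)
}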

View File

@ -0,0 +1,434 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package factory
import (
"context"
"fmt"
"log"
"net"
"net/url"
"os"
"path"
"strings"
"sync"
"time"
grpcprom "github.com/grpc-ecosystem/go-grpc-prometheus"
"go.etcd.io/etcd/client/pkg/v3/logutil"
"go.etcd.io/etcd/client/pkg/v3/transport"
clientv3 "go.etcd.io/etcd/client/v3"
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"golang.org/x/time/rate"
"google.golang.org/grpc"
"k8s.io/apimachinery/pkg/runtime"
utilnet "k8s.io/apimachinery/pkg/util/net"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
genericfeatures "k8s.io/apiserver/pkg/features"
"k8s.io/apiserver/pkg/server/egressselector"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/storagebackend"
"k8s.io/apiserver/pkg/storage/value/encrypt/identity"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/metrics/legacyregistry"
tracing "k8s.io/component-base/tracing"
"k8s.io/klog/v2"
)
const (
// The short keepalive timeout and interval have been chosen to aggressively
// detect a failed etcd server without introducing much overhead.
keepaliveTime = 30 * time.Second
keepaliveTimeout = 10 * time.Second
// dialTimeout is the timeout for failing to establish a connection.
// It is set to 20 seconds as times shorter than that will cause TLS connections to fail
// on heavily loaded arm64 CPUs (issue #64649)
dialTimeout = 20 * time.Second
dbMetricsMonitorJitter = 0.5
)
// TODO(negz): Stop using a package scoped logger. At the time of writing we're
// creating an etcd client for each CRD. We need to pass each etcd client a
// logger or each client will create its own, which comes with a significant
// memory cost (around 20% of the API server's memory when hundreds of CRDs are
// present). The correct fix here is to not create a client per CRD. See
// https://github.com/kubernetes/kubernetes/issues/111476 for more.
var etcd3ClientLogger *zap.Logger
func init() {
// grpcprom auto-registers (via an init function) their client metrics, since we are opting out of
// using the global prometheus registry and using our own wrapped global registry,
// we need to explicitly register these metrics to our global registry here.
// For reference: https://github.com/kubernetes/kubernetes/pull/81387
legacyregistry.RawMustRegister(grpcprom.DefaultClientMetrics)
dbMetricsMonitors = make(map[string]struct{})
l, err := logutil.CreateDefaultZapLogger(etcdClientDebugLevel())
if err != nil {
l = zap.NewNop()
}
etcd3ClientLogger = l.Named("etcd-client")
}
// etcdClientDebugLevel translates ETCD_CLIENT_DEBUG into zap log level.
// NOTE(negz): This is a copy of a private etcd client function:
// https://github.com/etcd-io/etcd/blob/v3.5.4/client/v3/logger.go#L47
func etcdClientDebugLevel() zapcore.Level {
envLevel := os.Getenv("ETCD_CLIENT_DEBUG")
if envLevel == "" || envLevel == "true" {
return zapcore.InfoLevel
}
var l zapcore.Level
if err := l.Set(envLevel); err == nil {
log.Printf("Deprecated env ETCD_CLIENT_DEBUG value. Using default level: 'info'")
return zapcore.InfoLevel
}
return l
}
func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
timeout := storagebackend.DefaultHealthcheckTimeout
if c.HealthcheckTimeout != time.Duration(0) {
timeout = c.HealthcheckTimeout
}
return newETCD3Check(c, timeout, stopCh)
}
func newETCD3ReadyCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
timeout := storagebackend.DefaultReadinessTimeout
if c.ReadycheckTimeout != time.Duration(0) {
timeout = c.ReadycheckTimeout
}
return newETCD3Check(c, timeout, stopCh)
}
// atomicLastError acts as a cache for atomically storing an error;
// the error is only updated if the timestamp is more recent than the
// currently stored error's timestamp.
type atomicLastError struct {
mu sync.RWMutex
err error
timestamp time.Time
}
func (a *atomicLastError) Store(err error, t time.Time) {
a.mu.Lock()
defer a.mu.Unlock()
if a.timestamp.IsZero() || a.timestamp.Before(t) {
a.err = err
a.timestamp = t
}
}
func (a *atomicLastError) Load() error {
a.mu.RLock()
defer a.mu.RUnlock()
return a.err
}
func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan struct{}) (func() error, error) {
// Constructing the etcd v3 client blocks and times out if etcd is not available.
// Retry in a loop in the background until we successfully create the client, storing the client or the error encountered.
lock := sync.RWMutex{}
var client *clientv3.Client
clientErr := fmt.Errorf("etcd client connection not yet established")
go wait.PollUntil(time.Second, func() (bool, error) {
newClient, err := newETCD3Client(c.Transport)
lock.Lock()
defer lock.Unlock()
// Ensure that the server is not already shutting down.
select {
case <-stopCh:
if err == nil {
newClient.Close()
}
return true, nil
default:
}
if err != nil {
clientErr = err
return false, nil
}
client = newClient
clientErr = nil
return true, nil
}, stopCh)
// Close the client on shutdown.
go func() {
defer utilruntime.HandleCrash()
<-stopCh
lock.Lock()
defer lock.Unlock()
if client != nil {
client.Close()
clientErr = fmt.Errorf("server is shutting down")
}
}()
// Limit to one request every half of the configured timeout, with a maximum burst of one.
// Rate-limited requests will receive the error from the last request sent (note: not from the last received response).
limiter := rate.NewLimiter(rate.Every(timeout/2), 1)
// initial state is the clientErr
lastError := &atomicLastError{err: fmt.Errorf("etcd client connection not yet established")}
return func() error {
// Given that the client is closed on shutdown, we hold the lock for
// the entire duration of the healthcheck call to ensure that the client
// is not closed during the healthcheck.
// Given that healthchecks have a 2s timeout, blocking shutdown for an
// additional 2s in the worst case seems acceptable.
lock.RLock()
defer lock.RUnlock()
if clientErr != nil {
return clientErr
}
if !limiter.Allow() {
return lastError.Load()
}
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
// See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118
now := time.Now()
_, err := client.Get(ctx, path.Join("/", c.Prefix, "health"))
if err != nil {
err = fmt.Errorf("error getting data from etcd: %w", err)
}
lastError.Store(err, now)
return err
}, nil
}
var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, error) {
tlsInfo := transport.TLSInfo{
CertFile: c.CertFile,
KeyFile: c.KeyFile,
TrustedCAFile: c.TrustedCAFile,
}
tlsConfig, err := tlsInfo.ClientConfig()
if err != nil {
return nil, err
}
// NOTE: the client relies on a nil tlsConfig
// for non-secure connections; update the implicit variable accordingly.
if len(c.CertFile) == 0 && len(c.KeyFile) == 0 && len(c.TrustedCAFile) == 0 {
tlsConfig = nil
}
networkContext := egressselector.Etcd.AsNetworkContext()
var egressDialer utilnet.DialFunc
if c.EgressLookup != nil {
egressDialer, err = c.EgressLookup(networkContext)
if err != nil {
return nil, err
}
}
dialOptions := []grpc.DialOption{
grpc.WithBlock(), // block until the underlying connection is up
// use chained interceptors so that the default (retry and backoff) interceptors are added.
// otherwise they will be overwritten by the metric interceptor.
//
// these optional interceptors will be placed after the default ones.
// which seems to be what we want as the metrics will be collected on each attempt (retry)
grpc.WithChainUnaryInterceptor(grpcprom.UnaryClientInterceptor),
grpc.WithChainStreamInterceptor(grpcprom.StreamClientInterceptor),
}
if utilfeature.DefaultFeatureGate.Enabled(genericfeatures.APIServerTracing) {
tracingOpts := []otelgrpc.Option{
otelgrpc.WithPropagators(tracing.Propagators()),
otelgrpc.WithTracerProvider(c.TracerProvider),
}
// Even with Noop TracerProvider, the otelgrpc still handles context propagation.
// See https://github.com/open-telemetry/opentelemetry-go/tree/main/example/passthrough
dialOptions = append(dialOptions,
grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor(tracingOpts...)),
grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor(tracingOpts...)))
}
if egressDialer != nil {
dialer := func(ctx context.Context, addr string) (net.Conn, error) {
if strings.Contains(addr, "//") {
// etcd client prior to 3.5 passed URLs to dialer, normalize to address
u, err := url.Parse(addr)
if err != nil {
return nil, err
}
addr = u.Host
}
return egressDialer(ctx, "tcp", addr)
}
dialOptions = append(dialOptions, grpc.WithContextDialer(dialer))
}
cfg := clientv3.Config{
DialTimeout: dialTimeout,
DialKeepAliveTime: keepaliveTime,
DialKeepAliveTimeout: keepaliveTimeout,
DialOptions: dialOptions,
Endpoints: c.ServerList,
TLS: tlsConfig,
Logger: etcd3ClientLogger,
}
return clientv3.New(cfg)
}
type runningCompactor struct {
interval time.Duration
cancel context.CancelFunc
client *clientv3.Client
refs int
}
var (
// compactorsMu guards access to compactors map
compactorsMu sync.Mutex
compactors = map[string]*runningCompactor{}
// dbMetricsMonitorsMu guards access to dbMetricsMonitors map
dbMetricsMonitorsMu sync.Mutex
dbMetricsMonitors map[string]struct{}
)
// startCompactorOnce starts one compactor per transport. If the interval gets smaller on repeated calls, the
// compactor is replaced. A destroy func is returned. If all destroy funcs for the same transport are called,
// the compactor is stopped.
func startCompactorOnce(c storagebackend.TransportConfig, interval time.Duration) (func(), error) {
compactorsMu.Lock()
defer compactorsMu.Unlock()
key := fmt.Sprintf("%v", c) // gives: {[server1 server2] keyFile certFile caFile}
if compactor, foundBefore := compactors[key]; !foundBefore || compactor.interval > interval {
compactorClient, err := newETCD3Client(c)
if err != nil {
return nil, err
}
if foundBefore {
// replace compactor
compactor.cancel()
compactor.client.Close()
} else {
// start new compactor
compactor = &runningCompactor{}
compactors[key] = compactor
}
ctx, cancel := context.WithCancel(context.Background())
compactor.interval = interval
compactor.cancel = cancel
compactor.client = compactorClient
etcd3.StartCompactor(ctx, compactorClient, interval)
}
compactors[key].refs++
return func() {
compactorsMu.Lock()
defer compactorsMu.Unlock()
compactor := compactors[key]
compactor.refs--
if compactor.refs == 0 {
compactor.cancel()
compactor.client.Close()
delete(compactors, key)
}
}, nil
}
func newETCD3Storage(c storagebackend.ConfigForResource, newFunc func() runtime.Object) (storage.Interface, DestroyFunc, error) {
stopCompactor, err := startCompactorOnce(c.Transport, c.CompactionInterval)
if err != nil {
return nil, nil, err
}
client, err := newETCD3Client(c.Transport)
if err != nil {
stopCompactor()
return nil, nil, err
}
// decorate the KV instance so we can track etcd latency per request.
client.KV = etcd3.NewETCDLatencyTracker(client.KV)
stopDBSizeMonitor, err := startDBSizeMonitorPerEndpoint(client, c.DBMetricPollInterval)
if err != nil {
return nil, nil, err
}
var once sync.Once
destroyFunc := func() {
// we know that storage destroy funcs are called multiple times (due to reuse in subresources).
// Hence, we only destroy once.
// TODO: fix duplicated storage destroy calls at a higher level
once.Do(func() {
stopCompactor()
stopDBSizeMonitor()
client.Close()
})
}
transformer := c.Transformer
if transformer == nil {
transformer = identity.NewEncryptCheckTransformer()
}
return etcd3.New(client, c.Codec, newFunc, c.Prefix, c.GroupResource, transformer, c.Paging, c.LeaseManagerConfig), destroyFunc, nil
}
// startDBSizeMonitorPerEndpoint starts a loop to monitor etcd database size and update the
// corresponding metric etcd_db_total_size_in_bytes for each etcd server endpoint.
func startDBSizeMonitorPerEndpoint(client *clientv3.Client, interval time.Duration) (func(), error) {
if interval == 0 {
return func() {}, nil
}
dbMetricsMonitorsMu.Lock()
defer dbMetricsMonitorsMu.Unlock()
ctx, cancel := context.WithCancel(context.Background())
for _, ep := range client.Endpoints() {
if _, found := dbMetricsMonitors[ep]; found {
continue
}
dbMetricsMonitors[ep] = struct{}{}
endpoint := ep
klog.V(4).Infof("Start monitoring storage db size metric for endpoint %s with polling interval %v", endpoint, interval)
go wait.JitterUntilWithContext(ctx, func(context.Context) {
epStatus, err := client.Maintenance.Status(ctx, endpoint)
if err != nil {
klog.V(4).Infof("Failed to get storage db size for ep %s: %v", endpoint, err)
metrics.UpdateEtcdDbSize(endpoint, -1)
} else {
metrics.UpdateEtcdDbSize(endpoint, epStatus.DbSize)
}
}, interval, dbMetricsMonitorJitter, true)
}
return func() {
cancel()
}, nil
}
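Since most helpers in this file are package-private, the following stand-alone sketch simply restates the atomicLastError semantics used by the health and readiness checks: an older result arriving late must not overwrite a newer one. The type name lastError is local to the sketch.

package main

import (
    "errors"
    "fmt"
    "sync"
    "time"
)

// lastError mirrors atomicLastError above: it caches the most recent error by
// timestamp so that rate-limited health checks can return the error from the
// last request actually sent.
type lastError struct {
    mu        sync.RWMutex
    err       error
    timestamp time.Time
}

func (a *lastError) Store(err error, t time.Time) {
    a.mu.Lock()
    defer a.mu.Unlock()
    if a.timestamp.IsZero() || a.timestamp.Before(t) {
        a.err = err
        a.timestamp = t
    }
}

func (a *lastError) Load() error {
    a.mu.RLock()
    defer a.mu.RUnlock()
    return a.err
}

func main() {
    le := &lastError{}
    now := time.Now()
    le.Store(errors.New("etcd unreachable"), now)
    // An older result arriving late must not overwrite the newer one.
    le.Store(nil, now.Add(-time.Second))
    fmt.Println(le.Load()) // etcd unreachable
}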

View File

@ -0,0 +1,63 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package factory
import (
"fmt"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/storagebackend"
)
// DestroyFunc is a function that destroys all resources used by the storage returned from Create().
type DestroyFunc func()
// Create creates a storage backend based on given config.
func Create(c storagebackend.ConfigForResource, newFunc func() runtime.Object) (storage.Interface, DestroyFunc, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return nil, nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3Storage(c, newFunc)
default:
return nil, nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}
// CreateHealthCheck creates a healthcheck function based on given config.
func CreateHealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3HealthCheck(c, stopCh)
default:
return nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}
func CreateReadyCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3ReadyCheck(c, stopCh)
default:
return nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}
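A hedged sketch of wiring CreateHealthCheck (defined above) to a default config; the endpoint is illustrative, and the first call is expected to report that the etcd client connection is not yet established.

package main

import (
    "fmt"

    "k8s.io/apiserver/pkg/storage/storagebackend"
    "k8s.io/apiserver/pkg/storage/storagebackend/factory"
)

func main() {
    // An unset Type falls through to etcd3; etcd2 would be rejected as
    // unsupported. The endpoint is not expected to be reachable here.
    cfg := storagebackend.NewDefaultConfig("/registry", nil)
    cfg.Transport.ServerList = []string{"https://127.0.0.1:2379"}

    stopCh := make(chan struct{})
    defer close(stopCh)

    healthCheck, err := factory.CreateHealthCheck(*cfg, stopCh)
    if err != nil {
        fmt.Println("health check construction failed:", err)
        return
    }
    // The etcd client is built asynchronously, so the first calls report that
    // the connection is not yet established.
    fmt.Println(healthCheck())
}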

81
vendor/k8s.io/apiserver/pkg/storage/util.go generated vendored Normal file
View File

@ -0,0 +1,81 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package storage
import (
"fmt"
"sync/atomic"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/validation/path"
"k8s.io/apimachinery/pkg/runtime"
)
type SimpleUpdateFunc func(runtime.Object) (runtime.Object, error)
// SimpleUpdate converts a SimpleUpdateFunc into an UpdateFunc
func SimpleUpdate(fn SimpleUpdateFunc) UpdateFunc {
return func(input runtime.Object, _ ResponseMeta) (runtime.Object, *uint64, error) {
out, err := fn(input)
return out, nil, err
}
}
func EverythingFunc(runtime.Object) bool {
return true
}
func NamespaceKeyFunc(prefix string, obj runtime.Object) (string, error) {
meta, err := meta.Accessor(obj)
if err != nil {
return "", err
}
name := meta.GetName()
if msgs := path.IsValidPathSegmentName(name); len(msgs) != 0 {
return "", fmt.Errorf("invalid name: %v", msgs)
}
return prefix + "/" + meta.GetNamespace() + "/" + name, nil
}
func NoNamespaceKeyFunc(prefix string, obj runtime.Object) (string, error) {
meta, err := meta.Accessor(obj)
if err != nil {
return "", err
}
name := meta.GetName()
if msgs := path.IsValidPathSegmentName(name); len(msgs) != 0 {
return "", fmt.Errorf("invalid name: %v", msgs)
}
return prefix + "/" + name, nil
}
// HighWaterMark is a thread-safe object for tracking the maximum value seen
// for some quantity.
type HighWaterMark int64
// Update returns true if and only if 'current' is the highest value ever seen.
func (hwm *HighWaterMark) Update(current int64) bool {
for {
old := atomic.LoadInt64((*int64)(hwm))
if current <= old {
return false
}
if atomic.CompareAndSwapInt64((*int64)(hwm), old, current) {
return true
}
}
}
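A tiny usage sketch of HighWaterMark from this file.

package main

import (
    "fmt"

    "k8s.io/apiserver/pkg/storage"
)

func main() {
    // HighWaterMark records the maximum value observed; Update reports whether
    // the new sample set a record.
    var hwm storage.HighWaterMark
    fmt.Println(hwm.Update(3)) // true: new maximum
    fmt.Println(hwm.Update(2)) // false: below the high-water mark
    fmt.Println(hwm.Update(7)) // true: new maximum
}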

8
vendor/k8s.io/apiserver/pkg/storage/value/OWNERS generated vendored Normal file
View File

@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-auth-encryption-at-rest-approvers
reviewers:
- sig-auth-encryption-at-rest-reviewers
labels:
- sig/auth

View File

@ -0,0 +1,153 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package aes transforms values for storage at rest using AES-GCM.
package aes
import (
"bytes"
"context"
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"errors"
"fmt"
"io"
"k8s.io/apiserver/pkg/storage/value"
)
// gcm implements AEAD encryption of the provided values given a cipher.Block algorithm.
// The authenticated data provided as part of the value.Context method must match when the same
// value is set to and loaded from storage. In order to ensure that values cannot be copied by
// an attacker from a location under their control, use characteristics of the storage location
// (such as the etcd key) as part of the authenticated data.
//
// Because this mode requires a generated IV and IV reuse is a known weakness of AES-GCM, keys
// must be rotated before a birthday attack becomes feasible. NIST SP 800-38D
// (http://csrc.nist.gov/publications/nistpubs/800-38D/SP-800-38D.pdf) recommends using the same
// key with random 96-bit nonces (the default nonce length) no more than 2^32 times, and
// therefore transformers using this implementation *must* ensure they allow for frequent key
// rotation. Future work should include investigation of AES-GCM-SIV as an alternative to
// random nonces.
type gcm struct {
block cipher.Block
}
// NewGCMTransformer takes the given block cipher and performs encryption and decryption on the given
// data.
func NewGCMTransformer(block cipher.Block) value.Transformer {
return &gcm{block: block}
}
func (t *gcm) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
aead, err := cipher.NewGCM(t.block)
if err != nil {
return nil, false, err
}
nonceSize := aead.NonceSize()
if len(data) < nonceSize {
return nil, false, fmt.Errorf("the stored data was shorter than the required size")
}
result, err := aead.Open(nil, data[:nonceSize], data[nonceSize:], dataCtx.AuthenticatedData())
return result, false, err
}
func (t *gcm) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
aead, err := cipher.NewGCM(t.block)
if err != nil {
return nil, err
}
nonceSize := aead.NonceSize()
result := make([]byte, nonceSize+aead.Overhead()+len(data))
n, err := rand.Read(result[:nonceSize])
if err != nil {
return nil, err
}
if n != nonceSize {
return nil, fmt.Errorf("unable to read sufficient random bytes")
}
cipherText := aead.Seal(result[nonceSize:nonceSize], result[:nonceSize], data, dataCtx.AuthenticatedData())
return result[:nonceSize+len(cipherText)], nil
}
// cbc implements encryption at rest of the provided values given a cipher.Block algorithm.
type cbc struct {
block cipher.Block
}
// NewCBCTransformer takes the given block cipher and performs encryption and decryption on the given
// data.
func NewCBCTransformer(block cipher.Block) value.Transformer {
return &cbc{block: block}
}
var (
ErrInvalidBlockSize = fmt.Errorf("the stored data is not a multiple of the block size")
errInvalidPKCS7Data = errors.New("invalid PKCS7 data (empty or not padded)")
errInvalidPKCS7Padding = errors.New("invalid padding on input")
)
func (t *cbc) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
blockSize := aes.BlockSize
if len(data) < blockSize {
return nil, false, fmt.Errorf("the stored data was shorter than the required size")
}
iv := data[:blockSize]
data = data[blockSize:]
if len(data)%blockSize != 0 {
return nil, false, ErrInvalidBlockSize
}
result := make([]byte, len(data))
copy(result, data)
mode := cipher.NewCBCDecrypter(t.block, iv)
mode.CryptBlocks(result, result)
// remove and verify PKCS#7 padding for CBC
c := result[len(result)-1]
paddingSize := int(c)
size := len(result) - paddingSize
if paddingSize == 0 || paddingSize > len(result) {
return nil, false, errInvalidPKCS7Data
}
for i := 0; i < paddingSize; i++ {
if result[size+i] != c {
return nil, false, errInvalidPKCS7Padding
}
}
return result[:size], false, nil
}
func (t *cbc) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
blockSize := aes.BlockSize
paddingSize := blockSize - (len(data) % blockSize)
result := make([]byte, blockSize+len(data)+paddingSize)
iv := result[:blockSize]
if _, err := io.ReadFull(rand.Reader, iv); err != nil {
return nil, fmt.Errorf("unable to read sufficient random bytes")
}
copy(result[blockSize:], data)
// add PKCS#7 padding for CBC
copy(result[blockSize+len(data):], bytes.Repeat([]byte{byte(paddingSize)}, paddingSize))
mode := cipher.NewCBCEncrypter(t.block, iv)
mode.CryptBlocks(result[blockSize:], result[blockSize:])
return result, nil
}
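A minimal round-trip sketch of the GCM transformer above, assuming value.DefaultContext from the value package as the Context implementation; the key and etcd key path are illustrative only.

package main

import (
    "context"
    "crypto/aes"
    "fmt"

    "k8s.io/apiserver/pkg/storage/value"
    aestransformer "k8s.io/apiserver/pkg/storage/value/encrypt/aes"
)

func main() {
    // A 32-byte key selects AES-256; a real deployment generates it randomly
    // and rotates it well before nonce reuse becomes a concern.
    block, err := aes.NewCipher([]byte("0123456789abcdef0123456789abcdef"))
    if err != nil {
        panic(err)
    }
    transformer := aestransformer.NewGCMTransformer(block)

    // The authenticated data (here the etcd key) binds the ciphertext to its
    // storage location, as described in the gcm doc comment above.
    dataCtx := value.DefaultContext("/registry/secrets/default/my-secret")

    ciphertext, err := transformer.TransformToStorage(context.Background(), []byte("plaintext"), dataCtx)
    if err != nil {
        panic(err)
    }
    plaintext, _, err := transformer.TransformFromStorage(context.Background(), ciphertext, dataCtx)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(plaintext)) // plaintext
}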

View File

@ -0,0 +1,198 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package envelope transforms values for storage at rest using an Envelope provider
package envelope
import (
"context"
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"encoding/base64"
"fmt"
"time"
"k8s.io/apiserver/pkg/storage/value"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/metrics"
"k8s.io/utils/lru"
"golang.org/x/crypto/cryptobyte"
)
func init() {
value.RegisterMetrics()
metrics.RegisterMetrics()
}
// Service allows encrypting and decrypting data using an external Key Management Service.
type Service interface {
// Decrypt a given bytearray to obtain the original data as bytes.
Decrypt(data []byte) ([]byte, error)
// Encrypt bytes to a ciphertext.
Encrypt(data []byte) ([]byte, error)
}
type envelopeTransformer struct {
envelopeService Service
// transformers is a thread-safe LRU cache which caches decrypted DEKs indexed by their encrypted form.
transformers *lru.Cache
// baseTransformerFunc creates a new transformer for encrypting the data with the DEK.
baseTransformerFunc func(cipher.Block) value.Transformer
cacheSize int
cacheEnabled bool
}
// NewEnvelopeTransformer returns a transformer which implements a KEK-DEK based envelope encryption scheme.
// It uses envelopeService to encrypt and decrypt DEKs. Respective DEKs (in encrypted form) are prepended to
// the data items they encrypt. A cache (of size cacheSize) is maintained to store the most recently
// used decrypted DEKs in memory.
func NewEnvelopeTransformer(envelopeService Service, cacheSize int, baseTransformerFunc func(cipher.Block) value.Transformer) value.Transformer {
var (
cache *lru.Cache
)
if cacheSize > 0 {
cache = lru.New(cacheSize)
}
return &envelopeTransformer{
envelopeService: envelopeService,
transformers: cache,
baseTransformerFunc: baseTransformerFunc,
cacheEnabled: cacheSize > 0,
cacheSize: cacheSize,
}
}
// TransformFromStorage decrypts data encrypted by this transformer using envelope encryption.
func (t *envelopeTransformer) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
metrics.RecordArrival(metrics.FromStorageLabel, time.Now())
// Read the 16 bit length-of-DEK encoded at the start of the encrypted DEK. 16 bits can
// represent a maximum key length of 65536 bytes. We are using a 256 bit key, whose
// length cannot fit in 8 bits (1 byte). Thus, we use 16 bits (2 bytes) to store the length.
var encKey cryptobyte.String
s := cryptobyte.String(data)
if ok := s.ReadUint16LengthPrefixed(&encKey); !ok {
return nil, false, fmt.Errorf("invalid data encountered by envelope transformer: failed to read uint16 length prefixed data")
}
encData := []byte(s)
// Look up the decrypted DEK from cache or Envelope.
transformer := t.getTransformer(encKey)
if transformer == nil {
if t.cacheEnabled {
value.RecordCacheMiss()
}
key, err := t.envelopeService.Decrypt(encKey)
if err != nil {
// Do NOT wrap this err using fmt.Errorf() or similar functions
// because this gRPC status error has a useful error code when
// recording the metric.
return nil, false, err
}
transformer, err = t.addTransformer(encKey, key)
if err != nil {
return nil, false, err
}
}
return transformer.TransformFromStorage(ctx, encData, dataCtx)
}
// TransformToStorage encrypts data to be written to disk using envelope encryption.
func (t *envelopeTransformer) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
metrics.RecordArrival(metrics.ToStorageLabel, time.Now())
newKey, err := generateKey(32)
if err != nil {
return nil, err
}
encKey, err := t.envelopeService.Encrypt(newKey)
if err != nil {
// Do NOT wrap this err using fmt.Errorf() or similar functions
// because this gRPC status error has a useful error code when
// recording the metric.
return nil, err
}
transformer, err := t.addTransformer(encKey, newKey)
if err != nil {
return nil, err
}
result, err := transformer.TransformToStorage(ctx, data, dataCtx)
if err != nil {
return nil, err
}
// Append the length of the encrypted DEK as the first 2 bytes.
b := cryptobyte.NewBuilder(nil)
b.AddUint16LengthPrefixed(func(b *cryptobyte.Builder) {
b.AddBytes([]byte(encKey))
})
b.AddBytes(result)
return b.Bytes()
}
var _ value.Transformer = &envelopeTransformer{}
// addTransformer inserts a new transformer to the Envelope cache of DEKs for future reads.
func (t *envelopeTransformer) addTransformer(encKey []byte, key []byte) (value.Transformer, error) {
block, err := aes.NewCipher(key)
if err != nil {
return nil, err
}
transformer := t.baseTransformerFunc(block)
// Use base64 of encKey as the key into the cache because hashicorp/golang-lru
// cannot hash []uint8.
if t.cacheEnabled {
t.transformers.Add(base64.StdEncoding.EncodeToString(encKey), transformer)
metrics.RecordDekCacheFillPercent(float64(t.transformers.Len()) / float64(t.cacheSize))
}
return transformer, nil
}
// getTransformer fetches the transformer corresponding to encKey from cache, if it exists.
func (t *envelopeTransformer) getTransformer(encKey []byte) value.Transformer {
if !t.cacheEnabled {
return nil
}
_transformer, found := t.transformers.Get(base64.StdEncoding.EncodeToString(encKey))
if found {
return _transformer.(value.Transformer)
}
return nil
}
// generateKey generates a random key using system randomness.
func generateKey(length int) (key []byte, err error) {
defer func(start time.Time) {
value.RecordDataKeyGeneration(start, err)
}(time.Now())
key = make([]byte, length)
if _, err = rand.Read(key); err != nil {
return nil, err
}
return key, nil
}
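A sketch of wiring NewEnvelopeTransformer with the GCM transformer from the aes package; toyKMS is a deliberately insecure stand-in for a real envelope.Service (normally provided by the gRPC client in the next file) and exists only to show the plumbing.

package main

import (
    "bytes"
    "context"
    "fmt"

    "k8s.io/apiserver/pkg/storage/value"
    aestransformer "k8s.io/apiserver/pkg/storage/value/encrypt/aes"
    "k8s.io/apiserver/pkg/storage/value/encrypt/envelope"
)

// toyKMS "encrypts" DEKs by prefixing them with a constant marker. Never use
// anything like this outside of an illustration.
type toyKMS struct{}

func (toyKMS) Encrypt(data []byte) ([]byte, error) { return append([]byte("kek:"), data...), nil }
func (toyKMS) Decrypt(data []byte) ([]byte, error) { return bytes.TrimPrefix(data, []byte("kek:")), nil }

func main() {
    // Cache up to 16 decrypted DEKs and protect each value with AES-GCM.
    transformer := envelope.NewEnvelopeTransformer(toyKMS{}, 16, aestransformer.NewGCMTransformer)

    dataCtx := value.DefaultContext("/registry/secrets/default/my-secret")
    out, err := transformer.TransformToStorage(context.Background(), []byte("plaintext"), dataCtx)
    if err != nil {
        panic(err)
    }
    in, _, err := transformer.TransformFromStorage(context.Background(), out, dataCtx)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(in)) // plaintext
}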

View File

@ -0,0 +1,162 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package envelope transforms values for storage at rest using an Envelope provider
package envelope
import (
"context"
"fmt"
"net"
"sync"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/util"
"k8s.io/klog/v2"
kmsapi "k8s.io/kms/apis/v1beta1"
)
const (
// unixProtocol is the only supported protocol for remote KMS provider.
unixProtocol = "unix"
// Current version for the protocol interface definition.
kmsapiVersion = "v1beta1"
versionErrorf = "KMS provider api version %s is not supported, only %s is supported now"
)
// The gRPC implementation for envelope.Service.
type gRPCService struct {
kmsClient kmsapi.KeyManagementServiceClient
connection *grpc.ClientConn
callTimeout time.Duration
mux sync.RWMutex
versionChecked bool
}
// NewGRPCService returns an envelope.Service which uses gRPC to communicate with the remote KMS provider.
func NewGRPCService(ctx context.Context, endpoint string, callTimeout time.Duration) (Service, error) {
klog.V(4).Infof("Configure KMS provider with endpoint: %s", endpoint)
addr, err := util.ParseEndpoint(endpoint)
if err != nil {
return nil, err
}
s := &gRPCService{callTimeout: callTimeout}
s.connection, err = grpc.Dial(
addr,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithUnaryInterceptor(s.interceptor),
grpc.WithDefaultCallOptions(grpc.WaitForReady(true)),
grpc.WithContextDialer(
func(context.Context, string) (net.Conn, error) {
// Ignoring addr and timeout arguments:
// addr - comes from the closure
c, err := net.DialUnix(unixProtocol, nil, &net.UnixAddr{Name: addr})
if err != nil {
klog.Errorf("failed to create connection to unix socket: %s, error: %v", addr, err)
} else {
klog.V(4).Infof("Successfully dialed Unix socket %v", addr)
}
return c, err
}))
if err != nil {
return nil, fmt.Errorf("failed to create connection to %s, error: %v", endpoint, err)
}
s.kmsClient = kmsapi.NewKeyManagementServiceClient(s.connection)
go func() {
defer utilruntime.HandleCrash()
<-ctx.Done()
_ = s.connection.Close()
}()
return s, nil
}
func (g *gRPCService) checkAPIVersion(ctx context.Context) error {
g.mux.Lock()
defer g.mux.Unlock()
if g.versionChecked {
return nil
}
request := &kmsapi.VersionRequest{Version: kmsapiVersion}
response, err := g.kmsClient.Version(ctx, request)
if err != nil {
return fmt.Errorf("failed get version from remote KMS provider: %v", err)
}
if response.Version != kmsapiVersion {
return fmt.Errorf(versionErrorf, response.Version, kmsapiVersion)
}
g.versionChecked = true
klog.V(4).Infof("Version of KMS provider is %s", response.Version)
return nil
}
// Decrypt a given data string to obtain the original byte data.
func (g *gRPCService) Decrypt(cipher []byte) ([]byte, error) {
ctx, cancel := context.WithTimeout(context.Background(), g.callTimeout)
defer cancel()
request := &kmsapi.DecryptRequest{Cipher: cipher, Version: kmsapiVersion}
response, err := g.kmsClient.Decrypt(ctx, request)
if err != nil {
return nil, err
}
return response.Plain, nil
}
// Encrypt bytes to a string ciphertext.
func (g *gRPCService) Encrypt(plain []byte) ([]byte, error) {
ctx, cancel := context.WithTimeout(context.Background(), g.callTimeout)
defer cancel()
request := &kmsapi.EncryptRequest{Plain: plain, Version: kmsapiVersion}
response, err := g.kmsClient.Encrypt(ctx, request)
if err != nil {
return nil, err
}
return response.Cipher, nil
}
func (g *gRPCService) interceptor(
ctx context.Context,
method string,
req interface{},
reply interface{},
cc *grpc.ClientConn,
invoker grpc.UnaryInvoker,
opts ...grpc.CallOption,
) error {
if !kmsapi.IsVersionCheckMethod(method) {
if err := g.checkAPIVersion(ctx); err != nil {
return err
}
}
return invoker(ctx, method, req, reply, cc, opts...)
}
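
A hedged usage sketch for the v1 gRPC client above: the socket path is a placeholder and assumes a KMS v1 plugin is already serving the v1beta1 API there; Encrypt and Decrypt simply round-trip a stand-in DEK through the remote provider.

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apiserver/pkg/storage/value/encrypt/envelope"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel() // cancelling the context also closes the gRPC connection

	// Placeholder endpoint: a KMS v1 plugin must already be listening here.
	svc, err := envelope.NewGRPCService(ctx, "unix:///var/run/kms-plugin.sock", 3*time.Second)
	if err != nil {
		panic(err)
	}
	wrapped, err := svc.Encrypt([]byte("example data encryption key"))
	if err != nil {
		panic(err)
	}
	plain, err := svc.Decrypt(wrapped)
	if err != nil {
		panic(err)
	}
	fmt.Printf("round-tripped %d bytes through the KMS plugin\n", len(plain))
}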

View File

@ -0,0 +1,315 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package kmsv2 transforms values for storage at rest using an Envelope v2 provider
package kmsv2
import (
"context"
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"encoding/base64"
"fmt"
"time"
"github.com/gogo/protobuf/proto"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/validation"
"k8s.io/apimachinery/pkg/util/validation/field"
"k8s.io/apiserver/pkg/storage/value"
kmstypes "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2/v2alpha1"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/metrics"
"k8s.io/klog/v2"
"k8s.io/utils/lru"
)
const (
// KMSAPIVersion is the version of the KMS API.
KMSAPIVersion = "v2alpha1"
// annotationsMaxSize is the maximum size of the annotations.
annotationsMaxSize = 32 * 1024 // 32 kB
// keyIDMaxSize is the maximum size of the keyID.
keyIDMaxSize = 1 * 1024 // 1 kB
// encryptedDEKMaxSize is the maximum size of the encrypted DEK.
encryptedDEKMaxSize = 1 * 1024 // 1 kB
)
// Service allows encrypting and decrypting data using an external Key Management Service.
type Service interface {
// Decrypt a given byte array to obtain the original data as bytes.
Decrypt(ctx context.Context, uid string, req *DecryptRequest) ([]byte, error)
// Encrypt bytes to a ciphertext.
Encrypt(ctx context.Context, uid string, data []byte) (*EncryptResponse, error)
// Status returns the status of the KMS.
Status(ctx context.Context) (*StatusResponse, error)
}
type envelopeTransformer struct {
envelopeService Service
// transformers is a thread-safe LRU cache which caches decrypted DEKs indexed by their encrypted form.
transformers *lru.Cache
// baseTransformerFunc creates a new transformer for encrypting the data with the DEK.
baseTransformerFunc func(cipher.Block) value.Transformer
cacheSize int
cacheEnabled bool
}
// EncryptResponse is the response from the Envelope service when encrypting data.
type EncryptResponse struct {
Ciphertext []byte
KeyID string
Annotations map[string][]byte
}
// DecryptRequest is the request to the Envelope service when decrypting data.
type DecryptRequest struct {
Ciphertext []byte
KeyID string
Annotations map[string][]byte
}
// StatusResponse is the response from the Envelope service when getting the status of the service.
type StatusResponse struct {
Version string
Healthz string
KeyID string
}
// NewEnvelopeTransformer returns a transformer which implements a KEK-DEK based envelope encryption scheme.
// It uses envelopeService to encrypt and decrypt DEKs. Respective DEKs (in encrypted form) are stored in the
// EncryptedObject alongside the data items they encrypt. A cache (of size cacheSize) is maintained to store the most recently
// used decrypted DEKs in memory.
func NewEnvelopeTransformer(envelopeService Service, cacheSize int, baseTransformerFunc func(cipher.Block) value.Transformer) value.Transformer {
var cache *lru.Cache
if cacheSize > 0 {
// TODO(aramase): Switch to using expiring cache: kubernetes/kubernetes/staging/src/k8s.io/apimachinery/pkg/util/cache/expiring.go.
// It handles scans a lot better, doesn't have to be right-sized, and doesn't have a global lock on reads.
cache = lru.New(cacheSize)
}
return &envelopeTransformer{
envelopeService: envelopeService,
transformers: cache,
baseTransformerFunc: baseTransformerFunc,
cacheEnabled: cacheSize > 0,
cacheSize: cacheSize,
}
}
// TransformFromStorage decrypts data encrypted by this transformer using envelope encryption.
func (t *envelopeTransformer) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
metrics.RecordArrival(metrics.FromStorageLabel, time.Now())
// Deserialize the EncryptedObject from the data.
encryptedObject, err := t.doDecode(data)
if err != nil {
return nil, false, err
}
// Look up the decrypted DEK from cache or Envelope.
transformer := t.getTransformer(encryptedObject.EncryptedDEK)
if transformer == nil {
if t.cacheEnabled {
value.RecordCacheMiss()
}
uid := string(uuid.NewUUID())
klog.V(6).InfoS("Decrypting content using envelope service", "uid", uid, "key", string(dataCtx.AuthenticatedData()))
key, err := t.envelopeService.Decrypt(ctx, uid, &DecryptRequest{
Ciphertext: encryptedObject.EncryptedDEK,
KeyID: encryptedObject.KeyID,
Annotations: encryptedObject.Annotations,
})
if err != nil {
return nil, false, fmt.Errorf("failed to decrypt DEK, error: %w", err)
}
transformer, err = t.addTransformer(encryptedObject.EncryptedDEK, key)
if err != nil {
return nil, false, err
}
}
return transformer.TransformFromStorage(ctx, encryptedObject.EncryptedData, dataCtx)
}
// TransformToStorage encrypts data to be written to disk using envelope encryption.
func (t *envelopeTransformer) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
metrics.RecordArrival(metrics.ToStorageLabel, time.Now())
newKey, err := generateKey(32)
if err != nil {
return nil, err
}
uid := string(uuid.NewUUID())
klog.V(6).InfoS("Encrypting content using envelope service", "uid", uid, "key", string(dataCtx.AuthenticatedData()))
resp, err := t.envelopeService.Encrypt(ctx, uid, newKey)
if err != nil {
return nil, fmt.Errorf("failed to encrypt DEK, error: %w", err)
}
transformer, err := t.addTransformer(resp.Ciphertext, newKey)
if err != nil {
return nil, err
}
result, err := transformer.TransformToStorage(ctx, data, dataCtx)
if err != nil {
return nil, err
}
encObject := &kmstypes.EncryptedObject{
KeyID: resp.KeyID,
EncryptedDEK: resp.Ciphertext,
EncryptedData: result,
Annotations: resp.Annotations,
}
// Serialize the EncryptedObject to a byte array.
return t.doEncode(encObject)
}
// addTransformer inserts a new transformer to the Envelope cache of DEKs for future reads.
func (t *envelopeTransformer) addTransformer(encKey []byte, key []byte) (value.Transformer, error) {
block, err := aes.NewCipher(key)
if err != nil {
return nil, err
}
transformer := t.baseTransformerFunc(block)
// Use base64 of encKey as the key into the cache because hashicorp/golang-lru
// cannot hash []uint8.
if t.cacheEnabled {
t.transformers.Add(base64.StdEncoding.EncodeToString(encKey), transformer)
metrics.RecordDekCacheFillPercent(float64(t.transformers.Len()) / float64(t.cacheSize))
}
return transformer, nil
}
// getTransformer fetches the transformer corresponding to encKey from cache, if it exists.
func (t *envelopeTransformer) getTransformer(encKey []byte) value.Transformer {
if !t.cacheEnabled {
return nil
}
_transformer, found := t.transformers.Get(base64.StdEncoding.EncodeToString(encKey))
if found {
return _transformer.(value.Transformer)
}
return nil
}
// doEncode encodes the EncryptedObject to a byte array.
func (t *envelopeTransformer) doEncode(request *kmstypes.EncryptedObject) ([]byte, error) {
if err := validateEncryptedObject(request); err != nil {
return nil, err
}
return proto.Marshal(request)
}
// doDecode decodes the byte array to an EncryptedObject.
func (t *envelopeTransformer) doDecode(originalData []byte) (*kmstypes.EncryptedObject, error) {
o := &kmstypes.EncryptedObject{}
if err := proto.Unmarshal(originalData, o); err != nil {
return nil, err
}
// validate the EncryptedObject
if err := validateEncryptedObject(o); err != nil {
return nil, err
}
return o, nil
}
// generateKey generates a random key using system randomness.
func generateKey(length int) (key []byte, err error) {
defer func(start time.Time) {
value.RecordDataKeyGeneration(start, err)
}(time.Now())
key = make([]byte, length)
if _, err = rand.Read(key); err != nil {
return nil, err
}
return key, nil
}
func validateEncryptedObject(o *kmstypes.EncryptedObject) error {
if o == nil {
return fmt.Errorf("encrypted object is nil")
}
if len(o.EncryptedData) == 0 {
return fmt.Errorf("encrypted data is empty")
}
if err := validateEncryptedDEK(o.EncryptedDEK); err != nil {
return fmt.Errorf("failed to validate encrypted DEK: %w", err)
}
if err := validateKeyID(o.KeyID); err != nil {
return fmt.Errorf("failed to validate key id: %w", err)
}
if err := validateAnnotations(o.Annotations); err != nil {
return fmt.Errorf("failed to validate annotations: %w", err)
}
return nil
}
// validateEncryptedDEK tests the following:
// 1. The encrypted DEK is not empty.
// 2. The size of encrypted DEK is less than 1 kB.
func validateEncryptedDEK(encryptedDEK []byte) error {
if len(encryptedDEK) == 0 {
return fmt.Errorf("encrypted DEK is empty")
}
if len(encryptedDEK) > encryptedDEKMaxSize {
return fmt.Errorf("encrypted DEK is %d bytes, which exceeds the max size of %d", len(encryptedDEK), encryptedDEKMaxSize)
}
return nil
}
// validateAnnotations tests the following:
// 1. Each annotation key is a fully qualified domain name.
// 2. The total size of annotation keys + values is less than 32 kB.
func validateAnnotations(annotations map[string][]byte) error {
var errs []error
var totalSize uint64
for k, v := range annotations {
if fieldErr := validation.IsFullyQualifiedDomainName(field.NewPath("annotations"), k); fieldErr != nil {
errs = append(errs, fieldErr.ToAggregate())
}
totalSize += uint64(len(k)) + uint64(len(v))
}
if totalSize > annotationsMaxSize {
errs = append(errs, fmt.Errorf("total size of annotations is %d, which exceeds the max size of %d", totalSize, annotationsMaxSize))
}
return utilerrors.NewAggregate(errs)
}
// validateKeyID tests the following:
// 1. The keyID is not empty.
// 2. The size of keyID is less than 1 kB.
func validateKeyID(keyID string) error {
if len(keyID) == 0 {
return fmt.Errorf("keyID is empty")
}
if len(keyID) > keyIDMaxSize {
return fmt.Errorf("keyID is %d bytes, which exceeds the max size of %d", len(keyID), keyIDMaxSize)
}
return nil
}
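
A hedged sketch of how the kmsv2 envelope transformer above fits together. The fakeKMS service, the newGCMTransformer helper, and the etcd key path are illustrative assumptions — the real apiserver supplies its own KMS plugin client and AES-GCM base transformer — but the method sets match the Service and value.Transformer interfaces shown in this diff.

package main

import (
	"bytes"
	"context"
	"crypto/cipher"
	"crypto/rand"
	"fmt"

	"k8s.io/apiserver/pkg/storage/value"
	"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2"
)

// fakeKMS satisfies the Service interface by "wrapping" the DEK with a static
// prefix. Only for illustration; a real Service calls an external KMS plugin.
type fakeKMS struct{}

func (f *fakeKMS) Encrypt(ctx context.Context, uid string, data []byte) (*kmsv2.EncryptResponse, error) {
	return &kmsv2.EncryptResponse{
		Ciphertext:  append([]byte("wrapped:"), data...),
		KeyID:       "key-1",
		Annotations: map[string][]byte{"provider.example.com": []byte("fake")},
	}, nil
}

func (f *fakeKMS) Decrypt(ctx context.Context, uid string, req *kmsv2.DecryptRequest) ([]byte, error) {
	return bytes.TrimPrefix(req.Ciphertext, []byte("wrapped:")), nil
}

func (f *fakeKMS) Status(ctx context.Context) (*kmsv2.StatusResponse, error) {
	return &kmsv2.StatusResponse{Version: "v2alpha1", Healthz: "ok", KeyID: "key-1"}, nil
}

// gcmTransformer is a minimal stand-in for the AES-GCM base transformer that
// the apiserver normally passes as baseTransformerFunc.
type gcmTransformer struct{ aead cipher.AEAD }

func newGCMTransformer(block cipher.Block) value.Transformer {
	aead, err := cipher.NewGCM(block)
	if err != nil {
		panic(err)
	}
	return &gcmTransformer{aead: aead}
}

func (t *gcmTransformer) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
	nonce := make([]byte, t.aead.NonceSize())
	if _, err := rand.Read(nonce); err != nil {
		return nil, err
	}
	// The nonce is prepended to the sealed payload; AuthenticatedData binds the etcd key.
	return t.aead.Seal(nonce, nonce, data, dataCtx.AuthenticatedData()), nil
}

func (t *gcmTransformer) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
	ns := t.aead.NonceSize()
	if len(data) < ns {
		return nil, false, fmt.Errorf("stored data shorter than nonce")
	}
	out, err := t.aead.Open(nil, data[:ns], data[ns:], dataCtx.AuthenticatedData())
	return out, false, err
}

func main() {
	ctx := context.Background()
	transformer := kmsv2.NewEnvelopeTransformer(&fakeKMS{}, 16, newGCMTransformer)
	dataCtx := value.DefaultContext([]byte("/registry/secrets/default/s1"))

	stored, err := transformer.TransformToStorage(ctx, []byte(`{"kind":"Secret"}`), dataCtx)
	if err != nil {
		panic(err)
	}
	plain, _, err := transformer.TransformFromStorage(ctx, stored, dataCtx)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(plain))
}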

View File

@ -0,0 +1,139 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package kmsv2 transforms values for storage at rest using an Envelope provider
package kmsv2
import (
"context"
"fmt"
"net"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/util"
"k8s.io/klog/v2"
kmsapi "k8s.io/kms/apis/v2alpha1"
)
const (
// unixProtocol is the only supported protocol for remote KMS provider.
unixProtocol = "unix"
)
// The gRPC implementation for envelope.Service.
type gRPCService struct {
kmsClient kmsapi.KeyManagementServiceClient
connection *grpc.ClientConn
callTimeout time.Duration
}
// NewGRPCService returns an envelope.Service which uses gRPC to communicate with the remote KMS provider.
func NewGRPCService(ctx context.Context, endpoint string, callTimeout time.Duration) (Service, error) {
klog.V(4).Infof("Configure KMS provider with endpoint: %s", endpoint)
addr, err := util.ParseEndpoint(endpoint)
if err != nil {
return nil, err
}
s := &gRPCService{callTimeout: callTimeout}
s.connection, err = grpc.Dial(
addr,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(grpc.WaitForReady(true)),
grpc.WithContextDialer(
func(context.Context, string) (net.Conn, error) {
// Ignoring addr and timeout arguments:
// addr - comes from the closure
c, err := net.DialUnix(unixProtocol, nil, &net.UnixAddr{Name: addr})
if err != nil {
klog.Errorf("failed to create connection to unix socket: %s, error: %v", addr, err)
} else {
klog.V(4).Infof("Successfully dialed Unix socket %v", addr)
}
return c, err
}))
if err != nil {
return nil, fmt.Errorf("failed to create connection to %s, error: %v", endpoint, err)
}
s.kmsClient = kmsapi.NewKeyManagementServiceClient(s.connection)
go func() {
defer utilruntime.HandleCrash()
<-ctx.Done()
_ = s.connection.Close()
}()
return s, nil
}
// Decrypt a given data string to obtain the original byte data.
func (g *gRPCService) Decrypt(ctx context.Context, uid string, req *DecryptRequest) ([]byte, error) {
ctx, cancel := context.WithTimeout(ctx, g.callTimeout)
defer cancel()
request := &kmsapi.DecryptRequest{
Ciphertext: req.Ciphertext,
Uid: uid,
KeyId: req.KeyID,
Annotations: req.Annotations,
}
response, err := g.kmsClient.Decrypt(ctx, request)
if err != nil {
return nil, err
}
return response.Plaintext, nil
}
// Encrypt bytes to a string ciphertext.
func (g *gRPCService) Encrypt(ctx context.Context, uid string, plaintext []byte) (*EncryptResponse, error) {
ctx, cancel := context.WithTimeout(ctx, g.callTimeout)
defer cancel()
request := &kmsapi.EncryptRequest{
Plaintext: plaintext,
Uid: uid,
}
response, err := g.kmsClient.Encrypt(ctx, request)
if err != nil {
return nil, err
}
return &EncryptResponse{
Ciphertext: response.Ciphertext,
KeyID: response.KeyId,
Annotations: response.Annotations,
}, nil
}
// Status returns the status of the KMSv2 provider.
func (g *gRPCService) Status(ctx context.Context) (*StatusResponse, error) {
ctx, cancel := context.WithTimeout(ctx, g.callTimeout)
defer cancel()
request := &kmsapi.StatusRequest{}
response, err := g.kmsClient.Status(ctx, request)
if err != nil {
return nil, err
}
return &StatusResponse{Version: response.Version, Healthz: response.Healthz, KeyID: response.KeyId}, nil
}
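
A hedged usage sketch for the kmsv2 gRPC client above: the socket path and request UID are placeholders and assume a plugin serving the v2alpha1 API; the health check runs before a DEK is wrapped and unwrapped.

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Placeholder endpoint: a KMS v2 plugin speaking v2alpha1 must be serving here.
	svc, err := kmsv2.NewGRPCService(ctx, "unix:///var/run/kmsv2-plugin.sock", 3*time.Second)
	if err != nil {
		panic(err)
	}
	status, err := svc.Status(ctx)
	if err != nil {
		panic(err)
	}
	fmt.Println("healthz:", status.Healthz, "keyID:", status.KeyID)

	resp, err := svc.Encrypt(ctx, "request-uid-1", []byte("example data encryption key"))
	if err != nil {
		panic(err)
	}
	plain, err := svc.Decrypt(ctx, "request-uid-1", &kmsv2.DecryptRequest{
		Ciphertext:  resp.Ciphertext,
		KeyID:       resp.KeyID,
		Annotations: resp.Annotations,
	})
	if err != nil {
		panic(err)
	}
	fmt.Printf("recovered %d bytes\n", len(plain))
}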

View File

@ -0,0 +1,9 @@
# See the OWNERS docs at https://go.k8s.io/owners
# Disable inheritance as this is an api owners file
options:
no_parent_owners: true
approvers:
- api-approvers
reviewers:
- sig-auth-api-reviewers

View File

@ -0,0 +1,128 @@
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: api.proto
package v2alpha1
import (
fmt "fmt"
proto "github.com/gogo/protobuf/proto"
math "math"
)
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
// EncryptedObject is the representation of data stored in etcd after envelope encryption.
type EncryptedObject struct {
// EncryptedData is the encrypted data.
EncryptedData []byte `protobuf:"bytes,1,opt,name=encryptedData,proto3" json:"encryptedData,omitempty"`
// KeyID is the KMS key ID used for encryption operations.
KeyID string `protobuf:"bytes,2,opt,name=keyID,proto3" json:"keyID,omitempty"`
// EncryptedDEK is the encrypted DEK.
EncryptedDEK []byte `protobuf:"bytes,3,opt,name=encryptedDEK,proto3" json:"encryptedDEK,omitempty"`
// Annotations is additional metadata that was provided by the KMS plugin.
Annotations map[string][]byte `protobuf:"bytes,4,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
func (m *EncryptedObject) Reset() { *m = EncryptedObject{} }
func (m *EncryptedObject) String() string { return proto.CompactTextString(m) }
func (*EncryptedObject) ProtoMessage() {}
func (*EncryptedObject) Descriptor() ([]byte, []int) {
return fileDescriptor_00212fb1f9d3bf1c, []int{0}
}
func (m *EncryptedObject) XXX_Unmarshal(b []byte) error {
return xxx_messageInfo_EncryptedObject.Unmarshal(m, b)
}
func (m *EncryptedObject) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
return xxx_messageInfo_EncryptedObject.Marshal(b, m, deterministic)
}
func (m *EncryptedObject) XXX_Merge(src proto.Message) {
xxx_messageInfo_EncryptedObject.Merge(m, src)
}
func (m *EncryptedObject) XXX_Size() int {
return xxx_messageInfo_EncryptedObject.Size(m)
}
func (m *EncryptedObject) XXX_DiscardUnknown() {
xxx_messageInfo_EncryptedObject.DiscardUnknown(m)
}
var xxx_messageInfo_EncryptedObject proto.InternalMessageInfo
func (m *EncryptedObject) GetEncryptedData() []byte {
if m != nil {
return m.EncryptedData
}
return nil
}
func (m *EncryptedObject) GetKeyID() string {
if m != nil {
return m.KeyID
}
return ""
}
func (m *EncryptedObject) GetEncryptedDEK() []byte {
if m != nil {
return m.EncryptedDEK
}
return nil
}
func (m *EncryptedObject) GetAnnotations() map[string][]byte {
if m != nil {
return m.Annotations
}
return nil
}
func init() {
proto.RegisterType((*EncryptedObject)(nil), "v2alpha1.EncryptedObject")
proto.RegisterMapType((map[string][]byte)(nil), "v2alpha1.EncryptedObject.AnnotationsEntry")
}
func init() { proto.RegisterFile("api.proto", fileDescriptor_00212fb1f9d3bf1c) }
var fileDescriptor_00212fb1f9d3bf1c = []byte{
// 200 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x4c, 0x2c, 0xc8, 0xd4,
0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0xe2, 0x28, 0x33, 0x4a, 0xcc, 0x29, 0xc8, 0x48, 0x34, 0x54,
0xfa, 0xcf, 0xc8, 0xc5, 0xef, 0x9a, 0x97, 0x5c, 0x54, 0x59, 0x50, 0x92, 0x9a, 0xe2, 0x9f, 0x94,
0x95, 0x9a, 0x5c, 0x22, 0xa4, 0xc2, 0xc5, 0x9b, 0x0a, 0x13, 0x72, 0x49, 0x2c, 0x49, 0x94, 0x60,
0x54, 0x60, 0xd4, 0xe0, 0x09, 0x42, 0x15, 0x14, 0x12, 0xe1, 0x62, 0xcd, 0x4e, 0xad, 0xf4, 0x74,
0x91, 0x60, 0x52, 0x60, 0xd4, 0xe0, 0x0c, 0x82, 0x70, 0x84, 0x94, 0xb8, 0x78, 0x10, 0xca, 0x5c,
0xbd, 0x25, 0x98, 0xc1, 0x5a, 0x51, 0xc4, 0x84, 0x7c, 0xb8, 0xb8, 0x13, 0xf3, 0xf2, 0xf2, 0x4b,
0x12, 0x4b, 0x32, 0xf3, 0xf3, 0x8a, 0x25, 0x58, 0x14, 0x98, 0x35, 0xb8, 0x8d, 0xb4, 0xf4, 0x60,
0x6e, 0xd2, 0x43, 0x73, 0x8f, 0x9e, 0x23, 0x42, 0xb1, 0x6b, 0x5e, 0x49, 0x51, 0x65, 0x10, 0xb2,
0x76, 0x29, 0x3b, 0x2e, 0x01, 0x74, 0x05, 0x42, 0x02, 0x5c, 0xcc, 0xd9, 0xa9, 0x95, 0x60, 0x77,
0x73, 0x06, 0x81, 0x98, 0x20, 0xd7, 0x96, 0x25, 0xe6, 0x94, 0xa6, 0x82, 0x5d, 0xcb, 0x13, 0x04,
0xe1, 0x58, 0x31, 0x59, 0x30, 0x26, 0xb1, 0x81, 0x83, 0xc4, 0x18, 0x10, 0x00, 0x00, 0xff, 0xff,
0x88, 0x8c, 0xbb, 0x4e, 0x1f, 0x01, 0x00, 0x00,
}
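
A small sketch (not part of the generated file) of how EncryptedObject is serialized and parsed, mirroring doEncode/doDecode in the kmsv2 transformer earlier in this diff; the field values are illustrative.

package main

import (
	"fmt"

	"github.com/gogo/protobuf/proto"
	"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2/v2alpha1"
)

func main() {
	obj := &v2alpha1.EncryptedObject{
		EncryptedData: []byte("sealed payload"),
		KeyID:         "key-1",
		EncryptedDEK:  []byte("wrapped DEK"),
		Annotations:   map[string][]byte{"provider.example.com": []byte("fake")},
	}
	// Serialize for storage in etcd.
	raw, err := proto.Marshal(obj)
	if err != nil {
		panic(err)
	}
	// Parse it back on the read path.
	decoded := &v2alpha1.EncryptedObject{}
	if err := proto.Unmarshal(raw, decoded); err != nil {
		panic(err)
	}
	fmt.Println(decoded.GetKeyID(), len(decoded.GetEncryptedData()))
}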

View File

@ -0,0 +1,35 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// To regenerate api.pb.go run hack/update-generated-kms.sh
syntax = "proto3";
package v2alpha1;
// EncryptedObject is the representation of data stored in etcd after envelope encryption.
message EncryptedObject {
// EncryptedData is the encrypted data.
bytes encryptedData = 1;
// KeyID is the KMS key ID used for encryption operations.
string keyID = 2;
// EncryptedDEK is the encrypted DEK.
bytes encryptedDEK = 3;
// Annotations is additional metadata that was provided by the KMS plugin.
map<string, bytes> annotations = 4;
}

View File

@ -0,0 +1,18 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package v2alpha1 contains definition of kms-plugin's serialized types.
package v2alpha1

View File

@ -0,0 +1,106 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"time"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
const (
namespace = "apiserver"
subsystem = "envelope_encryption"
FromStorageLabel = "from_storage"
ToStorageLabel = "to_storage"
)
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level (https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with
* the metric stability policy.
*/
var (
lockLastFromStorage sync.Mutex
lockLastToStorage sync.Mutex
lastFromStorage time.Time
lastToStorage time.Time
dekCacheFillPercent = metrics.NewGauge(
&metrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "dek_cache_fill_percent",
Help: "Percent of the cache slots currently occupied by cached DEKs.",
StabilityLevel: metrics.ALPHA,
},
)
dekCacheInterArrivals = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "dek_cache_inter_arrival_time_seconds",
Help: "Time (in seconds) of inter arrival of transformation requests.",
StabilityLevel: metrics.ALPHA,
Buckets: metrics.ExponentialBuckets(60, 2, 10),
},
[]string{"transformation_type"},
)
)
var registerMetricsFunc sync.Once
func RegisterMetrics() {
registerMetricsFunc.Do(func() {
legacyregistry.MustRegister(dekCacheFillPercent)
legacyregistry.MustRegister(dekCacheInterArrivals)
})
}
func RecordArrival(transformationType string, start time.Time) {
switch transformationType {
case FromStorageLabel:
lockLastFromStorage.Lock()
defer lockLastFromStorage.Unlock()
if lastFromStorage.IsZero() {
lastFromStorage = start
}
dekCacheInterArrivals.WithLabelValues(transformationType).Observe(start.Sub(lastFromStorage).Seconds())
lastFromStorage = start
case ToStorageLabel:
lockLastToStorage.Lock()
defer lockLastToStorage.Unlock()
if lastToStorage.IsZero() {
lastToStorage = start
}
dekCacheInterArrivals.WithLabelValues(transformationType).Observe(start.Sub(lastToStorage).Seconds())
lastToStorage = start
}
}
func RecordDekCacheFillPercent(percent float64) {
dekCacheFillPercent.Set(percent)
}
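
A brief usage sketch for the metrics helpers above; the 0.25 cache-fill value is illustrative.

package main

import (
	"time"

	"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/metrics"
)

func main() {
	// Registration is idempotent (guarded by sync.Once above).
	metrics.RegisterMetrics()

	// Record an inter-arrival observation for a read-path transformation
	// and report the DEK cache as 25% full.
	metrics.RecordArrival(metrics.FromStorageLabel, time.Now())
	metrics.RecordDekCacheFillPercent(0.25)
}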

View File

@ -0,0 +1,54 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"fmt"
"net/url"
"strings"
)
const (
// unixProtocol is the only supported protocol for remote KMS provider.
unixProtocol = "unix"
)
// ParseEndpoint parses the endpoint to extract the scheme, host, or path.
func ParseEndpoint(endpoint string) (string, error) {
if len(endpoint) == 0 {
return "", fmt.Errorf("remote KMS provider can't use empty string as endpoint")
}
u, err := url.Parse(endpoint)
if err != nil {
return "", fmt.Errorf("invalid endpoint %q for remote KMS provider, error: %v", endpoint, err)
}
if u.Scheme != unixProtocol {
return "", fmt.Errorf("unsupported scheme %q for remote KMS provider", u.Scheme)
}
// Linux abstract namespace socket - no physical file is required.
// Warning: Linux abstract sockets have no concept of ACLs (unlike traditional file-based sockets).
// However, Linux abstract sockets are scoped to the Linux network namespace, so they will only be accessible to
// containers within the same pod (unless host networking is used).
if strings.HasPrefix(u.Path, "/@") {
return strings.TrimPrefix(u.Path, "/"), nil
}
return u.Path, nil
}
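
A short sketch of ParseEndpoint's accepted and rejected inputs; the socket names are placeholders.

package main

import (
	"fmt"

	"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/util"
)

func main() {
	// Regular Unix-domain socket: the "unix" scheme is stripped, the path kept.
	addr, err := util.ParseEndpoint("unix:///var/run/kms-plugin.sock")
	fmt.Println(addr, err) // /var/run/kms-plugin.sock <nil>

	// Linux abstract socket: only the leading "/" is trimmed, so the "@" form survives.
	addr, err = util.ParseEndpoint("unix:///@kms-plugin")
	fmt.Println(addr, err) // @kms-plugin <nil>

	// Anything other than unix:// is rejected.
	_, err = util.ParseEndpoint("tcp://127.0.0.1:1234")
	fmt.Println(err) // unsupported scheme "tcp" for remote KMS provider
}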

View File

@ -0,0 +1,57 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package identity
import (
"bytes"
"context"
"fmt"
"k8s.io/apiserver/pkg/storage/value"
)
var (
transformer = identityTransformer{}
encryptedPrefix = []byte("k8s:enc:")
errEncryptedData = fmt.Errorf("identity transformer tried to read encrypted data")
)
// identityTransformer performs no transformation on provided data, but validates
// that the data is not encrypted data during TransformFromStorage
type identityTransformer struct{}
// NewEncryptCheckTransformer returns an identityTransformer which returns an error
// on attempts to read encrypted data
func NewEncryptCheckTransformer() value.Transformer {
return transformer
}
// TransformFromStorage returns the input bytes if the data is not encrypted
func (identityTransformer) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
// identityTransformer has to return an error if the data is encoded using another transformer.
// JSON data starts with '{'. Protobuf data has a prefix 'k8s[\x00-\xFF]'.
// Prefix 'k8s:enc:' is reserved for encrypted data on disk.
if bytes.HasPrefix(data, encryptedPrefix) {
return nil, false, errEncryptedData
}
return data, false, nil
}
// TransformToStorage implements the Transformer interface for identityTransformer
func (identityTransformer) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
return data, nil
}
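
A short usage sketch for the identity transformer above, assuming the package's usual vendored import path; the sample payloads are illustrative.

package main

import (
	"context"
	"fmt"

	"k8s.io/apiserver/pkg/storage/value"
	"k8s.io/apiserver/pkg/storage/value/encrypt/identity"
)

func main() {
	ctx := context.Background()
	t := identity.NewEncryptCheckTransformer()

	// Plain (unencrypted) payloads pass straight through.
	out, stale, err := t.TransformFromStorage(ctx, []byte(`{"kind":"Secret"}`), value.DefaultContext(nil))
	fmt.Println(string(out), stale, err) // {"kind":"Secret"} false <nil>

	// Anything carrying the reserved encryption prefix is refused.
	_, _, err = t.TransformFromStorage(ctx, []byte("k8s:enc:aescbc:v1:key1:..."), value.DefaultContext(nil))
	fmt.Println(err) // identity transformer tried to read encrypted data
}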

View File

@ -0,0 +1,70 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package secretbox transforms values for storage at rest using XSalsa20 and Poly1305.
package secretbox
import (
"context"
"crypto/rand"
"fmt"
"golang.org/x/crypto/nacl/secretbox"
"k8s.io/apiserver/pkg/storage/value"
)
// secretbox implements at rest encryption of the provided values given a 32 byte secret key.
// Uses a standard 24 byte nonce (placed at the beginning of the cipher text) generated
// from crypto/rand. Does not perform authentication of the data at rest.
type secretboxTransformer struct {
key [32]byte
}
const nonceSize = 24
// NewSecretboxTransformer takes the given key and performs encryption and decryption on the given
// data.
func NewSecretboxTransformer(key [32]byte) value.Transformer {
return &secretboxTransformer{key: key}
}
func (t *secretboxTransformer) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
if len(data) < (secretbox.Overhead + nonceSize) {
return nil, false, fmt.Errorf("the stored data was shorter than the required size")
}
var nonce [nonceSize]byte
copy(nonce[:], data[:nonceSize])
data = data[nonceSize:]
out := make([]byte, 0, len(data)-secretbox.Overhead)
result, ok := secretbox.Open(out, data, &nonce, &t.key)
if !ok {
return nil, false, fmt.Errorf("output array was not large enough for encryption")
}
return result, false, nil
}
func (t *secretboxTransformer) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
var nonce [nonceSize]byte
n, err := rand.Read(nonce[:])
if err != nil {
return nil, err
}
if n != nonceSize {
return nil, fmt.Errorf("unable to read sufficient random bytes")
}
return secretbox.Seal(nonce[:], data, &nonce, &t.key), nil
}
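
A round-trip sketch for the secretbox transformer above, assuming the package's usual vendored import path; the key is freshly random and the payload illustrative.

package main

import (
	"context"
	"crypto/rand"
	"fmt"

	"k8s.io/apiserver/pkg/storage/value"
	"k8s.io/apiserver/pkg/storage/value/encrypt/secretbox"
)

func main() {
	ctx := context.Background()

	// 32-byte secret key required by XSalsa20-Poly1305.
	var key [32]byte
	if _, err := rand.Read(key[:]); err != nil {
		panic(err)
	}

	t := secretbox.NewSecretboxTransformer(key)
	sealed, err := t.TransformToStorage(ctx, []byte("payload"), value.DefaultContext(nil))
	if err != nil {
		panic(err)
	}
	opened, stale, err := t.TransformFromStorage(ctx, sealed, value.DefaultContext(nil))
	if err != nil {
		panic(err)
	}
	fmt.Println(string(opened), stale) // payload false
}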

141
vendor/k8s.io/apiserver/pkg/storage/value/metrics.go generated vendored Normal file
View File

@ -0,0 +1,141 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package value
import (
"sync"
"time"
"google.golang.org/grpc/status"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
const (
namespace = "apiserver"
subsystem = "storage"
)
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level (https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with
* the metric stability policy.
*/
var (
transformerLatencies = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "transformation_duration_seconds",
Help: "Latencies in seconds of value transformation operations.",
// In-process transformations (ex. AES CBC) complete on the order of 20 microseconds. However, when
// external KMS is involved latencies may climb into hundreds of milliseconds.
Buckets: metrics.ExponentialBuckets(5e-6, 2, 25),
StabilityLevel: metrics.ALPHA,
},
[]string{"transformation_type"},
)
transformerOperationsTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "transformation_operations_total",
Help: "Total number of transformations.",
StabilityLevel: metrics.ALPHA,
},
[]string{"transformation_type", "transformer_prefix", "status"},
)
envelopeTransformationCacheMissTotal = metrics.NewCounter(
&metrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "envelope_transformation_cache_misses_total",
Help: "Total number of cache misses while accessing key decryption key(KEK).",
StabilityLevel: metrics.ALPHA,
},
)
dataKeyGenerationLatencies = metrics.NewHistogram(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "data_key_generation_duration_seconds",
Help: "Latencies in seconds of data encryption key(DEK) generation operations.",
Buckets: metrics.ExponentialBuckets(5e-6, 2, 14),
StabilityLevel: metrics.ALPHA,
},
)
dataKeyGenerationFailuresTotal = metrics.NewCounter(
&metrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "data_key_generation_failures_total",
Help: "Total number of failed data encryption key(DEK) generation operations.",
StabilityLevel: metrics.ALPHA,
},
)
)
var registerMetrics sync.Once
func RegisterMetrics() {
registerMetrics.Do(func() {
legacyregistry.MustRegister(transformerLatencies)
legacyregistry.MustRegister(transformerOperationsTotal)
legacyregistry.MustRegister(envelopeTransformationCacheMissTotal)
legacyregistry.MustRegister(dataKeyGenerationLatencies)
legacyregistry.MustRegister(dataKeyGenerationFailuresTotal)
})
}
// RecordTransformation records latencies and count of TransformFromStorage and TransformToStorage operations.
// Note that transformation_failures_total metric is deprecated, use transformation_operations_total instead.
func RecordTransformation(transformationType, transformerPrefix string, start time.Time, err error) {
transformerOperationsTotal.WithLabelValues(transformationType, transformerPrefix, status.Code(err).String()).Inc()
switch {
case err == nil:
transformerLatencies.WithLabelValues(transformationType).Observe(sinceInSeconds(start))
}
}
// RecordCacheMiss records a miss on Key Encryption Key(KEK) - call to KMS was required to decrypt KEK.
func RecordCacheMiss() {
envelopeTransformationCacheMissTotal.Inc()
}
// RecordDataKeyGeneration records latencies and count of Data Encryption Key generation operations.
func RecordDataKeyGeneration(start time.Time, err error) {
if err != nil {
dataKeyGenerationFailuresTotal.Inc()
return
}
dataKeyGenerationLatencies.Observe(sinceInSeconds(start))
}
// sinceInSeconds gets the time since the specified start in seconds.
func sinceInSeconds(start time.Time) float64 {
return time.Since(start).Seconds()
}

View File

@ -0,0 +1,166 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package value contains methods for assisting with transformation of values in storage.
package value
import (
"bytes"
"context"
"fmt"
"time"
"k8s.io/apimachinery/pkg/util/errors"
)
func init() {
RegisterMetrics()
}
// Context is additional information that a storage transformation may need to verify the data at rest.
type Context interface {
// AuthenticatedData should return an array of bytes that describes the current value. If the value changes,
// the transformer may report the value as unreadable or tampered. This may be nil if no such description exists
// or is needed. For additional verification, set this to data that strongly identifies the value, such as
// the key and creation version of the stored data.
AuthenticatedData() []byte
}
// Transformer allows a value to be transformed before being read from or written to the underlying store. The methods
// must be able to undo the transformation caused by the other.
type Transformer interface {
// TransformFromStorage may transform the provided data from its underlying storage representation or return an error.
// Stale is true if the object on disk is stale and a write to etcd should be issued, even if the contents of the object
// have not changed.
TransformFromStorage(ctx context.Context, data []byte, dataCtx Context) (out []byte, stale bool, err error)
// TransformToStorage may transform the provided data into the appropriate form in storage or return an error.
TransformToStorage(ctx context.Context, data []byte, dataCtx Context) (out []byte, err error)
}
// DefaultContext is a simple implementation of Context for a slice of bytes.
type DefaultContext []byte
// AuthenticatedData returns itself.
func (c DefaultContext) AuthenticatedData() []byte { return c }
// PrefixTransformer holds a transformer interface and the prefix that the transformation is located under.
type PrefixTransformer struct {
Prefix []byte
Transformer Transformer
}
type prefixTransformers struct {
transformers []PrefixTransformer
err error
}
var _ Transformer = &prefixTransformers{}
// NewPrefixTransformers supports the Transformer interface by checking the incoming data against the provided
// prefixes in order. The first matching prefix will be used to transform the value (the prefix is stripped
// before the Transformer interface is invoked). The first provided transformer will be used when writing to
// the store.
func NewPrefixTransformers(err error, transformers ...PrefixTransformer) Transformer {
if err == nil {
err = fmt.Errorf("the provided value does not match any of the supported transformers")
}
return &prefixTransformers{
transformers: transformers,
err: err,
}
}
// TransformFromStorage finds the first transformer with a prefix matching the provided data and returns
// the result of transforming the value. Any value that was not decoded by the first transformer is
// reported as stale so that it will be rewritten with the first (write) transformer.
func (t *prefixTransformers) TransformFromStorage(ctx context.Context, data []byte, dataCtx Context) ([]byte, bool, error) {
start := time.Now()
var errs []error
for i, transformer := range t.transformers {
if bytes.HasPrefix(data, transformer.Prefix) {
result, stale, err := transformer.Transformer.TransformFromStorage(ctx, data[len(transformer.Prefix):], dataCtx)
// To migrate away from encryption, user can specify an identity transformer higher up
// (in the config file) than the encryption transformer. In that scenario, the identity transformer needs to
// identify (during reads from disk) whether the data being read is encrypted or not. If the data is encrypted,
// it shall throw an error, but that error should not prevent the next subsequent transformer from being tried.
if len(transformer.Prefix) == 0 && err != nil {
continue
}
if len(transformer.Prefix) == 0 {
RecordTransformation("from_storage", "identity", start, err)
} else {
RecordTransformation("from_storage", string(transformer.Prefix), start, err)
}
// It is valid to have overlapping prefixes when the same encryption provider
// is specified multiple times but with different keys (the first provider is
// being rotated to and some later provider is being rotated away from).
//
// Example:
//
// {
// "aescbc": {
// "keys": [
// {
// "name": "2",
// "secret": "some key 2"
// }
// ]
// }
// },
// {
// "aescbc": {
// "keys": [
// {
// "name": "1",
// "secret": "some key 1"
// }
// ]
// }
// },
//
// The transformers for both aescbc configs share the prefix k8s:enc:aescbc:v1:
// but a failure in the first one should not prevent a later match from being attempted.
// Thus we never short-circuit on a prefix match that results in an error.
if err != nil {
errs = append(errs, err)
continue
}
return result, stale || i != 0, err
}
}
if err := errors.Reduce(errors.NewAggregate(errs)); err != nil {
return nil, false, err
}
RecordTransformation("from_storage", "unknown", start, t.err)
return nil, false, t.err
}
// TransformToStorage uses the first transformer and adds its prefix to the data.
func (t *prefixTransformers) TransformToStorage(ctx context.Context, data []byte, dataCtx Context) ([]byte, error) {
start := time.Now()
transformer := t.transformers[0]
result, err := transformer.Transformer.TransformToStorage(ctx, data, dataCtx)
RecordTransformation("to_storage", string(transformer.Prefix), start, err)
if err != nil {
return nil, err
}
prefixedData := make([]byte, len(transformer.Prefix), len(result)+len(transformer.Prefix))
copy(prefixedData, transformer.Prefix)
prefixedData = append(prefixedData, result...)
return prefixedData, nil
}
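
A hedged sketch combining the prefix transformer above with the secretbox and identity transformers from earlier in this diff; the prefix string and etcd key path are illustrative, matching the "k8s:enc:" convention reserved for encrypted data.

package main

import (
	"context"
	"crypto/rand"
	"fmt"

	"k8s.io/apiserver/pkg/storage/value"
	"k8s.io/apiserver/pkg/storage/value/encrypt/identity"
	"k8s.io/apiserver/pkg/storage/value/encrypt/secretbox"
)

func main() {
	ctx := context.Background()
	dataCtx := value.DefaultContext([]byte("/registry/secrets/default/s1"))

	var key [32]byte
	if _, err := rand.Read(key[:]); err != nil {
		panic(err)
	}

	// The first entry is the write transformer; the empty-prefix identity entry lets
	// pre-existing plaintext be read (and reported stale) during migration.
	t := value.NewPrefixTransformers(nil,
		value.PrefixTransformer{Prefix: []byte("k8s:enc:secretbox:v1:key1:"), Transformer: secretbox.NewSecretboxTransformer(key)},
		value.PrefixTransformer{Prefix: []byte{}, Transformer: identity.NewEncryptCheckTransformer()},
	)

	stored, err := t.TransformToStorage(ctx, []byte(`{"kind":"Secret"}`), dataCtx)
	if err != nil {
		panic(err)
	}
	_, stale, err := t.TransformFromStorage(ctx, stored, dataCtx)
	fmt.Println(stale, err) // false <nil>: decoded by the first (write) transformer

	_, stale, err = t.TransformFromStorage(ctx, []byte(`{"kind":"Secret"}`), dataCtx)
	fmt.Println(stale, err) // true <nil>: plaintext matched the identity entry, so it gets rewritten encrypted
}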