rebase: update kubernetes to 1.28.0 in main

updating kubernetes to 1.28.0
in the main repo.

Signed-off-by: Madhu Rajanna <madhupr007@gmail.com>
This commit is contained in:
Madhu Rajanna
2023-08-17 07:15:28 +02:00
committed by mergify[bot]
parent b2fdc269c3
commit ff3e84ad67
706 changed files with 45252 additions and 16346 deletions

View File

@ -1,11 +1,9 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- lavalamp
- liggitt
- wojtek-t
reviewers:
- lavalamp
- smarterclayton
- wojtek-t
- deads2k
@ -16,6 +14,8 @@ reviewers:
- ingvagabund
- enj
- stevekuznetsov
- MadhavJivrajani
emeritus_approvers:
- xiang90
- timothysc
- lavalamp

View File

@ -104,7 +104,7 @@ type Config struct {
Codec runtime.Codec
Clock clock.Clock
Clock clock.WithTicker
}
type watchersMap map[int]*cacheWatcher
@ -184,7 +184,6 @@ func (i *indexedWatchers) terminateAll(groupResource schema.GroupResource, done
// second in a bucket, and pop up them once at the timeout. To be more specific,
// if you set fire time at X, you can get the bookmark within (X-1,X+1) period.
type watcherBookmarkTimeBuckets struct {
lock sync.Mutex
// the key of watcherBuckets is the number of seconds since createTime
watchersBuckets map[int64][]*cacheWatcher
createTime time.Time
@ -205,7 +204,7 @@ func newTimeBucketWatchers(clock clock.Clock, bookmarkFrequency time.Duration) *
// adds a watcher to the bucket, if the deadline is before the start, it will be
// added to the first one.
func (t *watcherBookmarkTimeBuckets) addWatcher(w *cacheWatcher) bool {
func (t *watcherBookmarkTimeBuckets) addWatcherThreadUnsafe(w *cacheWatcher) bool {
// note that the returned time can be before t.createTime,
// especially in cases when the nextBookmarkTime method
// give us the zero value of type Time
@ -215,8 +214,6 @@ func (t *watcherBookmarkTimeBuckets) addWatcher(w *cacheWatcher) bool {
return false
}
bucketID := int64(nextTime.Sub(t.createTime) / time.Second)
t.lock.Lock()
defer t.lock.Unlock()
if bucketID < t.startBucketID {
bucketID = t.startBucketID
}
@ -225,12 +222,10 @@ func (t *watcherBookmarkTimeBuckets) addWatcher(w *cacheWatcher) bool {
return true
}
func (t *watcherBookmarkTimeBuckets) popExpiredWatchers() [][]*cacheWatcher {
func (t *watcherBookmarkTimeBuckets) popExpiredWatchersThreadUnsafe() [][]*cacheWatcher {
currentBucketID := int64(t.clock.Since(t.createTime) / time.Second)
// There should be one or two elements in almost all cases
expiredWatchers := make([][]*cacheWatcher, 0, 2)
t.lock.Lock()
defer t.lock.Unlock()
for ; t.startBucketID <= currentBucketID; t.startBucketID++ {
if watchers, ok := t.watchersBuckets[t.startBucketID]; ok {
delete(t.watchersBuckets, t.startBucketID)
@ -328,11 +323,16 @@ type Cacher struct {
// dispatching that event to avoid race with closing channels in watchers.
watchersToStop []*cacheWatcher
// Maintain a timeout queue to send the bookmark event before the watcher times out.
// Note that this field when accessed MUST be protected by the Cacher.lock.
bookmarkWatchers *watcherBookmarkTimeBuckets
// expiredBookmarkWatchers is a list of watchers that have expired and need to be scheduled for the next bookmark event
expiredBookmarkWatchers []*cacheWatcher
}
// RequestWatchProgress forwards the progress request to the underlying
// storage and returns whatever error that call produces.
func (c *Cacher) RequestWatchProgress(ctx context.Context) error {
return c.storage.RequestWatchProgress(ctx)
}
// NewCacherFromConfig creates a new Cacher responsible for servicing WATCH and LIST requests from
// its internal cache and updating its cache in the background based on the
// given configuration.
@ -401,10 +401,10 @@ func NewCacherFromConfig(config Config) (*Cacher, error) {
// so that future reuse does not get a spurious timeout.
<-cacher.timer.C
}
progressRequester := newConditionalProgressRequester(config.Storage.RequestWatchProgress, config.Clock)
watchCache := newWatchCache(
config.KeyFunc, cacher.processEvent, config.GetAttrsFunc, config.Versioner, config.Indexers, config.Clock, config.GroupResource)
listerWatcher := NewCacherListerWatcher(config.Storage, config.ResourcePrefix, config.NewListFunc)
config.KeyFunc, cacher.processEvent, config.GetAttrsFunc, config.Versioner, config.Indexers, config.Clock, config.GroupResource, progressRequester)
listerWatcher := NewListerWatcher(config.Storage, config.ResourcePrefix, config.NewListFunc)
reflectorName := "storage/cacher.go:" + config.ResourcePrefix
reflector := cache.NewNamedReflector(reflectorName, listerWatcher, obj, watchCache, 0)
@ -423,6 +423,7 @@ func NewCacherFromConfig(config Config) (*Cacher, error) {
cacher.reflector = reflector
go cacher.dispatchEvents()
go progressRequester.Run(stopCh)
cacher.stopWg.Add(1)
go func() {
@ -592,6 +593,18 @@ func (c *Cacher) Watch(ctx context.Context, key string, opts storage.ListOptions
identifier,
)
// note that c.waitUntilWatchCacheFreshAndForceAllEvents must be called without
// the c.watchCache.RLock held otherwise we are at risk of a deadlock
// mainly because c.watchCache.processEvent method won't be able to make progress
//
// moreover even though the c.waitUntilWatchCacheFreshAndForceAllEvents acquires a lock
// it is safe to release the lock after the method finishes because we don't require
// any atomicity between the call to the method and further calls that actually get the events.
forceAllEvents, err := c.waitUntilWatchCacheFreshAndForceAllEvents(ctx, requestedWatchRV, opts)
if err != nil {
return newErrWatcher(err), nil
}
// We explicitly use thread unsafe version and do locking ourself to ensure that
// no new events will be processed in the meantime. The watchCache will be unlocked
// on return from this function.
@ -599,10 +612,7 @@ func (c *Cacher) Watch(ctx context.Context, key string, opts storage.ListOptions
// underlying watchCache is calling processEvent under its lock.
c.watchCache.RLock()
defer c.watchCache.RUnlock()
forceAllEvents, err := c.waitUntilWatchCacheFreshAndForceAllEvents(ctx, requestedWatchRV, opts)
if err != nil {
return newErrWatcher(err), nil
}
startWatchRV := startWatchResourceVersionFn()
var cacheInterval *watchCacheInterval
if forceAllEvents {
@ -638,7 +648,7 @@ func (c *Cacher) Watch(ctx context.Context, key string, opts storage.ListOptions
// Add it to the queue only when the client support watch bookmarks.
if watcher.allowWatchBookmarks {
c.bookmarkWatchers.addWatcher(watcher)
c.bookmarkWatchers.addWatcherThreadUnsafe(watcher)
}
c.watcherIdx++
}()
@ -716,17 +726,18 @@ func shouldDelegateList(opts storage.ListOptions) bool {
pred := opts.Predicate
match := opts.ResourceVersionMatch
pagingEnabled := utilfeature.DefaultFeatureGate.Enabled(features.APIListChunking)
consistentListFromCacheEnabled := utilfeature.DefaultFeatureGate.Enabled(features.ConsistentListFromCache)
// Serve consistent reads from storage if ConsistentListFromCache is disabled
consistentReadFromStorage := resourceVersion == "" && !consistentListFromCacheEnabled
// Watch cache doesn't support continuations, so serve them from etcd.
hasContinuation := pagingEnabled && len(pred.Continue) > 0
// Serve paginated requests about revision "0" from watch cache to avoid overwhelming etcd.
hasLimit := pagingEnabled && pred.Limit > 0 && resourceVersion != "0"
// Watch cache only supports ResourceVersionMatchNotOlderThan (default).
unsupportedMatch := match != "" && match != metav1.ResourceVersionMatchNotOlderThan
// If resourceVersion is not specified, serve it from underlying
// storage (for backward compatibility). If a continuation is
// requested, serve it from the underlying storage as well.
// Limits are only sent to storage when resourceVersion is non-zero
// since the watch cache isn't able to perform continuations, and
// limits are ignored when resource version is zero
return resourceVersion == "" || hasContinuation || hasLimit || unsupportedMatch
return consistentReadFromStorage || hasContinuation || hasLimit || unsupportedMatch
}
func (c *Cacher) listItems(ctx context.Context, listRV uint64, key string, pred storage.SelectionPredicate, recursive bool) ([]interface{}, uint64, string, error) {
@ -752,19 +763,21 @@ func (c *Cacher) GetList(ctx context.Context, key string, opts storage.ListOptio
return c.storage.GetList(ctx, key, opts, listObj)
}
// If resourceVersion is specified, serve it from cache.
// It's guaranteed that the returned value is at least that
// fresh as the given resourceVersion.
listRV, err := c.versioner.ParseResourceVersion(resourceVersion)
if err != nil {
return err
}
if listRV == 0 && !c.ready.check() {
// If Cacher is not yet initialized and we don't require any specific
// minimal resource version, simply forward the request to storage.
return c.storage.GetList(ctx, key, opts, listObj)
}
if listRV == 0 && utilfeature.DefaultFeatureGate.Enabled(features.ConsistentListFromCache) {
listRV, err = c.getCurrentResourceVersionFromStorage(ctx)
if err != nil {
return err
}
}
ctx, span := tracing.Start(ctx, "cacher list",
attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
@ -795,24 +808,30 @@ func (c *Cacher) GetList(ctx context.Context, key string, opts storage.ListOptio
return err
}
span.AddEvent("Listed items from cache", attribute.Int("count", len(objs)))
if len(objs) > listVal.Cap() && pred.Label.Empty() && pred.Field.Empty() {
// Resize the slice appropriately, since we already know that none
// of the elements will be filtered out.
listVal.Set(reflect.MakeSlice(reflect.SliceOf(c.objectType.Elem()), 0, len(objs)))
span.AddEvent("Resized result")
}
// store pointer of eligible objects,
// Why not directly put object in the items of listObj?
// the elements in ListObject are Struct type, making slice will bring excessive memory consumption.
// so we try to delay this action as much as possible
var selectedObjects []runtime.Object
for _, obj := range objs {
elem, ok := obj.(*storeElement)
if !ok {
return fmt.Errorf("non *storeElement returned from storage: %v", obj)
}
if filter(elem.Key, elem.Labels, elem.Fields) {
listVal.Set(reflect.Append(listVal, reflect.ValueOf(elem.Object).Elem()))
selectedObjects = append(selectedObjects, elem.Object)
}
}
if listVal.IsNil() {
if len(selectedObjects) == 0 {
// Ensure that we never return a nil Items pointer in the result for consistency.
listVal.Set(reflect.MakeSlice(listVal.Type(), 0, 0))
} else {
// Resize the slice appropriately, since we already know that size of result set
listVal.Set(reflect.MakeSlice(listVal.Type(), len(selectedObjects), len(selectedObjects)))
span.AddEvent("Resized result")
for i, o := range selectedObjects {
listVal.Index(i).Set(reflect.ValueOf(o).Elem())
}
}
span.AddEvent("Filtered items", attribute.Int("count", listVal.Len()))
if c.versioner != nil {
@ -911,9 +930,25 @@ func (c *Cacher) dispatchEvents() {
bookmarkTimer.Reset(wait.Jitter(time.Second, 0.25))
// Never send a bookmark event if we did not see an event here, this is fine
// because we don't provide any guarantees on sending bookmarks.
//
// Just pop closed watchers and requeue others if needed.
//
// TODO(#115478): rework the following logic
// in a way that would allow more
// efficient cleanup of closed watchers
if lastProcessedResourceVersion == 0 {
// pop expired watchers in case there has been no update
c.bookmarkWatchers.popExpiredWatchers()
func() {
c.Lock()
defer c.Unlock()
for _, watchers := range c.bookmarkWatchers.popExpiredWatchersThreadUnsafe() {
for _, watcher := range watchers {
if watcher.stopped {
continue
}
c.bookmarkWatchers.addWatcherThreadUnsafe(watcher)
}
}
}()
continue
}
bookmarkEvent := &watchCacheEvent{
@ -1035,7 +1070,7 @@ func (c *Cacher) dispatchEvent(event *watchCacheEvent) {
func (c *Cacher) startDispatchingBookmarkEventsLocked() {
// Pop already expired watchers. However, explicitly ignore stopped ones,
// as we don't delete watcher from bookmarkWatchers when it is stopped.
for _, watchers := range c.bookmarkWatchers.popExpiredWatchers() {
for _, watchers := range c.bookmarkWatchers.popExpiredWatchersThreadUnsafe() {
for _, watcher := range watchers {
// c.Lock() is held here.
// watcher.stopThreadUnsafe() is protected by c.Lock()
@ -1140,7 +1175,7 @@ func (c *Cacher) finishDispatching() {
continue
}
// requeue the watcher for the next bookmark if needed.
c.bookmarkWatchers.addWatcher(watcher)
c.bookmarkWatchers.addWatcherThreadUnsafe(watcher)
}
c.expiredBookmarkWatchers = c.expiredBookmarkWatchers[:0]
}
@ -1309,54 +1344,6 @@ func (c *Cacher) waitUntilWatchCacheFreshAndForceAllEvents(ctx context.Context,
return false, nil
}
// cacherListerWatcher opaques storage.Interface to expose cache.ListerWatcher.
type cacherListerWatcher struct {
storage storage.Interface
resourcePrefix string
newListFunc func() runtime.Object
}
// NewCacherListerWatcher returns a storage.Interface backed ListerWatcher.
func NewCacherListerWatcher(storage storage.Interface, resourcePrefix string, newListFunc func() runtime.Object) cache.ListerWatcher {
return &cacherListerWatcher{
storage: storage,
resourcePrefix: resourcePrefix,
newListFunc: newListFunc,
}
}
// Implements cache.ListerWatcher interface.
func (lw *cacherListerWatcher) List(options metav1.ListOptions) (runtime.Object, error) {
list := lw.newListFunc()
pred := storage.SelectionPredicate{
Label: labels.Everything(),
Field: fields.Everything(),
Limit: options.Limit,
Continue: options.Continue,
}
storageOpts := storage.ListOptions{
ResourceVersionMatch: options.ResourceVersionMatch,
Predicate: pred,
Recursive: true,
}
if err := lw.storage.GetList(context.TODO(), lw.resourcePrefix, storageOpts, list); err != nil {
return nil, err
}
return list, nil
}
// Implements cache.ListerWatcher interface.
func (lw *cacherListerWatcher) Watch(options metav1.ListOptions) (watch.Interface, error) {
opts := storage.ListOptions{
ResourceVersion: options.ResourceVersion,
Predicate: storage.Everything,
Recursive: true,
ProgressNotify: true,
}
return lw.storage.Watch(context.TODO(), lw.resourcePrefix, opts)
}
// errWatcher implements watch.Interface to return a single error
type errWatcher struct {
result chan watch.Event

View File

@ -148,6 +148,10 @@ func (o *cachingObject) CacheEncode(id runtime.Identifier, encode func(runtime.O
if result.err != nil {
return result.err
}
if b, support := w.(runtime.Splice); support {
b.Splice(result.raw)
return nil
}
_, err := w.Write(result.raw)
return err
}

View File

@ -0,0 +1,77 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"context"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/storage"
"k8s.io/client-go/tools/cache"
)
// listerWatcher wraps a storage.Interface so that it can be used as a
// cache.ListerWatcher.
type listerWatcher struct {
// storage is the backend that List and Watch calls are forwarded to.
storage storage.Interface
// resourcePrefix is the key handed to storage.GetList and storage.Watch.
resourcePrefix string
// newListFunc produces the empty list object that List asks storage to fill.
newListFunc func() runtime.Object
}
// NewListerWatcher builds a cache.ListerWatcher on top of the supplied
// storage.Interface. Every List and Watch issued through the result targets
// resourcePrefix, and newListFunc supplies the list object that List fills.
func NewListerWatcher(storage storage.Interface, resourcePrefix string, newListFunc func() runtime.Object) cache.ListerWatcher {
	lw := new(listerWatcher)
	lw.storage = storage
	lw.resourcePrefix = resourcePrefix
	lw.newListFunc = newListFunc
	return lw
}
// List implements cache.ListerWatcher. It obtains an empty list object from
// newListFunc and has the storage fill it with a recursive GetList over
// resourcePrefix. The predicate matches every label and field; only the
// caller's Limit, Continue and ResourceVersionMatch are forwarded.
func (lw *listerWatcher) List(options metav1.ListOptions) (runtime.Object, error) {
	result := lw.newListFunc()

	storageOpts := storage.ListOptions{
		ResourceVersionMatch: options.ResourceVersionMatch,
		Predicate: storage.SelectionPredicate{
			Label:    labels.Everything(),
			Field:    fields.Everything(),
			Limit:    options.Limit,
			Continue: options.Continue,
		},
		Recursive: true,
	}

	err := lw.storage.GetList(context.TODO(), lw.resourcePrefix, storageOpts, result)
	if err != nil {
		return nil, err
	}
	return result, nil
}
// Watch implements cache.ListerWatcher. It opens a recursive storage watch on
// resourcePrefix starting at the caller's ResourceVersion, selecting
// everything and asking for progress notifications.
func (lw *listerWatcher) Watch(options metav1.ListOptions) (watch.Interface, error) {
	var opts storage.ListOptions
	opts.ResourceVersion = options.ResourceVersion
	opts.Predicate = storage.Everything
	opts.Recursive = true
	opts.ProgressNotify = true

	return lw.storage.Watch(context.TODO(), lw.resourcePrefix, opts)
}

View File

@ -30,8 +30,10 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/features"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/cacher/metrics"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/cache"
"k8s.io/component-base/tracing"
"k8s.io/klog/v2"
@ -196,6 +198,10 @@ type watchCache struct {
// For testing cache interval invalidation.
indexValidator indexValidator
// Requests progress notification if there are requests waiting for watch
// to be fresh
waitingUntilFresh *conditionalProgressRequester
}
func newWatchCache(
@ -204,8 +210,9 @@ func newWatchCache(
getAttrsFunc func(runtime.Object) (labels.Set, fields.Set, error),
versioner storage.Versioner,
indexers *cache.Indexers,
clock clock.Clock,
groupResource schema.GroupResource) *watchCache {
clock clock.WithTicker,
groupResource schema.GroupResource,
progressRequester *conditionalProgressRequester) *watchCache {
wc := &watchCache{
capacity: defaultLowerBoundCapacity,
keyFunc: keyFunc,
@ -222,6 +229,7 @@ func newWatchCache(
clock: clock,
versioner: versioner,
groupResource: groupResource,
waitingUntilFresh: progressRequester,
}
metrics.WatchCacheCapacity.WithLabelValues(groupResource.String()).Set(float64(wc.capacity))
wc.cond = sync.NewCond(wc.RLocker())
@ -305,7 +313,7 @@ func (w *watchCache) processEvent(event watch.Event, resourceVersion uint64, upd
if err := func() error {
// TODO: We should consider moving this lock below after the watchCacheEvent
// is created. In such situation, the only problematic scenario is Replace(
// is created. In such situation, the only problematic scenario is Replace()
// happening after getting object from store and before acquiring a lock.
// Maybe introduce another lock for this purpose.
w.Lock()
@ -406,6 +414,7 @@ func (w *watchCache) UpdateResourceVersion(resourceVersion string) {
w.Lock()
defer w.Unlock()
w.resourceVersion = rv
w.cond.Broadcast()
}()
// Avoid calling event handler under lock.
@ -484,7 +493,14 @@ func (s sortableStoreElements) Swap(i, j int) {
// WaitUntilFreshAndList returns list of pointers to `storeElement` objects along
// with their ResourceVersion and the name of the index, if any, that was used.
func (w *watchCache) WaitUntilFreshAndList(ctx context.Context, resourceVersion uint64, matchValues []storage.MatchValue) ([]interface{}, uint64, string, error) {
err := w.waitUntilFreshAndBlock(ctx, resourceVersion)
var err error
if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentListFromCache) && w.notFresh(resourceVersion) {
w.waitingUntilFresh.Add()
err = w.waitUntilFreshAndBlock(ctx, resourceVersion)
w.waitingUntilFresh.Remove()
} else {
err = w.waitUntilFreshAndBlock(ctx, resourceVersion)
}
defer w.RUnlock()
if err != nil {
return nil, 0, "", err
@ -507,6 +523,12 @@ func (w *watchCache) WaitUntilFreshAndList(ctx context.Context, resourceVersion
return result, rv, index, err
}
// notFresh reports whether resourceVersion is greater than the resource
// version currently recorded by the watch cache. The comparison is done
// while holding the read lock, which is released before returning.
func (w *watchCache) notFresh(resourceVersion uint64) bool {
w.RLock()
defer w.RUnlock()
return resourceVersion > w.resourceVersion
}
// WaitUntilFreshAndGet returns a pointers to <storeElement> object.
func (w *watchCache) WaitUntilFreshAndGet(ctx context.Context, resourceVersion uint64, key string) (interface{}, bool, uint64, error) {
err := w.waitUntilFreshAndBlock(ctx, resourceVersion)
@ -608,8 +630,8 @@ func (w *watchCache) Resync() error {
}
func (w *watchCache) currentCapacity() int {
w.Lock()
defer w.Unlock()
w.RLock()
defer w.RUnlock()
return w.capacity
}

View File

@ -0,0 +1,121 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"context"
"sync"
"time"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
)
const (
// progressRequestPeriod determines period of requesting progress
// from etcd when there is a request waiting for watch cache to be fresh.
progressRequestPeriod = 100 * time.Millisecond
)
// newConditionalProgressRequester creates a requester that calls
// requestWatchProgress using tickers obtained from clock. The condition
// variable is bound to the read side of mux, so code that waits on it must
// hold mux.RLock.
func newConditionalProgressRequester(requestWatchProgress WatchProgressRequester, clock TickerFactory) *conditionalProgressRequester {
	pr := new(conditionalProgressRequester)
	pr.clock = clock
	pr.requestWatchProgress = requestWatchProgress
	pr.cond = sync.NewCond(pr.mux.RLocker())
	return pr
}
// WatchProgressRequester is the callback that conditionalProgressRequester
// invokes to request a watch progress notification.
type WatchProgressRequester func(ctx context.Context) error
// TickerFactory creates tickers with the given period; it is the only part of
// a clock that conditionalProgressRequester needs.
type TickerFactory interface {
NewTicker(time.Duration) clock.Ticker
}
// conditionalProgressRequester will request progress notification if there
// is a request waiting for watch cache to be fresh.
type conditionalProgressRequester struct {
// clock creates the ticker that paces progress requests in Run.
clock TickerFactory
// requestWatchProgress is called from Run on a tick while requests are waiting.
requestWatchProgress WatchProgressRequester
// mux guards waiting and stopped.
mux sync.RWMutex
// cond is signalled by Add, Remove and on stop; Run waits on it while idle.
cond *sync.Cond
// waiting is incremented by Add and decremented by Remove.
waiting int
// stopped is set once the stop channel passed to Run has fired.
stopped bool
}
// Run executes the progress-request loop and returns once stopCh fires.
// While at least one request is registered via Add, it calls
// requestWatchProgress on each tick of a progressRequestPeriod ticker;
// while nothing is waiting it blocks on the condition variable.
func (pr *conditionalProgressRequester) Run(stopCh <-chan struct{}) {
// ctx is derived from stopCh and passed to every progress request.
ctx := wait.ContextForChannel(stopCh)
// Helper goroutine: once stopCh fires, mark the requester as stopped and
// wake the loop below in case it is blocked in cond.Wait().
go func() {
defer utilruntime.HandleCrash()
<-stopCh
pr.mux.Lock()
defer pr.mux.Unlock()
pr.stopped = true
pr.cond.Signal()
}()
ticker := pr.clock.NewTicker(progressRequestPeriod)
defer ticker.Stop()
for {
// Block until there is at least one waiting request or the requester
// has been stopped.
stopped := func() bool {
pr.mux.RLock()
defer pr.mux.RUnlock()
for pr.waiting == 0 && !pr.stopped {
pr.cond.Wait()
}
return pr.stopped
}()
if stopped {
return
}
select {
case <-ticker.C():
// Re-check under the lock: the state may have changed between the
// wait above and this tick.
shouldRequest := func() bool {
pr.mux.RLock()
defer pr.mux.RUnlock()
return pr.waiting > 0 && !pr.stopped
}()
if !shouldRequest {
continue
}
// A failed request is only logged; the loop simply goes around again.
err := pr.requestWatchProgress(ctx)
if err != nil {
klog.V(4).InfoS("Error requesting bookmark", "err", err)
}
case <-stopCh:
return
}
}
}
// Add registers one more waiting request and signals the condition variable
// so that a Run loop blocked on it re-evaluates its state.
func (pr *conditionalProgressRequester) Add() {
	pr.mux.Lock()
	defer pr.mux.Unlock()

	pr.waiting++
	pr.cond.Signal()
}
// Remove unregisters one waiting request and signals the condition variable
// so that a Run loop blocked on it re-evaluates its state.
func (pr *conditionalProgressRequester) Remove() {
	pr.mux.Lock()
	defer pr.mux.Unlock()

	pr.waiting--
	pr.cond.Signal()
}

View File

@ -17,11 +17,14 @@ limitations under the License.
package metrics
import (
"context"
"fmt"
"sync"
"time"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
)
/*
@ -47,6 +50,22 @@ var (
},
[]string{"operation", "type"},
)
etcdRequestCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "etcd_requests_total",
Help: "Etcd request counts for each operation and object type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
etcdRequestErrorCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "etcd_request_errors_total",
Help: "Etcd failed request counts for each operation and object type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
objectCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "apiserver_storage_objects",
@ -57,13 +76,16 @@ var (
)
dbTotalSize = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Subsystem: "apiserver",
Name: "storage_db_total_size_in_bytes",
Help: "Total size of the storage database file physically allocated in bytes.",
StabilityLevel: compbasemetrics.ALPHA,
Subsystem: "apiserver",
Name: "storage_db_total_size_in_bytes",
Help: "Total size of the storage database file physically allocated in bytes.",
StabilityLevel: compbasemetrics.ALPHA,
DeprecatedVersion: "1.28.0",
},
[]string{"endpoint"},
)
storageSizeDescription = compbasemetrics.NewDesc("apiserver_storage_size_bytes", "Size of the storage database file physically allocated in bytes.", []string{"cluster"}, nil, compbasemetrics.ALPHA, "")
storageMonitor = &monitorCollector{monitorGetter: func() ([]Monitor, error) { return nil, nil }}
etcdEventsReceivedCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Subsystem: "apiserver",
@ -140,8 +162,11 @@ func Register() {
// Register the metrics.
registerMetrics.Do(func() {
legacyregistry.MustRegister(etcdRequestLatency)
legacyregistry.MustRegister(etcdRequestCounts)
legacyregistry.MustRegister(etcdRequestErrorCounts)
legacyregistry.MustRegister(objectCounts)
legacyregistry.MustRegister(dbTotalSize)
legacyregistry.CustomMustRegister(storageMonitor)
legacyregistry.MustRegister(etcdBookmarkCounts)
legacyregistry.MustRegister(etcdLeaseObjectCounts)
legacyregistry.MustRegister(listStorageCount)
@ -157,9 +182,15 @@ func UpdateObjectCount(resourcePrefix string, count int64) {
objectCounts.WithLabelValues(resourcePrefix).Set(float64(count))
}
// RecordEtcdRequestLatency sets the etcd_request_duration_seconds metrics.
func RecordEtcdRequestLatency(verb, resource string, startTime time.Time) {
etcdRequestLatency.WithLabelValues(verb, resource).Observe(sinceInSeconds(startTime))
// RecordEtcdRequest updates and sets the etcd_request_duration_seconds,
// etcd_requests_total, etcd_request_errors_total metrics.
func RecordEtcdRequest(verb, resource string, err error, startTime time.Time) {
v := []string{verb, resource}
etcdRequestLatency.WithLabelValues(v...).Observe(sinceInSeconds(startTime))
etcdRequestCounts.WithLabelValues(v...).Inc()
if err != nil {
etcdRequestErrorCounts.WithLabelValues(v...).Inc()
}
}
// RecordEtcdEvent updated the etcd_events_received_total metric.
@ -183,15 +214,23 @@ func Reset() {
}
// sinceInSeconds gets the time since the specified start in seconds.
func sinceInSeconds(start time.Time) float64 {
//
// This is a variable to facilitate testing.
var sinceInSeconds = func(start time.Time) float64 {
return time.Since(start).Seconds()
}
// UpdateEtcdDbSize sets the etcd_db_total_size_in_bytes metric.
// Deprecated: Metric etcd_db_total_size_in_bytes will be replaced with apiserver_storage_size_bytes
func UpdateEtcdDbSize(ep string, size int64) {
dbTotalSize.WithLabelValues(ep).Set(float64(size))
}
// SetStorageMonitorGetter sets monitor getter to allow monitoring etcd stats.
// The getter replaces the one held by the package-level storageMonitor
// collector and is invoked on every collection.
// NOTE(review): the assignment is not synchronized with the collector's reads;
// presumably this is only called during setup — confirm against callers.
func SetStorageMonitorGetter(getter func() ([]Monitor, error)) {
storageMonitor.monitorGetter = getter
}
// UpdateLeaseObjectCount sets the etcd_lease_object_counts metric.
func UpdateLeaseObjectCount(count int64) {
// Currently we only store one previous lease, since all the events have the same ttl.
@ -206,3 +245,51 @@ func RecordStorageListMetrics(resource string, numFetched, numEvald, numReturned
listStorageNumSelectorEvals.WithLabelValues(resource).Add(float64(numEvald))
listStorageNumReturned.WithLabelValues(resource).Add(float64(numReturned))
}
// Monitor is a source of storage metrics that the storage collector queries
// on each collection.
type Monitor interface {
// Monitor returns the current StorageMetrics, or an error if they could
// not be obtained.
Monitor(ctx context.Context) (StorageMetrics, error)
// Close is called by the collector after each Monitor call.
Close() error
}
// StorageMetrics holds the values reported by a Monitor.
type StorageMetrics struct {
// Size is exported as the value of the apiserver_storage_size_bytes gauge.
Size int64
}
// monitorCollector is a custom collector that reports the storage size
// obtained from the monitors returned by monitorGetter.
type monitorCollector struct {
compbasemetrics.BaseStableCollector
// monitorGetter returns the monitors to query on each collection.
monitorGetter func() ([]Monitor, error)
}
// DescribeWithStability implements compbasemetrics.StableCollector by sending
// the storage size descriptor, the only metric this collector emits.
func (c *monitorCollector) DescribeWithStability(ch chan<- *compbasemetrics.Desc) {
ch <- storageSizeDescription
}
// CollectWithStability implements compbasemetrics.StableCollector.
//
// It asks monitorGetter for the current monitors and, for each one, queries
// its storage metrics with a one second timeout, closes the monitor, and
// emits the reported size as an apiserver_storage_size_bytes gauge labelled
// "etcd-<index>". A monitor that fails to report, or whose metric cannot be
// constructed, is logged and skipped so that nothing invalid is sent on ch.
func (c *monitorCollector) CollectWithStability(ch chan<- compbasemetrics.Metric) {
	monitors, err := c.monitorGetter()
	if err != nil {
		// Without monitors there is nothing to report for this collection.
		return
	}

	for i, m := range monitors {
		cluster := fmt.Sprintf("etcd-%d", i)

		klog.V(4).InfoS("Start collecting storage metrics", "cluster", cluster)
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		metrics, err := m.Monitor(ctx)
		// Release the timeout context and the monitor before looking at the
		// result, so both are cleaned up on the error path as well.
		cancel()
		m.Close()
		if err != nil {
			klog.InfoS("Failed to get storage metrics", "cluster", cluster, "err", err)
			continue
		}

		metric, err := compbasemetrics.NewConstMetric(storageSizeDescription, compbasemetrics.GaugeValue, float64(metrics.Size), cluster)
		if err != nil {
			klog.ErrorS(err, "Failed to create metric", "cluster", cluster)
			// No usable metric was built for this cluster; do not push it.
			continue
		}
		ch <- metric
	}
}

View File

@ -85,6 +85,12 @@ type store struct {
leaseManager *leaseManager
}
// RequestWatchProgress asks the etcd client to send a progress notification,
// using a context prepared by watchContext.
func (s *store) RequestWatchProgress(ctx context.Context) error {
// Use watchContext to match ctx metadata provided when creating the watch.
// In the best case scenario we would use the same context that the watch was
// created with, but there is no way to access it from watchCache.
return s.client.RequestProgress(s.watchContext(ctx))
}
type objState struct {
obj runtime.Object
meta *storage.ResponseMeta
@ -136,7 +142,7 @@ func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, ou
}
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return err
}
@ -210,7 +216,7 @@ func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object,
).Then(
clientv3.OpPut(preparedKey, string(newData), opts...),
).Commit()
metrics.RecordEtcdRequestLatency("create", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime)
if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err
@ -255,7 +261,7 @@ func (s *store) conditionalDelete(
getCurrentState := func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return nil, err
}
@ -337,7 +343,7 @@ func (s *store) conditionalDelete(
).Else(
clientv3.OpGet(key),
).Commit()
metrics.RecordEtcdRequestLatency("delete", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime)
if err != nil {
return err
}
@ -391,7 +397,7 @@ func (s *store) GuaranteedUpdate(
getCurrentState := func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return nil, err
}
@ -512,7 +518,7 @@ func (s *store) GuaranteedUpdate(
).Else(
clientv3.OpGet(preparedKey),
).Commit()
metrics.RecordEtcdRequestLatency("update", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime)
if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err
@ -575,7 +581,7 @@ func (s *store) Count(key string) (int64, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly())
metrics.RecordEtcdRequestLatency("listWithCount", preparedKey, startTime)
metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime)
if err != nil {
return 0, err
}
@ -720,14 +726,16 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
numReturn := v.Len()
metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn)
}()
metricsOp := "get"
if recursive {
metricsOp = "list"
}
for {
startTime := time.Now()
getResp, err = s.client.KV.Get(ctx, preparedKey, options...)
if recursive {
metrics.RecordEtcdRequestLatency("list", s.groupResourceString, startTime)
} else {
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
}
metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime)
if err != nil {
return interpretListError(err, len(pred.Continue) > 0, continueKey, keyPrefix)
}
@ -863,8 +871,12 @@ func growSlice(v reflect.Value, maxCapacity int, sizes ...int) {
}
// Watch implements storage.Interface.Watch.
// TODO(#115478): In order to graduate the WatchList feature to beta, the etcd3 implementation must/should also support it.
func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
if opts.SendInitialEvents != nil {
// it is safe to skip SendInitialEvents if the request is backward compatible
// see https://github.com/kubernetes/kubernetes/blob/267eb25e60955fe8e438c6311412e7cf7d028acb/staging/src/k8s.io/apiserver/pkg/storage/etcd3/watcher.go#L260
compatibility := opts.Predicate.AllowWatchBookmarks == false && (opts.ResourceVersion == "" || opts.ResourceVersion == "0")
if opts.SendInitialEvents != nil && !compatibility {
return nil, apierrors.NewInvalid(
schema.GroupKind{Group: s.groupResource.Group, Kind: s.groupResource.Resource},
"",
@ -879,7 +891,18 @@ func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions)
if err != nil {
return nil, err
}
return s.watcher.Watch(ctx, preparedKey, int64(rev), opts.Recursive, opts.ProgressNotify, s.transformer, opts.Predicate)
return s.watcher.Watch(s.watchContext(ctx), preparedKey, int64(rev), opts.Recursive, opts.ProgressNotify, s.transformer, opts.Predicate)
}
// watchContext derives the context handed to the watcher: the caller's ctx
// wrapped by clientv3.WithRequireLeader.
func (s *store) watchContext(ctx context.Context) context.Context {
	// Why require a leader: the etcd server cancels existing streams only
	// after it has been unable to find a leader for 3 election timeouts
	// (3 is currently a hard coded constant; the election timeout defaults
	// to 1000ms). In a healthy cluster a stopping leader transfers its
	// leadership first, so the hand-over should be smooth. When the leader
	// is hard killed, the other servers need an election timeout to notice
	// the loss and start campaigning.
	leaderCtx := clientv3.WithRequireLeader(ctx)
	return leaderCtx
}
func (s *store) getState(ctx context.Context, getResp *clientv3.GetResponse, key string, v reflect.Value, ignoreNotFound bool) (*objState, error) {

View File

@ -144,15 +144,7 @@ func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, re
// The filter doesn't filter out any object.
wc.internalPred = storage.Everything
}
// The etcd server waits until it cannot find a leader for 3 election
// timeouts to cancel existing streams. 3 is currently a hard coded
// constant. The election timeout defaults to 1000ms. If the cluster is
// healthy, when the leader is stopped, the leadership transfer should be
// smooth. (leader transfers its leadership before stopping). If leader is
// hard killed, other servers will take an election timeout to realize
// leader lost and start campaign.
wc.ctx, wc.cancel = context.WithCancel(clientv3.WithRequireLeader(ctx))
wc.ctx, wc.cancel = context.WithCancel(ctx)
return wc
}
@ -223,6 +215,10 @@ func (wc *watchChan) ResultChan() <-chan watch.Event {
return wc.resultChan
}
// RequestWatchProgress forwards a progress request to the etcd client owned by
// this watch channel's watcher, using the channel's own context, and returns
// whatever error the client reports.
func (wc *watchChan) RequestWatchProgress() error {
	etcdClient := wc.watcher.client
	return etcdClient.RequestProgress(wc.ctx)
}
// sync tries to retrieve existing data and send them to process.
// The revision to watch will be set to the revision in response.
// All events sent will have isCreated=true

View File

@ -236,6 +236,21 @@ type Interface interface {
// Count returns number of different entries under the key (generally being path prefix).
Count(key string) (int64, error)
// RequestWatchProgress requests that a watch stream progress status be sent in the
// watch response stream as soon as possible.
// Used to monitor watch progress even if watching resources with no changes.
//
// If watch is lagging, progress status might:
// * be pointing to stale resource version. Use etcd KV request to get linearizable resource version.
// * not be delivered at all. It's recommended to poll request progress periodically.
//
// Note: Only watches with matching context grpc metadata will be notified.
// https://github.com/kubernetes/kubernetes/blob/9325a57125e8502941d1b0c7379c4bb80a678d5c/vendor/go.etcd.io/etcd/client/v3/watch.go#L1037-L1042
//
// TODO: Remove when storage.Interface will be separate from etcd3.store.
// Deprecated: Added temporarily to simplify exposing RequestProgress for watch cache.
RequestWatchProgress(ctx context.Context) error
}
// GetOptions provides the options that may be provided for storage get operations.

View File

@ -1,6 +1,5 @@
# See the OWNERS docs at https://go.k8s.io/owners
reviewers:
- lavalamp
- smarterclayton
- wojtek-t

View File

@ -20,6 +20,7 @@ import (
"context"
"fmt"
"log"
"math/rand"
"net"
"net/url"
"os"
@ -37,6 +38,7 @@ import (
"go.uber.org/zap/zapcore"
"golang.org/x/time/rate"
"google.golang.org/grpc"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/runtime"
utilnet "k8s.io/apimachinery/pkg/util/net"
@ -52,7 +54,6 @@ import (
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/metrics/legacyregistry"
tracing "k8s.io/component-base/tracing"
"k8s.io/klog/v2"
)
const (
@ -153,11 +154,11 @@ func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan
// retry in a loop in the background until we successfully create the client, storing the client or error encountered
lock := sync.RWMutex{}
var prober *etcd3Prober
var prober *etcd3ProberMonitor
clientErr := fmt.Errorf("etcd client connection not yet established")
go wait.PollUntil(time.Second, func() (bool, error) {
newProber, err := newETCD3Prober(c)
newProber, err := newETCD3ProberMonitor(c)
lock.Lock()
defer lock.Unlock()
// Ensure that server is already not shutting down.
@ -221,49 +222,66 @@ func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan
}, nil
}
func newETCD3Prober(c storagebackend.Config) (*etcd3Prober, error) {
func newETCD3ProberMonitor(c storagebackend.Config) (*etcd3ProberMonitor, error) {
client, err := newETCD3Client(c.Transport)
if err != nil {
return nil, err
}
return &etcd3Prober{
client: client,
prefix: c.Prefix,
return &etcd3ProberMonitor{
client: client,
prefix: c.Prefix,
endpoints: c.Transport.ServerList,
}, nil
}
type etcd3Prober struct {
prefix string
type etcd3ProberMonitor struct {
prefix string
endpoints []string
mux sync.RWMutex
client *clientv3.Client
closed bool
}
func (p *etcd3Prober) Close() error {
p.mux.Lock()
defer p.mux.Unlock()
if !p.closed {
p.closed = true
return p.client.Close()
func (t *etcd3ProberMonitor) Close() error {
t.mux.Lock()
defer t.mux.Unlock()
if !t.closed {
t.closed = true
return t.client.Close()
}
return fmt.Errorf("prober was closed")
return fmt.Errorf("closed")
}
func (p *etcd3Prober) Probe(ctx context.Context) error {
p.mux.RLock()
defer p.mux.RUnlock()
if p.closed {
return fmt.Errorf("prober was closed")
func (t *etcd3ProberMonitor) Probe(ctx context.Context) error {
t.mux.RLock()
defer t.mux.RUnlock()
if t.closed {
return fmt.Errorf("closed")
}
// See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118
_, err := p.client.Get(ctx, path.Join("/", p.prefix, "health"))
_, err := t.client.Get(ctx, path.Join("/", t.prefix, "health"))
if err != nil {
return fmt.Errorf("error getting data from etcd: %w", err)
}
return nil
}
// Monitor reports storage metrics read from a single etcd endpoint.
//
// The read lock is held for the whole call. An error is returned when the
// monitor has already been closed, when no endpoints are configured, or when
// the status request fails. On success only the Size field of the result is
// populated, from the DbSize reported by the queried endpoint.
func (t *etcd3ProberMonitor) Monitor(ctx context.Context) (metrics.StorageMetrics, error) {
	t.mux.RLock()
	defer t.mux.RUnlock()

	if t.closed {
		return metrics.StorageMetrics{}, fmt.Errorf("closed")
	}
	// Picking a random index from an empty list would panic (modulus by
	// zero), so report the misconfiguration as an error instead.
	if len(t.endpoints) == 0 {
		return metrics.StorageMetrics{}, fmt.Errorf("no etcd endpoints configured")
	}

	// Query one endpoint chosen at random on every call.
	endpoint := t.endpoints[rand.Intn(len(t.endpoints))]
	status, err := t.client.Status(ctx, endpoint)
	if err != nil {
		return metrics.StorageMetrics{}, err
	}
	return metrics.StorageMetrics{
		Size: status.DbSize,
	}, nil
}
var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, error) {
tlsInfo := transport.TLSInfo{
CertFile: c.CertFile,
@ -441,6 +459,7 @@ func newETCD3Storage(c storagebackend.ConfigForResource, newFunc func() runtime.
// startDBSizeMonitorPerEndpoint starts a loop to monitor etcd database size and update the
// corresponding metric etcd_db_total_size_in_bytes for each etcd server endpoint.
// Deprecated: Will be replaced with newETCD3ProberMonitor
func startDBSizeMonitorPerEndpoint(client *clientv3.Client, interval time.Duration) (func(), error) {
if interval == 0 {
return func() {}, nil

View File

@ -22,6 +22,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/storagebackend"
)
@ -68,7 +69,18 @@ func CreateProber(c storagebackend.Config) (Prober, error) {
case storagebackend.StorageTypeETCD2:
return nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3Prober(c)
return newETCD3ProberMonitor(c)
default:
return nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}
func CreateMonitor(c storagebackend.Config) (metrics.Monitor, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3ProberMonitor(c)
default:
return nil, fmt.Errorf("unknown storage type: %s", c.Type)
}

View File

@ -34,33 +34,11 @@ import (
"k8s.io/klog/v2"
)
type gcm struct {
aead cipher.AEAD
nonceFunc func([]byte) error
}
// commonSize is the length of various security sensitive byte slices such as encryption keys.
// Do not change this value. It would be a backward incompatible change.
const commonSize = 32
// NewGCMTransformer takes the given block cipher and performs encryption and decryption on the given data.
// It implements AEAD encryption of the provided values given a cipher.Block algorithm.
// The authenticated data provided as part of the value.Context method must match when the same
// value is set to and loaded from storage. In order to ensure that values cannot be copied by
// an attacker from a location under their control, use characteristics of the storage location
// (such as the etcd key) as part of the authenticated data.
//
// Because this mode requires a generated IV and IV reuse is a known weakness of AES-GCM, keys
// must be rotated before a birthday attack becomes feasible. NIST SP 800-38D
// (http://csrc.nist.gov/publications/nistpubs/800-38D/SP-800-38D.pdf) recommends using the same
// key with random 96-bit nonces (the default nonce length) no more than 2^32 times, and
// therefore transformers using this implementation *must* ensure they allow for frequent key
// rotation. Future work should include investigation of AES-GCM-SIV as an alternative to
// random nonces.
func NewGCMTransformer(block cipher.Block) (value.Transformer, error) {
aead, err := newGCM(block)
if err != nil {
return nil, err
}
return &gcm{aead: aead, nonceFunc: randomNonce}, nil
}
const keySizeCounterNonceGCM = commonSize
// NewGCMTransformerWithUniqueKeyUnsafe is the same as NewGCMTransformer but is unsafe for general
// use because it makes assumptions about the key underlying the block cipher. Specifically,
@ -78,7 +56,7 @@ func NewGCMTransformer(block cipher.Block) (value.Transformer, error) {
// it can be passed to NewGCMTransformer(aes.NewCipher(key)) to construct a transformer capable
// of decrypting values encrypted by this transformer (that transformer must not be used for encryption).
func NewGCMTransformerWithUniqueKeyUnsafe() (value.Transformer, []byte, error) {
key, err := generateKey(32)
key, err := GenerateKey(keySizeCounterNonceGCM)
if err != nil {
return nil, nil, err
}
@ -126,17 +104,6 @@ func newGCMTransformerWithUniqueKeyUnsafe(block cipher.Block, nonceGen *nonceGen
return &gcm{aead: aead, nonceFunc: nonceFunc}, nil
}
func newGCM(block cipher.Block) (cipher.AEAD, error) {
aead, err := cipher.NewGCM(block)
if err != nil {
return nil, err
}
if nonceSize := aead.NonceSize(); nonceSize != 12 { // all data in etcd will be broken if this ever changes
return nil, fmt.Errorf("crypto/cipher.NewGCM returned unexpected nonce size: %d", nonceSize)
}
return aead, nil
}
func randomNonce(b []byte) error {
_, err := rand.Read(b)
return err
@ -164,8 +131,8 @@ func die(msg string) {
klog.FatalDepth(1, msg)
}
// generateKey generates a random key using system randomness.
func generateKey(length int) (key []byte, err error) {
// GenerateKey generates a random key using system randomness.
func GenerateKey(length int) (key []byte, err error) {
defer func(start time.Time) {
value.RecordDataKeyGeneration(start, err)
}(time.Now())
@ -177,6 +144,45 @@ func generateKey(length int) (key []byte, err error) {
return key, nil
}
// NewGCMTransformer builds a transformer that encrypts and decrypts data with
// the given block cipher, implementing AEAD encryption of the provided values
// for a cipher.Block algorithm.
//
// The authenticated data supplied through value.Context must be the same when
// a value is written to storage and when it is loaded back. To keep an attacker
// from copying values out of a location they control, derive the authenticated
// data from characteristics of the storage location (such as the etcd key).
//
// This mode needs a generated IV, and IV reuse is a known weakness of AES-GCM,
// so keys must be rotated before a birthday attack becomes feasible. NIST SP 800-38D
// (http://csrc.nist.gov/publications/nistpubs/800-38D/SP-800-38D.pdf) recommends
// using one key with random 96-bit nonces (the default nonce length) no more
// than 2^32 times; transformers built on this implementation therefore *must*
// allow for frequent key rotation. Future work should include investigation of
// AES-GCM-SIV as an alternative to random nonces.
func NewGCMTransformer(block cipher.Block) (value.Transformer, error) {
	gcmAEAD, gcmErr := newGCM(block)
	if gcmErr != nil {
		return nil, gcmErr
	}
	transformer := &gcm{
		aead:      gcmAEAD,
		nonceFunc: randomNonce,
	}
	return transformer, nil
}
// newGCM wraps block in GCM mode via cipher.NewGCM and verifies that the
// resulting AEAD uses a 12 byte nonce. It returns the construction error
// unchanged, or a descriptive error when the nonce size is unexpected.
func newGCM(block cipher.Block) (cipher.AEAD, error) {
	const expectedNonceSize = 12 // all data in etcd will be broken if this ever changes

	aead, err := cipher.NewGCM(block)
	switch {
	case err != nil:
		return nil, err
	case aead.NonceSize() != expectedNonceSize:
		return nil, fmt.Errorf("crypto/cipher.NewGCM returned unexpected nonce size: %d", aead.NonceSize())
	}
	return aead, nil
}
// gcm pairs an AEAD cipher with the function used to obtain nonces for it.
// NewGCMTransformer builds one from newGCM(block) and randomNonce.
type gcm struct {
// aead is the AEAD instance; TransformFromStorage reads its NonceSize to
// validate the length of stored data.
aead cipher.AEAD
// nonceFunc receives a byte slice and reports an error. Presumably it fills
// the slice with the nonce for the next encryption — confirm against
// TransformToStorage.
nonceFunc func([]byte) error
}
func (t *gcm) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
nonceSize := t.aead.NonceSize()
if len(data) < nonceSize {

View File

@ -0,0 +1,186 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package aes
import (
"bytes"
"context"
"crypto/aes"
"crypto/sha256"
"errors"
"fmt"
"io"
"time"
"golang.org/x/crypto/hkdf"
"k8s.io/apiserver/pkg/storage/value"
"k8s.io/utils/clock"
)
const (
	// cacheTTL is the TTL of KDF cache entries. The value.Context.AuthenticatedData
	// of every call is assumed to be the etcd storage path of the associated resource,
	// and it serves as the primary cache key (a secondary check confirms that the info
	// matches). A client that keeps creating resources with new names (and thus new
	// paths) will therefore keep adding entries to the cache for up to this TTL before
	// the GC logic starts deleting old entries. Each entry is ~300 bytes in size, so
	// even a malicious client is bounded in the overall memory it can consume.
	cacheTTL = 10 * time.Minute

	// The three sizes below all equal commonSize; each is spelled out explicitly
	// rather than relying on implicit repetition of the previous expression.

	// derivedKeySizeExtendedNonceGCM is the length of the key read from the KDF.
	derivedKeySizeExtendedNonceGCM = commonSize
	// infoSizeExtendedNonceGCM is the length of the random info prefix.
	infoSizeExtendedNonceGCM = commonSize
	// MinSeedSizeExtendedNonceGCM is the shortest seed accepted by
	// NewHKDFExtendedNonceGCMTransformer.
	MinSeedSizeExtendedNonceGCM = commonSize
)
// NewHKDFExtendedNonceGCMTransformer is the same as NewGCMTransformer but trades storage,
// memory and CPU to work around the limitations of AES-GCM's 12 byte nonce size. The input
// seed is assumed to be a cryptographically strong slice of MinSeedSizeExtendedNonceGCM or
// more random bytes; shorter seeds are rejected with an error.
//
// Unlike NewGCMTransformer, this function is immune to the birthday attack because a new key
// is generated per encryption via a key derivation function: KDF(seed, random_bytes) -> key.
// The derived key is only used once as an AES-GCM key with a random 12 byte nonce. This avoids
// any concerns around cryptographic wear out (by either number of encryptions or the amount
// of data being encrypted).
//
// Speaking on the cryptographic safety, the limit on the number of operations that can be
// performed with a single seed with derived keys and randomly generated nonces is not
// practically reachable. Thus, the scheme does not impose any specific requirements on the
// seed rotation schedule. Reusing the same seed is safe to do over time and across process
// restarts. Whenever a new seed is needed, the caller should generate it via
// GenerateKey(MinSeedSizeExtendedNonceGCM).
//
// In regard to KMSv2, organization standards or compliance policies around rotation may
// require that the seed be rotated at some interval. This can be implemented externally by
// rotating the key encryption key via a key ID change.
func NewHKDFExtendedNonceGCMTransformer(seed []byte) (value.Transformer, error) {
	seedLen := len(seed)
	if seedLen < MinSeedSizeExtendedNonceGCM {
		return nil, fmt.Errorf("invalid seed length %d used for key generation", seedLen)
	}

	transformer := &extendedNonceGCM{
		seed:  seed,
		cache: newSimpleCache(clock.RealClock{}, cacheTTL),
	}
	return transformer, nil
}
// extendedNonceGCM is the transformer returned by NewHKDFExtendedNonceGCMTransformer.
type extendedNonceGCM struct {
// seed is the secret that sha256KDFExpandOnly expands, together with a
// per-value info, into a derived key.
seed []byte
// cache stores the derived-key transformers built by derivedKeyTransformer;
// it is consulted on reads only.
cache *simpleCache
}
// TransformFromStorage decrypts data read from storage. The leading
// infoSizeExtendedNonceGCM bytes are treated as the KDF info used to obtain the
// read transformer; the full data (prefix included) is then handed to that
// transformer. Data shorter than the info prefix is rejected.
func (e *extendedNonceGCM) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
	if len(data) < infoSizeExtendedNonceGCM {
		return nil, false, errors.New("the stored data was shorter than the required size")
	}

	infoPrefix := data[:infoSizeExtendedNonceGCM]
	reader, deriveErr := e.derivedKeyTransformer(infoPrefix, dataCtx, false)
	if deriveErr != nil {
		return nil, false, fmt.Errorf("failed to derive read key from KDF: %w", deriveErr)
	}

	return reader.TransformFromStorage(ctx, data, dataCtx)
}
// TransformToStorage encrypts data for storage. It generates a fresh random
// info of infoSizeExtendedNonceGCM bytes, obtains a write transformer for that
// info, and delegates the encryption to it.
func (e *extendedNonceGCM) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
	freshInfo := make([]byte, infoSizeExtendedNonceGCM)
	nonceErr := randomNonce(freshInfo)
	if nonceErr != nil {
		return nil, fmt.Errorf("failed to generate info for KDF: %w", nonceErr)
	}

	writer, deriveErr := e.derivedKeyTransformer(freshInfo, dataCtx, true)
	if deriveErr != nil {
		return nil, fmt.Errorf("failed to derive write key from KDF: %w", deriveErr)
	}

	return writer.TransformToStorage(ctx, data, dataCtx)
}
// derivedKeyTransformer returns the transformer whose key is derived from the
// seed and info. For reads (write == false) a cached transformer is reused when
// one with matching info exists for dataCtx. For writes a new transformer is
// always built. Every newly built transformer is stored in the cache.
func (e *extendedNonceGCM) derivedKeyTransformer(info []byte, dataCtx value.Context, write bool) (value.Transformer, error) {
	// Writes skip the cache lookup because they always generate a new transformer.
	if !write {
		if cached := e.cache.get(info, dataCtx); cached != nil {
			return cached, nil
		}

		// On read, info is a subslice of a much larger slice; clone it so the
		// cached transformer does not keep that larger slice alive.
		info = bytes.Clone(info)
	}

	derivedKey, kdfErr := e.sha256KDFExpandOnly(info)
	if kdfErr != nil {
		return nil, fmt.Errorf("failed to KDF expand seed with info: %w", kdfErr)
	}

	built, buildErr := newGCMTransformerWithInfo(derivedKey, info)
	if buildErr != nil {
		return nil, fmt.Errorf("failed to build transformer with KDF derived key: %w", buildErr)
	}

	e.cache.set(dataCtx, built)
	return built, nil
}
// sha256KDFExpandOnly runs hkdf.Expand with SHA-256 over the seed and info and
// reads derivedKeySizeExtendedNonceGCM bytes of output as the derived key.
func (e *extendedNonceGCM) sha256KDFExpandOnly(info []byte) ([]byte, error) {
	out := make([]byte, derivedKeySizeExtendedNonceGCM)
	expander := hkdf.Expand(sha256.New, e.seed, info)

	_, readErr := io.ReadFull(expander, out)
	if readErr != nil {
		return nil, fmt.Errorf("failed to read a derived key from KDF: %w", readErr)
	}
	return out, nil
}
// newGCMTransformerWithInfo builds an AES cipher from key, wraps it with
// NewGCMTransformer, and pairs the result with info. Any construction error is
// returned as is.
func newGCMTransformerWithInfo(key, info []byte) (*transformerWithInfo, error) {
	aesBlock, cipherErr := aes.NewCipher(key)
	if cipherErr != nil {
		return nil, cipherErr
	}

	inner, gcmErr := NewGCMTransformer(aesBlock)
	if gcmErr != nil {
		return nil, gcmErr
	}

	wrapped := &transformerWithInfo{
		transformer: inner,
		info:        info,
	}
	return wrapped, nil
}
// transformerWithInfo couples a transformer with the info bytes that its key
// was derived from. It is what derivedKeyTransformer builds and caches.
type transformerWithInfo struct {
// transformer performs the actual encryption and decryption of the payload.
transformer value.Transformer
// info holds extra opaque bytes that are prepended to everything the
// transformer writes and stripped from everything it reads.
// Currently info is used to generate a key via KDF(seed, info) -> key,
// and transformer is the output of NewGCMTransformer(aes.NewCipher(key)).
info []byte
}
// TransformFromStorage requires data to start with t.info, strips that prefix,
// and lets the wrapped transformer decrypt the remainder.
func (t *transformerWithInfo) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
	payload, hasInfo := bytes.CutPrefix(data, t.info)
	if !hasInfo {
		return nil, false, errors.New("the stored data is missing the required info prefix")
	}
	return t.transformer.TransformFromStorage(ctx, payload, dataCtx)
}
// TransformToStorage encrypts data with the wrapped transformer and returns
// the ciphertext with t.info prepended.
func (t *transformerWithInfo) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
	ciphertext, err := t.transformer.TransformToStorage(ctx, data, dataCtx)
	if err != nil {
		return nil, err
	}

	// Lay out info followed by ciphertext in one exactly-sized buffer.
	framed := make([]byte, len(ciphertext)+len(t.info))
	offset := copy(framed, t.info)
	copy(framed[offset:], ciphertext)
	return framed, nil
}

View File

@ -0,0 +1,91 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package aes
import (
"bytes"
"time"
"unsafe"
utilcache "k8s.io/apimachinery/pkg/util/cache"
"k8s.io/apiserver/pkg/storage/value"
"k8s.io/utils/clock"
)
// simpleCache stores *transformerWithInfo values in an expiring cache, keyed
// by the authenticated data of the value.Context they were created for.
type simpleCache struct {
// cache is the underlying store; newSimpleCache enables AllowExpiredGet on it.
cache *utilcache.Expiring
// ttl is the lifetime passed to cache.Set for every entry.
ttl time.Duration
}
// newSimpleCache creates a simpleCache backed by an expiring cache that runs on
// the supplied clock and gives every entry the supplied ttl.
func newSimpleCache(clock clock.Clock, ttl time.Duration) *simpleCache {
	expiring := utilcache.NewExpiringWithClock(clock)

	// "Stale" entries stay valid for this cache: the TTL only exists to keep the
	// cache from growing without bound, and for a given info the transformer is
	// always the same. A key always maps to the exact same value, with one
	// caveat: because value.Context.AuthenticatedData is used to overwrite old
	// keys, readers must always check that the info matches (to validate that
	// the transformer is the correct one).
	expiring.AllowExpiredGet = true

	return &simpleCache{cache: expiring, ttl: ttl}
}
// get looks up the entry stored for dataCtx and returns it only when its info
// equals the requested info; otherwise (no entry, or a different info) it
// returns nil.
func (c *simpleCache) get(info []byte, dataCtx value.Context) *transformerWithInfo {
	entry, found := c.cache.Get(keyFunc(dataCtx))
	if !found {
		return nil
	}

	if candidate := entry.(*transformerWithInfo); bytes.Equal(candidate.info, info) {
		return candidate
	}
	return nil
}
// set stores transformer under the key derived from dataCtx, using the cache's
// ttl. It panics when dataCtx is nil or carries no authenticated data, when
// transformer is nil, or when the transformer's info is empty.
func (c *simpleCache) set(dataCtx value.Context, transformer *transformerWithInfo) {
	switch {
	case dataCtx == nil || len(dataCtx.AuthenticatedData()) == 0:
		panic("authenticated data must not be empty")
	case transformer == nil:
		panic("transformer must not be nil")
	case len(transformer.info) == 0:
		panic("info must not be empty")
	}

	cacheKey := keyFunc(dataCtx)
	c.cache.Set(cacheKey, transformer, c.ttl)
}
// keyFunc turns the authenticated data of dataCtx into the string used as the
// cache key, via toString.
func keyFunc(dataCtx value.Context) string {
	authenticated := dataCtx.AuthenticatedData()
	return toString(authenticated)
}
// toString reinterprets b as a string without copying, to avoid allocations.
// The returned string shares memory with b. Empty input yields "".
func toString(b []byte) string {
	// unsafe.SliceData relies on cap whereas we want to rely on len, so only
	// take the unsafe path when there is at least one byte to expose.
	if n := len(b); n > 0 {
		// Copied from go 1.20.1 strings.Builder.String
		// https://github.com/golang/go/blob/202a1a57064127c3f19d96df57b9f9586145e21c/src/strings/builder.go#L48
		return unsafe.String(unsafe.SliceData(b), n)
	}
	return ""
}

View File

@ -18,7 +18,6 @@ limitations under the License.
package kmsv2
import (
"context"
"crypto/sha256"
"hash"
"sync"
@ -30,17 +29,10 @@ import (
"k8s.io/utils/clock"
)
// prevent decryptTransformer from drifting from value.Transformer
var _ decryptTransformer = value.Transformer(nil)
// decryptTransformer is the decryption subset of value.Transformer.
// this exists purely to statically enforce that transformers placed in the cache are not used for encryption.
// simpleCache stores the decryption subset of value.Transformer (value.Read).
// this statically enforces that transformers placed in the cache are not used for encryption.
// this is relevant in the context of nonce collision since transformers that are created
// from encrypted DEKs retrieved from etcd cannot maintain their nonce counter state.
type decryptTransformer interface {
TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) (out []byte, stale bool, err error)
}
type simpleCache struct {
cache *utilcache.Expiring
ttl time.Duration
@ -50,8 +42,10 @@ type simpleCache struct {
}
func newSimpleCache(clock clock.Clock, ttl time.Duration) *simpleCache {
cache := utilcache.NewExpiringWithClock(clock)
cache.AllowExpiredGet = true // for a given key, the value (the decryptTransformer) is always the same
return &simpleCache{
cache: utilcache.NewExpiringWithClock(clock),
cache: cache,
ttl: ttl,
hashPool: &sync.Pool{
New: func() interface{} {
@ -62,16 +56,16 @@ func newSimpleCache(clock clock.Clock, ttl time.Duration) *simpleCache {
}
// given a key, return the transformer, or nil if it does not exist in the cache
func (c *simpleCache) get(key []byte) decryptTransformer {
func (c *simpleCache) get(key []byte) value.Read {
record, ok := c.cache.Get(c.keyFunc(key))
if !ok {
return nil
}
return record.(decryptTransformer)
return record.(value.Read)
}
// set caches the record for the key
func (c *simpleCache) set(key []byte, transformer decryptTransformer) {
func (c *simpleCache) set(key []byte, transformer value.Read) {
if len(key) == 0 {
panic("key must not be empty")
}

View File

@ -20,6 +20,8 @@ package kmsv2
import (
"context"
"crypto/aes"
"crypto/cipher"
"crypto/sha256"
"fmt"
"sort"
"time"
@ -42,6 +44,8 @@ import (
"k8s.io/utils/clock"
)
// TODO integration test with old AES GCM data recorded and new KDF data recorded
func init() {
value.RegisterMetrics()
metrics.RegisterMetrics()
@ -54,22 +58,22 @@ const (
annotationsMaxSize = 32 * 1024 // 32 kB
// KeyIDMaxSize is the maximum size of the keyID.
KeyIDMaxSize = 1 * 1024 // 1 kB
// encryptedDEKMaxSize is the maximum size of the encrypted DEK.
encryptedDEKMaxSize = 1 * 1024 // 1 kB
// encryptedDEKSourceMaxSize is the maximum size of the encrypted DEK source.
encryptedDEKSourceMaxSize = 1 * 1024 // 1 kB
// cacheTTL is the default time-to-live for the cache entry.
// this allows the cache to grow to an infinite size for up to a day.
// this is meant as a temporary solution until the cache is re-written to not have a TTL.
// there is unlikely to be any meaningful memory impact on the server
// because the cache will likely never have more than a few thousand entries
// and each entry is roughly ~200 bytes in size. with DEK reuse
// and no storage migration, the number of entries in this cache
// because the cache will likely never have more than a few thousand entries.
// each entry can be large due to an internal cache that maps the DEK seed to individual
// DEK entries, but that cache has an aggressive TTL to keep the size under control.
// with DEK/seed reuse and no storage migration, the number of entries in this cache
// would be approximated by unique key IDs used by the KMS plugin
// combined with the number of server restarts. If storage migration
// is performed after key ID changes, and the number of restarts
// is limited, this cache size may be as small as the number of API
// servers in use (once old entries expire out from the TTL).
cacheTTL = 24 * time.Hour
// error code
// key ID related error codes for metrics
errKeyIDOKCode ErrCodeKeyID = "ok"
errKeyIDEmptyCode ErrCodeKeyID = "empty"
errKeyIDTooLongCode ErrCodeKeyID = "too_long"
@ -82,23 +86,22 @@ type StateFunc func() (State, error)
type ErrCodeKeyID string
type State struct {
Transformer value.Transformer
EncryptedDEK []byte
KeyID string
Annotations map[string][]byte
Transformer value.Transformer
EncryptedObject kmstypes.EncryptedObject
UID string
ExpirationTimestamp time.Time
// CacheKey is the key used to cache the DEK in transformer.cache.
// CacheKey is the key used to cache the DEK/seed in envelopeTransformer.cache.
CacheKey []byte
}
func (s *State) ValidateEncryptCapability() error {
if now := NowFunc(); now.After(s.ExpirationTimestamp) {
return fmt.Errorf("EDEK with keyID %q expired at %s (current time is %s)",
s.KeyID, s.ExpirationTimestamp.Format(time.RFC3339), now.Format(time.RFC3339))
return fmt.Errorf("encryptedDEKSource with keyID hash %q expired at %s (current time is %s)",
GetHashIfNotEmpty(s.EncryptedObject.KeyID), s.ExpirationTimestamp.Format(time.RFC3339), now.Format(time.RFC3339))
}
return nil
}
@ -136,6 +139,8 @@ func (t *envelopeTransformer) TransformFromStorage(ctx context.Context, data []b
return nil, false, err
}
useSeed := encryptedObject.EncryptedDEKSourceType == kmstypes.EncryptedDEKSourceType_HKDF_SHA256_XNONCE_AES_GCM_SEED
// TODO: consider marking state.EncryptedDEK != encryptedObject.EncryptedDEK as a stale read to support DEK defragmentation
// at a minimum we should have a metric that helps the user understand if DEK fragmentation is high
state, err := t.stateFunc() // no need to call state.ValidateEncryptCapability on reads
@ -143,7 +148,7 @@ func (t *envelopeTransformer) TransformFromStorage(ctx context.Context, data []b
return nil, false, err
}
encryptedObjectCacheKey, err := generateCacheKey(encryptedObject.EncryptedDEK, encryptedObject.KeyID, encryptedObject.Annotations)
encryptedObjectCacheKey, err := generateCacheKey(encryptedObject.EncryptedDEKSourceType, encryptedObject.EncryptedDEKSource, encryptedObject.KeyID, encryptedObject.Annotations)
if err != nil {
return nil, false, err
}
@ -162,7 +167,7 @@ func (t *envelopeTransformer) TransformFromStorage(ctx context.Context, data []b
"verb", requestInfo.Verb, "namespace", requestInfo.Namespace, "name", requestInfo.Name)
key, err := t.envelopeService.Decrypt(ctx, uid, &kmsservice.DecryptRequest{
Ciphertext: encryptedObject.EncryptedDEK,
Ciphertext: encryptedObject.EncryptedDEKSource,
KeyID: encryptedObject.KeyID,
Annotations: encryptedObject.Annotations,
})
@ -170,7 +175,7 @@ func (t *envelopeTransformer) TransformFromStorage(ctx context.Context, data []b
return nil, false, fmt.Errorf("failed to decrypt DEK, error: %w", err)
}
transformer, err = t.addTransformerForDecryption(encryptedObjectCacheKey, key)
transformer, err = t.addTransformerForDecryption(encryptedObjectCacheKey, key, useSeed)
if err != nil {
return nil, false, err
}
@ -183,8 +188,11 @@ func (t *envelopeTransformer) TransformFromStorage(ctx context.Context, data []b
}
// data is considered stale if the key ID does not match our current write transformer
return out, stale || encryptedObject.KeyID != state.KeyID, nil
return out,
stale ||
encryptedObject.KeyID != state.EncryptedObject.KeyID ||
encryptedObject.EncryptedDEKSourceType != state.EncryptedObject.EncryptedDEKSourceType,
nil
}
// TransformToStorage encrypts data to be written to disk using envelope encryption.
@ -200,7 +208,7 @@ func (t *envelopeTransformer) TransformToStorage(ctx context.Context, data []byt
// this prevents a cache miss every time the DEK rotates
// this has the side benefit of causing the cache to perform a GC
// TODO see if we can do this inside the stateFunc control loop
// TODO(aramase): Add metrics for cache fill percentage with custom cache implementation.
// TODO(aramase): Add metrics for cache size.
t.cache.set(state.CacheKey, state.Transformer)
requestInfo := getRequestInfoFromContext(ctx)
@ -213,39 +221,43 @@ func (t *envelopeTransformer) TransformToStorage(ctx context.Context, data []byt
return nil, err
}
metrics.RecordKeyID(metrics.ToStorageLabel, t.providerName, state.KeyID)
metrics.RecordKeyID(metrics.ToStorageLabel, t.providerName, state.EncryptedObject.KeyID)
encObject := &kmstypes.EncryptedObject{
KeyID: state.KeyID,
EncryptedDEK: state.EncryptedDEK,
EncryptedData: result,
Annotations: state.Annotations,
}
encObjectCopy := state.EncryptedObject
encObjectCopy.EncryptedData = result
// Serialize the EncryptedObject to a byte array.
return t.doEncode(encObject)
return t.doEncode(&encObjectCopy)
}
// addTransformerForDecryption inserts a new transformer to the Envelope cache of DEKs for future reads.
func (t *envelopeTransformer) addTransformerForDecryption(cacheKey []byte, key []byte) (decryptTransformer, error) {
block, err := aes.NewCipher(key)
func (t *envelopeTransformer) addTransformerForDecryption(cacheKey []byte, key []byte, useSeed bool) (value.Read, error) {
var transformer value.Read
var err error
if useSeed {
// the input key is considered safe to use here because it is coming from the KMS plugin / etcd
transformer, err = aestransformer.NewHKDFExtendedNonceGCMTransformer(key)
} else {
var block cipher.Block
block, err = aes.NewCipher(key)
if err != nil {
return nil, err
}
// this is compatible with NewGCMTransformerWithUniqueKeyUnsafe for decryption
// it would use random nonces for encryption but we never do that
transformer, err = aestransformer.NewGCMTransformer(block)
}
if err != nil {
return nil, err
}
// this is compatible with NewGCMTransformerWithUniqueKeyUnsafe for decryption
// it would use random nonces for encryption but we never do that
transformer, err := aestransformer.NewGCMTransformer(block)
if err != nil {
return nil, err
}
// TODO(aramase): Add metrics for cache fill percentage with custom cache implementation.
// TODO(aramase): Add metrics for cache size.
t.cache.set(cacheKey, transformer)
return transformer, nil
}
// doEncode encodes the EncryptedObject to a byte array.
func (t *envelopeTransformer) doEncode(request *kmstypes.EncryptedObject) ([]byte, error) {
if err := validateEncryptedObject(request); err != nil {
if err := ValidateEncryptedObject(request); err != nil {
return nil, err
}
return proto.Marshal(request)
@ -257,16 +269,31 @@ func (t *envelopeTransformer) doDecode(originalData []byte) (*kmstypes.Encrypted
if err := proto.Unmarshal(originalData, o); err != nil {
return nil, err
}
// validate the EncryptedObject
if err := validateEncryptedObject(o); err != nil {
if err := ValidateEncryptedObject(o); err != nil {
return nil, err
}
return o, nil
}
func GenerateTransformer(ctx context.Context, uid string, envelopeService kmsservice.Service) (value.Transformer, *kmsservice.EncryptResponse, []byte, error) {
transformer, newKey, err := aestransformer.NewGCMTransformerWithUniqueKeyUnsafe()
// GenerateTransformer generates a new transformer and encrypts the DEK/seed using the envelope service.
// It returns the transformer, the encrypted DEK/seed, cache key and error.
func GenerateTransformer(ctx context.Context, uid string, envelopeService kmsservice.Service, useSeed bool) (value.Transformer, *kmstypes.EncryptedObject, []byte, error) {
newTransformerFunc := func() (value.Transformer, []byte, error) {
seed, err := aestransformer.GenerateKey(aestransformer.MinSeedSizeExtendedNonceGCM)
if err != nil {
return nil, nil, err
}
transformer, err := aestransformer.NewHKDFExtendedNonceGCMTransformer(seed)
if err != nil {
return nil, nil, err
}
return transformer, seed, nil
}
if !useSeed {
newTransformerFunc = aestransformer.NewGCMTransformerWithUniqueKeyUnsafe
}
transformer, newKey, err := newTransformerFunc()
if err != nil {
return nil, nil, nil, err
}
@ -278,32 +305,48 @@ func GenerateTransformer(ctx context.Context, uid string, envelopeService kmsser
return nil, nil, nil, fmt.Errorf("failed to encrypt DEK, error: %w", err)
}
if err := validateEncryptedObject(&kmstypes.EncryptedObject{
KeyID: resp.KeyID,
EncryptedDEK: resp.Ciphertext,
EncryptedData: []byte{0}, // any non-empty value to pass validation
Annotations: resp.Annotations,
}); err != nil {
o := &kmstypes.EncryptedObject{
KeyID: resp.KeyID,
EncryptedDEKSource: resp.Ciphertext,
EncryptedData: []byte{0}, // any non-empty value to pass validation
Annotations: resp.Annotations,
}
if useSeed {
o.EncryptedDEKSourceType = kmstypes.EncryptedDEKSourceType_HKDF_SHA256_XNONCE_AES_GCM_SEED
} else {
o.EncryptedDEKSourceType = kmstypes.EncryptedDEKSourceType_AES_GCM_KEY
}
if err := ValidateEncryptedObject(o); err != nil {
return nil, nil, nil, err
}
cacheKey, err := generateCacheKey(resp.Ciphertext, resp.KeyID, resp.Annotations)
cacheKey, err := generateCacheKey(o.EncryptedDEKSourceType, resp.Ciphertext, resp.KeyID, resp.Annotations)
if err != nil {
return nil, nil, nil, err
}
return transformer, resp, cacheKey, nil
o.EncryptedData = nil // make sure that later code that uses this encrypted object sets this field
return transformer, o, cacheKey, nil
}
func validateEncryptedObject(o *kmstypes.EncryptedObject) error {
func ValidateEncryptedObject(o *kmstypes.EncryptedObject) error {
if o == nil {
return fmt.Errorf("encrypted object is nil")
}
switch t := o.EncryptedDEKSourceType; t {
case kmstypes.EncryptedDEKSourceType_AES_GCM_KEY:
case kmstypes.EncryptedDEKSourceType_HKDF_SHA256_XNONCE_AES_GCM_SEED:
default:
return fmt.Errorf("unknown encryptedDEKSourceType: %d", t)
}
if len(o.EncryptedData) == 0 {
return fmt.Errorf("encrypted data is empty")
}
if err := validateEncryptedDEK(o.EncryptedDEK); err != nil {
return fmt.Errorf("failed to validate encrypted DEK: %w", err)
if err := validateEncryptedDEKSource(o.EncryptedDEKSource); err != nil {
return fmt.Errorf("failed to validate encrypted DEK source: %w", err)
}
if _, err := ValidateKeyID(o.KeyID); err != nil {
return fmt.Errorf("failed to validate key id: %w", err)
@ -314,15 +357,15 @@ func validateEncryptedObject(o *kmstypes.EncryptedObject) error {
return nil
}
// validateEncryptedDEK tests the following:
// 1. The encrypted DEK is not empty.
// 2. The size of encrypted DEK is less than 1 kB.
func validateEncryptedDEK(encryptedDEK []byte) error {
if len(encryptedDEK) == 0 {
return fmt.Errorf("encrypted DEK is empty")
// validateEncryptedDEKSource tests the following:
// 1. The encrypted DEK source is not empty.
// 2. The size of encrypted DEK source is less than 1 kB.
func validateEncryptedDEKSource(encryptedDEKSource []byte) error {
if len(encryptedDEKSource) == 0 {
return fmt.Errorf("encrypted DEK source is empty")
}
if len(encryptedDEK) > encryptedDEKMaxSize {
return fmt.Errorf("encrypted DEK is %d bytes, which exceeds the max size of %d", len(encryptedDEK), encryptedDEKMaxSize)
if len(encryptedDEKSource) > encryptedDEKSourceMaxSize {
return fmt.Errorf("encrypted DEK source is %d bytes, which exceeds the max size of %d", len(encryptedDEKSource), encryptedDEKSourceMaxSize)
}
return nil
}
@ -367,17 +410,19 @@ func getRequestInfoFromContext(ctx context.Context) *genericapirequest.RequestIn
// generateCacheKey returns a key for the cache.
// The key is a concatenation of:
// 1. encryptedDEK
// 0. encryptedDEKSourceType
// 1. encryptedDEKSource
// 2. keyID
// 3. length of annotations
// 4. annotations (sorted by key) - each annotation is a concatenation of:
// a. annotation key
// b. annotation value
func generateCacheKey(encryptedDEK []byte, keyID string, annotations map[string][]byte) ([]byte, error) {
func generateCacheKey(encryptedDEKSourceType kmstypes.EncryptedDEKSourceType, encryptedDEKSource []byte, keyID string, annotations map[string][]byte) ([]byte, error) {
// TODO(aramase): use sync pool buffer to avoid allocations
b := cryptobyte.NewBuilder(nil)
b.AddUint32(uint32(encryptedDEKSourceType))
b.AddUint16LengthPrefixed(func(b *cryptobyte.Builder) {
b.AddBytes(encryptedDEK)
b.AddBytes(encryptedDEKSource)
})
b.AddUint16LengthPrefixed(func(b *cryptobyte.Builder) {
b.AddBytes(toBytes(keyID))
@ -420,3 +465,11 @@ func toBytes(s string) []byte {
// https://github.com/golang/go/blob/202a1a57064127c3f19d96df57b9f9586145e21c/src/os/file.go#L246
return unsafe.Slice(unsafe.StringData(s), len(s))
}
// GetHashIfNotEmpty returns the SHA-256 digest of data rendered as lowercase
// hex and prefixed with "sha256:". An empty input yields an empty string
// instead of the digest of the empty string.
func GetHashIfNotEmpty(data string) string {
	if len(data) == 0 {
		return ""
	}
	digest := sha256.Sum256([]byte(data))
	return fmt.Sprintf("sha256:%x", digest)
}

View File

@ -36,19 +36,52 @@ var _ = math.Inf
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
// EncryptedDEKSourceType defines the process of using the plaintext of
// encryptedDEKSource to determine the DEK that protects encryptedData.
type EncryptedDEKSourceType int32
const (
// AES_GCM_KEY means that the plaintext of encryptedDEKSource is the DEK itself, with AES-GCM as the encryption algorithm.
EncryptedDEKSourceType_AES_GCM_KEY EncryptedDEKSourceType = 0
// HKDF_SHA256_XNONCE_AES_GCM_SEED means that the plaintext of encryptedDEKSource is the pseudo random key
// (referred to as the seed throughout the code) that is fed into HKDF expand. SHA256 is the hash algorithm
// and the first 32 bytes of encryptedData are the info param. The first 32 bytes from the HKDF stream are used
// as the DEK with AES-GCM as the encryption algorithm.
EncryptedDEKSourceType_HKDF_SHA256_XNONCE_AES_GCM_SEED EncryptedDEKSourceType = 1
)
// EncryptedDEKSourceType_name maps each enum number to its proto name.
var EncryptedDEKSourceType_name = map[int32]string{
0: "AES_GCM_KEY",
1: "HKDF_SHA256_XNONCE_AES_GCM_SEED",
}
// EncryptedDEKSourceType_value is the inverse of EncryptedDEKSourceType_name:
// it maps each proto name to its enum number.
var EncryptedDEKSourceType_value = map[string]int32{
"AES_GCM_KEY": 0,
"HKDF_SHA256_XNONCE_AES_GCM_SEED": 1,
}
// String returns the result of proto.EnumName for x, looked up in
// EncryptedDEKSourceType_name.
func (x EncryptedDEKSourceType) String() string {
return proto.EnumName(EncryptedDEKSourceType_name, int32(x))
}
// EnumDescriptor returns the file descriptor bytes (fileDescriptor_00212fb1f9d3bf1c)
// together with the index path []int{0} — presumably this enum's position
// within that descriptor.
func (EncryptedDEKSourceType) EnumDescriptor() ([]byte, []int) {
return fileDescriptor_00212fb1f9d3bf1c, []int{0}
}
// EncryptedObject is the representation of data stored in etcd after envelope encryption.
type EncryptedObject struct {
// EncryptedData is the encrypted data.
EncryptedData []byte `protobuf:"bytes,1,opt,name=encryptedData,proto3" json:"encryptedData,omitempty"`
// KeyID is the KMS key ID used for encryption operations.
KeyID string `protobuf:"bytes,2,opt,name=keyID,proto3" json:"keyID,omitempty"`
// EncryptedDEK is the encrypted DEK.
EncryptedDEK []byte `protobuf:"bytes,3,opt,name=encryptedDEK,proto3" json:"encryptedDEK,omitempty"`
// EncryptedDEKSource is the ciphertext of the source of the DEK used to encrypt the data stored in encryptedData.
// encryptedDEKSourceType defines the process of using the plaintext of this field to determine the aforementioned DEK.
EncryptedDEKSource []byte `protobuf:"bytes,3,opt,name=encryptedDEKSource,proto3" json:"encryptedDEKSource,omitempty"`
// Annotations is additional metadata that was provided by the KMS plugin.
Annotations map[string][]byte `protobuf:"bytes,4,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
Annotations map[string][]byte `protobuf:"bytes,4,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
// encryptedDEKSourceType defines the process of using the plaintext of encryptedDEKSource to determine the DEK.
EncryptedDEKSourceType EncryptedDEKSourceType `protobuf:"varint,5,opt,name=encryptedDEKSourceType,proto3,enum=v2.EncryptedDEKSourceType" json:"encryptedDEKSourceType,omitempty"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
func (m *EncryptedObject) Reset() { *m = EncryptedObject{} }
@ -89,9 +122,9 @@ func (m *EncryptedObject) GetKeyID() string {
return ""
}
func (m *EncryptedObject) GetEncryptedDEK() []byte {
func (m *EncryptedObject) GetEncryptedDEKSource() []byte {
if m != nil {
return m.EncryptedDEK
return m.EncryptedDEKSource
}
return nil
}
@ -103,7 +136,15 @@ func (m *EncryptedObject) GetAnnotations() map[string][]byte {
return nil
}
// GetEncryptedDEKSourceType returns m.EncryptedDEKSourceType. It is safe to
// call on a nil receiver, in which case it returns
// EncryptedDEKSourceType_AES_GCM_KEY (the enum's zero value).
func (m *EncryptedObject) GetEncryptedDEKSourceType() EncryptedDEKSourceType {
if m != nil {
return m.EncryptedDEKSourceType
}
return EncryptedDEKSourceType_AES_GCM_KEY
}
// init registers the EncryptedDEKSourceType enum, the EncryptedObject message
// and its annotations map entry type with the proto package under the "v2"
// proto package name.
func init() {
proto.RegisterEnum("v2.EncryptedDEKSourceType", EncryptedDEKSourceType_name, EncryptedDEKSourceType_value)
proto.RegisterType((*EncryptedObject)(nil), "v2.EncryptedObject")
proto.RegisterMapType((map[string][]byte)(nil), "v2.EncryptedObject.AnnotationsEntry")
}
@ -111,21 +152,26 @@ func init() {
// init registers the descriptor bytes for "api.proto" with the proto package.
func init() { proto.RegisterFile("api.proto", fileDescriptor_00212fb1f9d3bf1c) }
var fileDescriptor_00212fb1f9d3bf1c = []byte{
// 244 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x5c, 0x90, 0xb1, 0x4b, 0x03, 0x31,
0x14, 0xc6, 0xc9, 0x9d, 0x0a, 0x97, 0x9e, 0x58, 0x82, 0xc3, 0xe1, 0x74, 0x94, 0x0e, 0x37, 0x25,
0x10, 0x97, 0x22, 0x52, 0x50, 0x7a, 0x82, 0x38, 0x08, 0x19, 0xdd, 0xd2, 0xfa, 0x28, 0x67, 0x6a,
0x12, 0x92, 0x18, 0xc8, 0x9f, 0xee, 0x26, 0x4d, 0x95, 0xda, 0xdb, 0xde, 0xf7, 0xf1, 0xfb, 0xe0,
0xc7, 0xc3, 0x95, 0xb4, 0x03, 0xb5, 0xce, 0x04, 0x43, 0x8a, 0xc8, 0x67, 0xdf, 0x08, 0x5f, 0xf5,
0x7a, 0xe3, 0x92, 0x0d, 0xf0, 0xfe, 0xba, 0xfe, 0x80, 0x4d, 0x20, 0x73, 0x7c, 0x09, 0x7f, 0xd5,
0x4a, 0x06, 0xd9, 0xa0, 0x16, 0x75, 0xb5, 0x38, 0x2d, 0xc9, 0x35, 0x3e, 0x57, 0x90, 0x9e, 0x57,
0x4d, 0xd1, 0xa2, 0xae, 0x12, 0x87, 0x40, 0x66, 0xb8, 0x3e, 0x62, 0xfd, 0x4b, 0x53, 0xe6, 0xe9,
0x49, 0x47, 0x9e, 0xf0, 0x44, 0x6a, 0x6d, 0x82, 0x0c, 0x83, 0xd1, 0xbe, 0x39, 0x6b, 0xcb, 0x6e,
0xc2, 0xe7, 0x34, 0x72, 0x3a, 0x32, 0xa1, 0x0f, 0x47, 0xac, 0xd7, 0xc1, 0x25, 0xf1, 0x7f, 0x78,
0xb3, 0xc4, 0xd3, 0x31, 0x40, 0xa6, 0xb8, 0x54, 0x90, 0xb2, 0x71, 0x25, 0xf6, 0xe7, 0xde, 0x33,
0xca, 0xdd, 0x17, 0x64, 0xcf, 0x5a, 0x1c, 0xc2, 0x5d, 0xb1, 0x40, 0x8f, 0xcb, 0xb7, 0x7b, 0xb5,
0xf0, 0x74, 0x30, 0x4c, 0xda, 0xc1, 0x83, 0x8b, 0xe0, 0x98, 0x55, 0x5b, 0xe6, 0x83, 0x71, 0x72,
0x0b, 0x2c, 0x93, 0xec, 0x57, 0x9d, 0x81, 0x8e, 0xb0, 0x33, 0x16, 0x98, 0xfa, 0xf4, 0x91, 0xb3,
0xc8, 0xd7, 0x17, 0xf9, 0x8d, 0xb7, 0x3f, 0x01, 0x00, 0x00, 0xff, 0xff, 0x00, 0x80, 0x43, 0x93,
0x53, 0x01, 0x00, 0x00,
// 329 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x74, 0x91, 0xe1, 0x4b, 0xc2, 0x40,
0x18, 0xc6, 0xdb, 0xcc, 0xc0, 0xd3, 0x72, 0x1c, 0x21, 0xc3, 0x2f, 0x8d, 0xf2, 0xc3, 0xe8, 0xc3,
0x0e, 0x16, 0x85, 0x44, 0x08, 0xe6, 0xce, 0x0c, 0x49, 0x61, 0xeb, 0x43, 0xf5, 0x65, 0x9c, 0xf6,
0x22, 0x6b, 0xb6, 0x1b, 0xb7, 0xf3, 0x60, 0x7f, 0x6a, 0xff, 0x4d, 0x38, 0x13, 0xd3, 0xec, 0xdb,
0xbd, 0xef, 0xfd, 0xde, 0xe7, 0xb9, 0x7b, 0x5e, 0x54, 0x61, 0x69, 0xe4, 0xa4, 0x82, 0x4b, 0x8e,
0x75, 0xe5, 0x9e, 0x7f, 0xe9, 0xa8, 0x4e, 0x93, 0xa9, 0xc8, 0x53, 0x09, 0xef, 0xe3, 0xc9, 0x07,
0x4c, 0x25, 0x6e, 0xa1, 0x63, 0x58, 0xb7, 0x3c, 0x26, 0x99, 0xa9, 0x59, 0x9a, 0x5d, 0xf3, 0xb7,
0x9b, 0xf8, 0x14, 0x95, 0x63, 0xc8, 0x1f, 0x3d, 0x53, 0xb7, 0x34, 0xbb, 0xe2, 0xaf, 0x0a, 0xec,
0x20, 0xbc, 0xc1, 0xe8, 0x30, 0xe0, 0x0b, 0x31, 0x05, 0xb3, 0x54, 0x08, 0xec, 0xb9, 0xc1, 0x7d,
0x54, 0x65, 0x49, 0xc2, 0x25, 0x93, 0x11, 0x4f, 0x32, 0xf3, 0xd0, 0x2a, 0xd9, 0x55, 0xb7, 0xe5,
0x28, 0xd7, 0xd9, 0x79, 0x95, 0xd3, 0xdd, 0x60, 0x34, 0x91, 0x22, 0xf7, 0x7f, 0x0f, 0x62, 0x1f,
0x35, 0xfe, 0xaa, 0x3f, 0xe7, 0x29, 0x98, 0x65, 0x4b, 0xb3, 0x4f, 0xdc, 0xe6, 0x96, 0xe4, 0x16,
0xe1, 0xff, 0x33, 0xd9, 0xec, 0x20, 0x63, 0xd7, 0x14, 0x1b, 0xa8, 0x14, 0x43, 0x5e, 0x24, 0x52,
0xf1, 0x97, 0xc7, 0x65, 0x0e, 0x8a, 0xcd, 0x17, 0x50, 0xe4, 0x50, 0xf3, 0x57, 0xc5, 0xad, 0xde,
0xd6, 0x2e, 0x47, 0xa8, 0xb1, 0xdf, 0x11, 0xd7, 0x51, 0xb5, 0x4b, 0x83, 0xf0, 0xa1, 0xf7, 0x14,
0x0e, 0xe9, 0xab, 0x71, 0x80, 0x2f, 0xd0, 0xd9, 0x60, 0xe8, 0xf5, 0xc3, 0x60, 0xd0, 0x75, 0xaf,
0x6f, 0xc2, 0x97, 0xd1, 0x78, 0xd4, 0xa3, 0xe1, 0x9a, 0x09, 0x28, 0xf5, 0x0c, 0xed, 0xbe, 0xf3,
0x76, 0x17, 0xb7, 0x33, 0x27, 0xe2, 0x84, 0xa5, 0x51, 0x06, 0x42, 0x81, 0x20, 0x69, 0x3c, 0x23,
0x99, 0xe4, 0x82, 0xcd, 0x80, 0x14, 0xce, 0xe4, 0xe7, 0x33, 0x04, 0x12, 0x05, 0x73, 0x9e, 0x02,
0x89, 0x3f, 0x33, 0xe5, 0x12, 0xe5, 0x4e, 0x8e, 0x8a, 0xb5, 0x5f, 0x7d, 0x07, 0x00, 0x00, 0xff,
0xff, 0xcc, 0x0f, 0x2b, 0x2e, 0x03, 0x02, 0x00, 0x00,
}

View File

@ -28,9 +28,24 @@ message EncryptedObject {
// KeyID is the KMS key ID used for encryption operations.
string keyID = 2;
// EncryptedDEK is the encrypted DEK.
bytes encryptedDEK = 3;
// EncryptedDEKSource is the ciphertext of the source of the DEK used to encrypt the data stored in encryptedData.
// encryptedDEKSourceType defines the process of using the plaintext of this field to determine the aforementioned DEK.
bytes encryptedDEKSource = 3;
// Annotations is additional metadata that was provided by the KMS plugin.
map<string, bytes> annotations = 4;
// encryptedDEKSourceType defines the process of using the plaintext of encryptedDEKSource to determine the DEK.
EncryptedDEKSourceType encryptedDEKSourceType = 5;
}
// EncryptedDEKSourceType defines the process of using the plaintext of
// encryptedDEKSource to determine the DEK.
enum EncryptedDEKSourceType {
// AES_GCM_KEY means that the plaintext of encryptedDEKSource is the DEK itself, with AES-GCM as the encryption algorithm.
AES_GCM_KEY = 0;
// HKDF_SHA256_XNONCE_AES_GCM_SEED means that the plaintext of encryptedDEKSource is the pseudo random key
// (referred to as the seed throughout the code) that is fed into HKDF expand. SHA256 is the hash algorithm
// and the first 32 bytes of encryptedData are the info param. The first 32 bytes from the HKDF stream are used
// as the DEK with AES-GCM as the encryption algorithm.
HKDF_SHA256_XNONCE_AES_GCM_SEED = 1;
}

View File

@ -17,9 +17,11 @@ limitations under the License.
package value
import (
"errors"
"sync"
"time"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"k8s.io/component-base/metrics"
@ -59,7 +61,7 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "transformation_operations_total",
Help: "Total number of transformations.",
Help: "Total number of transformations. Successful transformation will have a status 'OK' and a varied status string when the transformation fails. This status and transformation_type fields may be used for alerting on encryption/decryption failure using transformation_type from_storage for decryption and to_storage for encryption",
StabilityLevel: metrics.ALPHA,
},
[]string{"transformation_type", "transformer_prefix", "status"},
@ -112,7 +114,7 @@ func RegisterMetrics() {
// RecordTransformation records latencies and count of TransformFromStorage and TransformToStorage operations.
// Note that transformation_failures_total metric is deprecated, use transformation_operations_total instead.
func RecordTransformation(transformationType, transformerPrefix string, elapsed time.Duration, err error) {
transformerOperationsTotal.WithLabelValues(transformationType, transformerPrefix, status.Code(err).String()).Inc()
transformerOperationsTotal.WithLabelValues(transformationType, transformerPrefix, getErrorCode(err)).Inc()
if err == nil {
transformerLatencies.WithLabelValues(transformationType, transformerPrefix).Observe(elapsed.Seconds())
@ -138,3 +140,23 @@ func RecordDataKeyGeneration(start time.Time, err error) {
func sinceInSeconds(start time.Time) float64 {
return time.Since(start).Seconds()
}
// gRPCError is satisfied by any error value that can report a gRPC status
// through a GRPCStatus method. getErrorCode uses it as the errors.As target so
// that wrapped gRPC errors are still recognized.
type gRPCError interface {
GRPCStatus() *status.Status
}
func getErrorCode(err error) string {
if err == nil {
return codes.OK.String()
}
// handle errors wrapped with fmt.Errorf and similar
var s gRPCError
if errors.As(err, &s) {
return s.GRPCStatus().Code().String()
}
// This is not gRPC error. The operation must have failed before gRPC
// method was called, otherwise we would get gRPC error.
return "unknown-non-grpc"
}

View File

@ -23,7 +23,10 @@ import (
"fmt"
"time"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/errors"
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/klog/v2"
)
func init() {
@ -39,17 +42,30 @@ type Context interface {
AuthenticatedData() []byte
}
// Transformer allows a value to be transformed before being read from or written to the underlying store. The methods
// must be able to undo the transformation caused by the other.
type Transformer interface {
// Read is the read half of a Transformer: it converts data from its underlying
// storage representation.
type Read interface {
// TransformFromStorage may transform the provided data from its underlying storage representation or return an error.
// Stale is true if the object on disk is stale and a write to etcd should be issued, even if the contents of the object
// have not changed.
TransformFromStorage(ctx context.Context, data []byte, dataCtx Context) (out []byte, stale bool, err error)
}
// Write is the write half of a Transformer: it converts data into the form
// that is persisted in storage.
type Write interface {
// TransformToStorage may transform the provided data into the appropriate form in storage or return an error.
TransformToStorage(ctx context.Context, data []byte, dataCtx Context) (out []byte, err error)
}
// Transformer allows a value to be transformed before being read from or written to the underlying store.
// It is the union of Read and Write; the two methods must be able to undo the transformation caused by
// the other.
type Transformer interface {
Read
Write
}
// ResourceTransformers provides the Transformer to use for a given resource.
type ResourceTransformers interface {
// TransformerForResource returns the Transformer for the provided group resource.
TransformerForResource(resource schema.GroupResource) Transformer
}
// DefaultContext is a simple implementation of Context for a slice of bytes.
type DefaultContext []byte
@ -144,6 +160,7 @@ func (t *prefixTransformers) TransformFromStorage(ctx context.Context, data []by
}
}
if err := errors.Reduce(errors.NewAggregate(errs)); err != nil {
logTransformErr(ctx, err, "failed to decrypt data")
return nil, false, err
}
RecordTransformation("from_storage", "unknown", time.Since(start), t.err)
@ -157,6 +174,7 @@ func (t *prefixTransformers) TransformToStorage(ctx context.Context, data []byte
result, err := transformer.Transformer.TransformToStorage(ctx, data, dataCtx)
RecordTransformation("to_storage", string(transformer.Prefix), time.Since(start), err)
if err != nil {
logTransformErr(ctx, err, "failed to encrypt data")
return nil, err
}
prefixedData := make([]byte, len(transformer.Prefix), len(result)+len(transformer.Prefix))
@ -164,3 +182,32 @@ func (t *prefixTransformers) TransformToStorage(ctx context.Context, data []byte
prefixedData = append(prefixedData, result...)
return prefixedData, nil
}
// logTransformErr reports a failed transformation.
//
// When klog verbosity 6 is enabled, the failure is written as an info-level
// entry that also carries the request attributes (group, version, resource,
// subresource, verb, namespace, name) taken from ctx. Otherwise only err and
// message are written, as an error-level entry without request details.
//
// Both calls pass depth 1 to klog's *SDepth functions, and must stay directly
// in this function so the reported call frame does not shift.
func logTransformErr(ctx context.Context, err error, message string) {
	requestInfo := getRequestInfoFromContext(ctx)

	verbose := klog.V(6)
	if !verbose.Enabled() {
		klog.ErrorSDepth(1, err, message)
		return
	}

	details := []interface{}{
		"err", err,
		"group", requestInfo.APIGroup,
		"version", requestInfo.APIVersion,
		"resource", requestInfo.Resource,
		"subresource", requestInfo.Subresource,
		"verb", requestInfo.Verb,
		"namespace", requestInfo.Namespace,
		"name", requestInfo.Name,
	}
	verbose.InfoSDepth(1, message, details...)
}
// getRequestInfoFromContext returns the RequestInfo stored in ctx. When ctx
// carries none, it returns a pointer to an empty RequestInfo so callers can
// read its fields without a nil check.
func getRequestInfoFromContext(ctx context.Context) *genericapirequest.RequestInfo {
	info, ok := genericapirequest.RequestInfoFrom(ctx)
	if !ok {
		return &genericapirequest.RequestInfo{}
	}
	return info
}