vendor update for CSI 0.3.0

This commit is contained in:
gman
2018-07-18 16:47:22 +02:00
parent 6f484f92fc
commit 8ea659f0d5
6810 changed files with 438061 additions and 193861 deletions

View File

@ -11,6 +11,8 @@ go_test(
srcs = [
"eviction_manager_test.go",
"helpers_test.go",
"memory_threshold_notifier_test.go",
"mock_threshold_notifier_test.go",
],
embed = [":go_default_library"],
deps = [
@ -20,7 +22,7 @@ go_test(
"//pkg/kubelet/eviction/api:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/quota:go_default_library",
"//vendor/github.com/stretchr/testify/mock:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
@ -37,6 +39,7 @@ go_library(
"doc.go",
"eviction_manager.go",
"helpers.go",
"memory_threshold_notifier.go",
"types.go",
] + select({
"@io_bazel_rules_go//go/platform:android": [
@ -77,6 +80,7 @@ go_library(
importpath = "k8s.io/kubernetes/pkg/kubelet/eviction",
deps = [
"//pkg/api/v1/resource:go_default_library",
"//pkg/apis/core/v1/helper:go_default_library",
"//pkg/apis/core/v1/helper/qos:go_default_library",
"//pkg/features:go_default_library",
"//pkg/kubelet/apis/stats/v1alpha1:go_default_library",
@ -88,13 +92,13 @@ go_library(
"//pkg/kubelet/server/stats:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/scheduler/algorithm:go_default_library",
"//pkg/scheduler/util:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/clock:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//vendor/k8s.io/client-go/tools/record:go_default_library",
] + select({

View File

@ -38,8 +38,6 @@ const (
SignalImageFsInodesFree Signal = "imagefs.inodesFree"
// SignalAllocatableMemoryAvailable is amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods), in bytes.
SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available"
// SignalAllocatableNodeFsAvailable is amount of local storage available for pod allocation
SignalAllocatableNodeFsAvailable Signal = "allocatableNodeFs.available"
// SignalPIDAvailable is amount of PID available for pod allocation
SignalPIDAvailable Signal = "pid.available"
)
@ -60,13 +58,11 @@ const (
// from either above or below, never both). There is thus no reason to expose the
// operator in the Kubelet's public API. Instead, we internally map signal types to operators.
var OpForSignal = map[Signal]ThresholdOperator{
SignalMemoryAvailable: OpLessThan,
SignalNodeFsAvailable: OpLessThan,
SignalNodeFsInodesFree: OpLessThan,
SignalImageFsAvailable: OpLessThan,
SignalImageFsInodesFree: OpLessThan,
SignalAllocatableMemoryAvailable: OpLessThan,
SignalAllocatableNodeFsAvailable: OpLessThan,
SignalMemoryAvailable: OpLessThan,
SignalNodeFsAvailable: OpLessThan,
SignalNodeFsInodesFree: OpLessThan,
SignalImageFsAvailable: OpLessThan,
SignalImageFsInodesFree: OpLessThan,
}
// ThresholdValue is a value holder that abstracts literal versus percentage based quantity

View File

@ -27,14 +27,13 @@ import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/record"
apiv1resource "k8s.io/kubernetes/pkg/api/v1/resource"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/features"
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/cm"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
@ -42,6 +41,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/server/stats"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/scheduler/algorithm"
)
const (
@ -77,16 +77,18 @@ type managerImpl struct {
thresholdsFirstObservedAt thresholdsObservedAt
// records the set of thresholds that have been met (including graceperiod) but not yet resolved
thresholdsMet []evictionapi.Threshold
// resourceToRankFunc maps a resource to ranking function for that resource.
resourceToRankFunc map[v1.ResourceName]rankFunc
// resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
resourceToNodeReclaimFuncs map[v1.ResourceName]nodeReclaimFuncs
// signalToRankFunc maps a resource to ranking function for that resource.
signalToRankFunc map[evictionapi.Signal]rankFunc
// signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
// last observations from synchronize
lastObservations signalObservations
// notifiersInitialized indicates if the threshold notifiers have been initialized (i.e. synchronize() has been called once)
notifiersInitialized bool
// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
dedicatedImageFs *bool
// thresholdNotifiers is a list of memory threshold notifiers which each notify for a memory eviction threshold
thresholdNotifiers []ThresholdNotifier
// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
thresholdsLastUpdated time.Time
}
// ensure it implements the required interface
@ -115,6 +117,7 @@ func NewManager(
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
dedicatedImageFs: nil,
thresholdNotifiers: []ThresholdNotifier{},
}
return manager, manager
}
@ -137,19 +140,46 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
if notBestEffort {
return lifecycle.PodAdmitResult{Admit: true}
}
// When node has memory pressure and TaintNodesByCondition is enabled, check BestEffort Pod's toleration:
// admit it if tolerates memory pressure taint, fail for other tolerations, e.g. OutOfDisk.
if utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition) &&
v1helper.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{
Key: algorithm.TaintNodeMemoryPressure,
Effect: v1.TaintEffectNoSchedule,
}) {
return lifecycle.PodAdmitResult{Admit: true}
}
}
// reject pods when under memory pressure (if pod is best effort), or if under disk pressure.
glog.Warningf("Failed to admit pod %s - node has conditions: %v", format.Pod(attrs.Pod), m.nodeConditions)
return lifecycle.PodAdmitResult{
Admit: false,
Reason: reason,
Message: fmt.Sprintf(message, m.nodeConditions),
Reason: Reason,
Message: fmt.Sprintf(nodeLowMessageFmt, m.nodeConditions),
}
}
// Start starts the control loop to observe and response to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
thresholdHandler := func(message string) {
glog.Infof(message)
m.synchronize(diskInfoProvider, podFunc)
}
if m.config.KernelMemcgNotification {
for _, threshold := range m.config.Thresholds {
if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
if err != nil {
glog.Warningf("eviction manager: failed to create memory threshold notifier: %v", err)
} else {
go notifier.Start()
m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
}
}
}
}
// start the eviction manager monitoring
go func() {
for {
@ -184,39 +214,6 @@ func (m *managerImpl) IsUnderPIDPressure() bool {
return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
}
func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observations signalObservations, hard bool, handler thresholdNotifierHandlerFunc) error {
for _, threshold := range thresholds {
if threshold.Signal != evictionapi.SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) {
continue
}
observed, found := observations[evictionapi.SignalMemoryAvailable]
if !found {
continue
}
cgroups, err := cm.GetCgroupSubsystems()
if err != nil {
return err
}
// TODO add support for eviction from --cgroup-root
cgpath, found := cgroups.MountPoints["memory"]
if !found || len(cgpath) == 0 {
return fmt.Errorf("memory cgroup mount point not found")
}
attribute := "memory.usage_in_bytes"
quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
usageThreshold := resource.NewQuantity(observed.capacity.Value(), resource.DecimalSI)
usageThreshold.Sub(*quantity)
description := fmt.Sprintf("<%s available", formatThresholdValue(threshold.Value))
memcgThresholdNotifier, err := NewMemCGThresholdNotifier(cgpath, attribute, usageThreshold.String(), description, handler)
if err != nil {
return err
}
go memcgThresholdNotifier.Start(wait.NeverStop)
return nil
}
return nil
}
// synchronize is the main control loop that enforces eviction thresholds.
// Returns the pod that was killed, or nil if no pod was killed.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) []*v1.Pod {
@ -235,8 +232,8 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
return nil
}
m.dedicatedImageFs = &hasImageFs
m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
m.signalToRankFunc = buildSignalToRankFunc(hasImageFs)
m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
}
activePods := podFunc()
@ -247,33 +244,19 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
return nil
}
if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
m.thresholdsLastUpdated = m.clock.Now()
for _, notifier := range m.thresholdNotifiers {
if err := notifier.UpdateThreshold(summary); err != nil {
glog.Warningf("eviction manager: failed to update %s: %v", notifier.Description(), err)
}
}
}
// make observations and get a function to derive pod usage stats relative to those observations.
observations, statsFunc := makeSignalObservations(summary)
debugLogObservations("observations", observations)
// attempt to create a threshold notifier to improve eviction response time
if m.config.KernelMemcgNotification && !m.notifiersInitialized {
glog.Infof("eviction manager attempting to integrate with kernel memcg notification api")
m.notifiersInitialized = true
// start soft memory notification
err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) {
glog.Infof("soft memory eviction threshold crossed at %s", desc)
// TODO wait grace period for soft memory limit
m.synchronize(diskInfoProvider, podFunc)
})
if err != nil {
glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err)
}
// start hard memory notification
err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) {
glog.Infof("hard memory eviction threshold crossed at %s", desc)
m.synchronize(diskInfoProvider, podFunc)
})
if err != nil {
glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err)
}
}
// determine the set of thresholds met independent of grace period
thresholds = thresholdsMet(thresholds, observations, false)
debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)
@ -330,26 +313,26 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}
}
// determine the set of resources under starvation
starvedResources := getStarvedResources(thresholds)
if len(starvedResources) == 0 {
if len(thresholds) == 0 {
glog.V(3).Infof("eviction manager: no resources are starved")
return nil
}
// rank the resources to reclaim by eviction priority
sort.Sort(byEvictionPriority(starvedResources))
resourceToReclaim := starvedResources[0]
// rank the thresholds by eviction priority
sort.Sort(byEvictionPriority(thresholds))
thresholdToReclaim := thresholds[0]
resourceToReclaim, found := signalToResource[thresholdToReclaim.Signal]
if !found {
glog.V(3).Infof("eviction manager: threshold %s was crossed, but reclaim is not implemented for this threshold.", thresholdToReclaim.Signal)
return nil
}
glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
// determine if this is a soft or hard eviction associated with the resource
softEviction := isSoftEvictionThresholds(thresholds, resourceToReclaim)
// record an event about the resources we are now attempting to reclaim via eviction
m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
if m.reclaimNodeLevelResources(resourceToReclaim) {
if m.reclaimNodeLevelResources(thresholdToReclaim.Signal, resourceToReclaim) {
glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
return nil
}
@ -357,9 +340,9 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
// rank the pods for eviction
rank, ok := m.resourceToRankFunc[resourceToReclaim]
rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
if !ok {
glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
glog.Errorf("eviction manager: no ranking function for signal %s", thresholdToReclaim.Signal)
return nil
}
@ -385,30 +368,14 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
// do not evict such pods. Static pods are not re-admitted after evictions.
// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) {
continue
}
status := v1.PodStatus{
Phase: v1.PodFailed,
Message: fmt.Sprintf(message, resourceToReclaim),
Reason: reason,
}
// record that we are evicting the pod
m.recorder.Eventf(pod, v1.EventTypeWarning, reason, fmt.Sprintf(message, resourceToReclaim))
gracePeriodOverride := int64(0)
if softEviction {
if !isHardEvictionThreshold(thresholdToReclaim) {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
}
// this is a blocking call and should only return when the pod and its containers are killed.
err := m.killPodFunc(pod, status, &gracePeriodOverride)
if err != nil {
glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc)
if m.evictPod(pod, gracePeriodOverride, message, annotations) {
return []*v1.Pod{pod}
}
return []*v1.Pod{pod}
}
glog.Infof("eviction manager: unable to evict any pods from the node")
return nil
@ -416,13 +383,15 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
timeout := m.clock.NewTimer(podCleanupTimeout)
tick := m.clock.Tick(podCleanupPollFreq)
defer timeout.Stop()
ticker := m.clock.NewTicker(podCleanupPollFreq)
defer ticker.Stop()
for {
select {
case <-timeout.C():
glog.Warningf("eviction manager: timed out waiting for pods %s to be cleaned up", format.Pods(pods))
return
case <-tick:
case <-ticker.C():
for i, pod := range pods {
if !podCleanedUpFunc(pod) {
break
@ -437,8 +406,8 @@ func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods
}
// reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceName) bool {
nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim]
func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
for _, nodeReclaimFunc := range nodeReclaimFuncs {
// attempt to reclaim the pressured resource.
if err := nodeReclaimFunc(); err != nil {
@ -509,7 +478,7 @@ func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.
used := podVolumeUsed[pod.Spec.Volumes[i].Name]
if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
// the emptyDir usage exceeds the size limit, evict the pod
return m.evictPod(pod, v1.ResourceName("EmptyDir"), fmt.Sprintf("emptyDir usage exceeds the limit %q", size.String()))
return m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil)
}
}
}
@ -537,10 +506,11 @@ func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStat
return false
}
podEphemeralStorageTotalUsage.Add(podEphemeralUsage[resourceDisk])
if podEphemeralStorageTotalUsage.Cmp(podLimits[v1.ResourceEphemeralStorage]) > 0 {
podEphemeralStorageTotalUsage.Add(podEphemeralUsage[v1.ResourceEphemeralStorage])
podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
// the total usage of pod exceeds the total size limit of containers, evict the pod
return m.evictPod(pod, v1.ResourceEphemeralStorage, fmt.Sprintf("pod ephemeral local storage usage exceeds the total limit of containers %v", podLimits[v1.ResourceEphemeralStorage]))
return m.evictPod(pod, 0, fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String()), nil)
}
return false
}
@ -562,7 +532,7 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P
if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
return m.evictPod(pod, v1.ResourceEphemeralStorage, fmt.Sprintf("container's ephemeral local storage usage exceeds the limit %q", ephemeralStorageThreshold.String()))
return m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil)
}
}
@ -570,21 +540,24 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P
return false
}
func (m *managerImpl) evictPod(pod *v1.Pod, resourceName v1.ResourceName, evictMsg string) bool {
func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string) bool {
// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
// do not evict such pods. Static pods are not re-admitted after evictions.
// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) {
glog.Errorf("eviction manager: cannot evict a critical pod %s", format.Pod(pod))
glog.Errorf("eviction manager: cannot evict a critical static pod %s", format.Pod(pod))
return false
}
status := v1.PodStatus{
Phase: v1.PodFailed,
Message: fmt.Sprintf(message, resourceName),
Reason: reason,
Message: evictMsg,
Reason: Reason,
}
// record that we are evicting the pod
m.recorder.Eventf(pod, v1.EventTypeWarning, reason, evictMsg)
gracePeriod := int64(0)
err := m.killPodFunc(pod, status, &gracePeriod)
m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg)
// this is a blocking call and should only return when the pod and its containers are killed.
err := m.killPodFunc(pod, status, &gracePeriodOverride)
if err != nil {
glog.Errorf("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
} else {

View File

@ -17,6 +17,7 @@ limitations under the License.
package eviction
import (
"fmt"
"testing"
"time"
@ -1434,3 +1435,74 @@ func TestAllocatableMemoryPressure(t *testing.T) {
}
}
}
func TestUpdateMemcgThreshold(t *testing.T) {
activePodsFunc := func() []*v1.Pod {
return []*v1.Pod{}
}
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
},
PodCgroupRoot: "kubepods",
}
summaryProvider := &fakeSummaryProvider{result: makeMemoryStats("2Gi", map[*v1.Pod]statsapi.PodStats{})}
thresholdNotifier := &MockThresholdNotifier{}
thresholdNotifier.On("UpdateThreshold", summaryProvider.result).Return(nil).Twice()
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
thresholdNotifiers: []ThresholdNotifier{thresholdNotifier},
}
manager.synchronize(diskInfoProvider, activePodsFunc)
// The UpdateThreshold method should have been called once, since this is the first run.
thresholdNotifier.AssertNumberOfCalls(t, "UpdateThreshold", 1)
manager.synchronize(diskInfoProvider, activePodsFunc)
// The UpdateThreshold method should not have been called again, since not enough time has passed
thresholdNotifier.AssertNumberOfCalls(t, "UpdateThreshold", 1)
fakeClock.Step(2 * notifierRefreshInterval)
manager.synchronize(diskInfoProvider, activePodsFunc)
// The UpdateThreshold method should be called again since enough time has passed
thresholdNotifier.AssertNumberOfCalls(t, "UpdateThreshold", 2)
// new memory threshold notifier that returns an error
thresholdNotifier = &MockThresholdNotifier{}
thresholdNotifier.On("UpdateThreshold", summaryProvider.result).Return(fmt.Errorf("error updating threshold"))
thresholdNotifier.On("Description").Return("mock thresholdNotifier").Once()
manager.thresholdNotifiers = []ThresholdNotifier{thresholdNotifier}
fakeClock.Step(2 * notifierRefreshInterval)
manager.synchronize(diskInfoProvider, activePodsFunc)
// The UpdateThreshold method should be called because at least notifierRefreshInterval time has passed.
// The Description method should be called because UpdateThreshold returned an error
thresholdNotifier.AssertNumberOfCalls(t, "UpdateThreshold", 1)
thresholdNotifier.AssertNumberOfCalls(t, "Description", 1)
}

View File

@ -36,22 +36,26 @@ import (
const (
unsupportedEvictionSignal = "unsupported eviction signal %v"
// the reason reported back in status.
reason = "Evicted"
// the message associated with the reason.
message = "The node was low on resource: %v."
// disk, in bytes. internal to this module, used to account for local disk usage.
resourceDisk v1.ResourceName = "disk"
// Reason is the reason reported back in status.
Reason = "Evicted"
// nodeLowMessageFmt is the message for evictions due to resource pressure.
nodeLowMessageFmt = "The node was low on resource: %v. "
// containerMessageFmt provides additional information for containers exceeding requests
containerMessageFmt = "Container %s was using %s, which exceeds its request of %s. "
// containerEphemeralStorageMessageFmt provides additional information for containers which have exceeded their ES limit
containerEphemeralStorageMessageFmt = "Container %s exceeded its local ephemeral storage limit %q. "
// podEphemeralStorageMessageFmt provides additional information for pods which have exceeded their ES limit
podEphemeralStorageMessageFmt = "Pod ephemeral local storage usage exceeds the total limit of containers %s. "
// emptyDirMessageFmt provides additional information for empty-dir volumes which have exceeded their size limit
emptyDirMessageFmt = "Usage of EmptyDir volume %q exceeds the limit %q. "
// inodes, number. internal to this module, used to account for local disk inode consumption.
resourceInodes v1.ResourceName = "inodes"
// imagefs, in bytes. internal to this module, used to account for local image filesystem usage.
resourceImageFs v1.ResourceName = "imagefs"
// imagefs inodes, number. internal to this module, used to account for local image filesystem inodes.
resourceImageFsInodes v1.ResourceName = "imagefsInodes"
// nodefs, in bytes. internal to this module, used to account for local node root filesystem usage.
resourceNodeFs v1.ResourceName = "nodefs"
// nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes.
resourceNodeFsInodes v1.ResourceName = "nodefsInodes"
// OffendingContainersKey is the key in eviction event annotations for the list of container names which exceeded their requests
OffendingContainersKey = "offending_containers"
// OffendingContainersUsageKey is the key in eviction event annotations for the list of usage of containers which exceeded their requests
OffendingContainersUsageKey = "offending_containers_usage"
// StarvedResourceKey is the key for the starved resource in eviction event annotations
StarvedResourceKey = "starved_resource"
)
var (
@ -59,8 +63,6 @@ var (
signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType
// signalToResource maps a Signal to its associated Resource.
signalToResource map[evictionapi.Signal]v1.ResourceName
// resourceClaimToSignal maps a Resource that can be reclaimed to its associated Signal
resourceClaimToSignal map[v1.ResourceName][]evictionapi.Signal
)
func init() {
@ -78,17 +80,10 @@ func init() {
signalToResource = map[evictionapi.Signal]v1.ResourceName{}
signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs
signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes
signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs
signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceNodeFsInodes
// maps resource to signals (the following resource could be reclaimed)
resourceClaimToSignal = map[v1.ResourceName][]evictionapi.Signal{}
resourceClaimToSignal[resourceNodeFs] = []evictionapi.Signal{evictionapi.SignalNodeFsAvailable}
resourceClaimToSignal[resourceImageFs] = []evictionapi.Signal{evictionapi.SignalImageFsAvailable}
resourceClaimToSignal[resourceNodeFsInodes] = []evictionapi.Signal{evictionapi.SignalNodeFsInodesFree}
resourceClaimToSignal[resourceImageFsInodes] = []evictionapi.Signal{evictionapi.SignalImageFsInodesFree}
signalToResource[evictionapi.SignalImageFsAvailable] = v1.ResourceEphemeralStorage
signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes
signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage
signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes
}
// validSignal returns true if the signal is supported.
@ -305,10 +300,10 @@ func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity {
// inodeUsage converts inodes consumed into a resource quantity.
func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity {
if fsStats == nil || fsStats.InodesUsed == nil {
return &resource.Quantity{Format: resource.BinarySI}
return &resource.Quantity{Format: resource.DecimalSI}
}
usage := int64(*fsStats.InodesUsed)
return resource.NewQuantity(usage, resource.BinarySI)
return resource.NewQuantity(usage, resource.DecimalSI)
}
// memoryUsage converts working set into a resource quantity.
@ -338,7 +333,7 @@ func localVolumeNames(pod *v1.Pod) []string {
// containerUsage aggregates container disk usage and inode consumption for the specified stats to measure.
func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1.ResourceList {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.DecimalSI}
for _, container := range podStats.Containers {
if hasFsStatsType(statsToMeasure, fsStatsRoot) {
disk.Add(*diskUsage(container.Rootfs))
@ -350,15 +345,15 @@ func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1
}
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
v1.ResourceEphemeralStorage: disk,
resourceInodes: inodes,
}
}
// podLocalVolumeUsage aggregates pod local volumes disk usage and inode consumption for the specified stats to measure.
func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.ResourceList {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.DecimalSI}
for _, volumeName := range volumeNames {
for _, volumeStats := range podStats.VolumeStats {
if volumeStats.Name == volumeName {
@ -369,29 +364,29 @@ func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.Re
}
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
v1.ResourceEphemeralStorage: disk,
resourceInodes: inodes,
}
}
// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.DecimalSI}
containerUsageList := containerUsage(podStats, statsToMeasure)
disk.Add(containerUsageList[resourceDisk])
disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
inodes.Add(containerUsageList[resourceInodes])
if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
volumeNames := localVolumeNames(pod)
podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
disk.Add(podLocalVolumeUsageList[resourceDisk])
disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
inodes.Add(podLocalVolumeUsageList[resourceInodes])
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
v1.ResourceEphemeralStorage: disk,
resourceInodes: inodes,
}, nil
}
@ -421,21 +416,21 @@ func localEphemeralVolumeNames(pod *v1.Pod) []string {
// podLocalEphemeralStorageUsage aggregates pod local ephemeral storage usage and inode consumption for the specified stats to measure.
func podLocalEphemeralStorageUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.DecimalSI}
containerUsageList := containerUsage(podStats, statsToMeasure)
disk.Add(containerUsageList[resourceDisk])
disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
inodes.Add(containerUsageList[resourceInodes])
if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
volumeNames := localEphemeralVolumeNames(pod)
podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
disk.Add(podLocalVolumeUsageList[resourceDisk])
disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
inodes.Add(podLocalVolumeUsageList[resourceInodes])
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
v1.ResourceEphemeralStorage: disk,
resourceInodes: inodes,
}, nil
}
@ -600,7 +595,7 @@ func memory(stats statsFunc) cmpFunc {
// max(max of init container requests, sum of container requests)
func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
containerValue := resource.Quantity{Format: resource.BinarySI}
if resourceName == resourceDisk && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
if resourceName == v1.ResourceEphemeralStorage && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
// if the local storage capacity isolation feature gate is disabled, pods request 0 disk
return containerValue
}
@ -608,7 +603,7 @@ func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
switch resourceName {
case v1.ResourceMemory:
containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.Memory())
case resourceDisk:
case v1.ResourceEphemeralStorage:
containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.StorageEphemeral())
}
}
@ -619,7 +614,7 @@ func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.Memory()) < 0 {
initValue = *pod.Spec.InitContainers[i].Resources.Requests.Memory()
}
case resourceDisk:
case v1.ResourceEphemeralStorage:
if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()) < 0 {
initValue = *pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()
}
@ -676,9 +671,9 @@ func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.Resou
// adjust p1, p2 usage relative to the request (if any)
p1Disk := p1Usage[diskResource]
p2Disk := p2Usage[diskResource]
p1Request := podRequest(p1, resourceDisk)
p1Request := podRequest(p1, v1.ResourceEphemeralStorage)
p1Disk.Sub(p1Request)
p2Request := podRequest(p2, resourceDisk)
p2Request := podRequest(p2, v1.ResourceEphemeralStorage)
p2Disk.Sub(p2Request)
// prioritize evicting the pod which has the larger consumption of disk
return p2Disk.Cmp(p1Disk)
@ -711,14 +706,15 @@ func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.Resour
}
// byEvictionPriority implements sort.Interface for []v1.ResourceName.
type byEvictionPriority []v1.ResourceName
type byEvictionPriority []evictionapi.Threshold
func (a byEvictionPriority) Len() int { return len(a) }
func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// Less ranks memory before all other resources.
// Less ranks memory before all other resources, and ranks thresholds with no resource to reclaim last
func (a byEvictionPriority) Less(i, j int) bool {
return a[i] == v1.ResourceMemory
_, jSignalHasResource := signalToResource[a[j].Signal]
return a[i].Signal == evictionapi.SignalMemoryAvailable || a[i].Signal == evictionapi.SignalAllocatableMemoryAvailable || !jSignalHasResource
}
// makeSignalObservations derives observations using the specified summary provider.
@ -756,8 +752,8 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat
}
if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI),
capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI),
available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.DecimalSI),
capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.DecimalSI),
time: nodeFs.Time,
}
}
@ -772,8 +768,8 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat
}
if imageFs.InodesFree != nil && imageFs.Inodes != nil {
result[evictionapi.SignalImageFsInodesFree] = signalObservation{
available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI),
capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI),
available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI),
capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI),
time: imageFs.Time,
}
}
@ -1001,82 +997,102 @@ func compareThresholdValue(a evictionapi.ThresholdValue, b evictionapi.Threshold
return a.Percentage == b.Percentage
}
// getStarvedResources returns the set of resources that are starved based on thresholds met.
func getStarvedResources(thresholds []evictionapi.Threshold) []v1.ResourceName {
results := []v1.ResourceName{}
for _, threshold := range thresholds {
if starvedResource, found := signalToResource[threshold.Signal]; found {
results = append(results, starvedResource)
}
}
return results
}
// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds
func isSoftEvictionThresholds(thresholds []evictionapi.Threshold, starvedResource v1.ResourceName) bool {
for _, threshold := range thresholds {
if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource {
continue
}
if isHardEvictionThreshold(threshold) {
return false
}
}
return true
}
// isHardEvictionThreshold returns true if eviction should immediately occur
func isHardEvictionThreshold(threshold evictionapi.Threshold) bool {
return threshold.GracePeriod == time.Duration(0)
}
// buildResourceToRankFunc returns ranking functions associated with resources
func buildResourceToRankFunc(withImageFs bool) map[v1.ResourceName]rankFunc {
resourceToRankFunc := map[v1.ResourceName]rankFunc{
v1.ResourceMemory: rankMemoryPressure,
func isAllocatableEvictionThreshold(threshold evictionapi.Threshold) bool {
return threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable
}
// buildSignalToRankFunc returns ranking functions associated with resources
func buildSignalToRankFunc(withImageFs bool) map[evictionapi.Signal]rankFunc {
signalToRankFunc := map[evictionapi.Signal]rankFunc{
evictionapi.SignalMemoryAvailable: rankMemoryPressure,
evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure,
}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
// with an imagefs, imagefs pod rank func for eviction only includes rootfs
resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk)
resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
} else {
// without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
// since imagefs and nodefs share a common device, they share common ranking functions.
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
}
return resourceToRankFunc
return signalToRankFunc
}
// PodIsEvicted returns true if the reported pod status is due to an eviction.
func PodIsEvicted(podStatus v1.PodStatus) bool {
return podStatus.Phase == v1.PodFailed && podStatus.Reason == reason
return podStatus.Phase == v1.PodFailed && podStatus.Reason == Reason
}
// buildResourceToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildResourceToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[v1.ResourceName]nodeReclaimFuncs {
resourceToReclaimFunc := map[v1.ResourceName]nodeReclaimFuncs{}
// buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs {
signalToReclaimFunc := map[evictionapi.Signal]nodeReclaimFuncs{}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pressure should just delete logs
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{}
signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{}
signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{}
// with an imagefs, imagefs pressure should delete unused images
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
} else {
// without an imagefs, nodefs pressure should delete logs, and unused images
// since imagefs and nodefs share a common device, they share common reclaim functions
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
}
return resourceToReclaimFunc
return signalToReclaimFunc
}
// evictionMessage constructs a useful message about why an eviction occurred, and annotations to provide metadata about the eviction
func evictionMessage(resourceToReclaim v1.ResourceName, pod *v1.Pod, stats statsFunc) (message string, annotations map[string]string) {
annotations = make(map[string]string)
message = fmt.Sprintf(nodeLowMessageFmt, resourceToReclaim)
containers := []string{}
containerUsage := []string{}
podStats, ok := stats(pod)
if !ok {
return
}
for _, containerStats := range podStats.Containers {
for _, container := range pod.Spec.Containers {
if container.Name == containerStats.Name {
requests := container.Resources.Requests[resourceToReclaim]
var usage *resource.Quantity
switch resourceToReclaim {
case v1.ResourceEphemeralStorage:
if containerStats.Rootfs != nil && containerStats.Rootfs.UsedBytes != nil && containerStats.Logs != nil && containerStats.Logs.UsedBytes != nil {
usage = resource.NewQuantity(int64(*containerStats.Rootfs.UsedBytes+*containerStats.Logs.UsedBytes), resource.BinarySI)
}
case v1.ResourceMemory:
if containerStats.Memory != nil && containerStats.Memory.WorkingSetBytes != nil {
usage = resource.NewQuantity(int64(*containerStats.Memory.WorkingSetBytes), resource.BinarySI)
}
}
if usage != nil && usage.Cmp(requests) > 0 {
message += fmt.Sprintf(containerMessageFmt, container.Name, usage.String(), requests.String())
containers = append(containers, container.Name)
containerUsage = append(containerUsage, usage.String())
}
}
}
}
annotations[OffendingContainersKey] = strings.Join(containers, ",")
annotations[OffendingContainersUsageKey] = strings.Join(containerUsage, ",")
annotations[StarvedResourceKey] = string(resourceToReclaim)
return
}

View File

@ -19,6 +19,7 @@ package eviction
import (
"fmt"
"reflect"
"sort"
"testing"
"time"
@ -27,12 +28,10 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/features"
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/quota"
)
func quantityMustParse(value string) *resource.Quantity {
@ -468,7 +467,7 @@ func TestOrderedByExceedsRequestDisk(t *testing.T) {
return result, found
}
pods := []*v1.Pod{below, exceeds}
orderedBy(exceedDiskRequests(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
orderedBy(exceedDiskRequests(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods)
expected := []*v1.Pod{exceeds, below}
for i := range expected {
@ -582,7 +581,7 @@ func TestOrderedbyDisk(t *testing.T) {
return result, found
}
pods := []*v1.Pod{pod1, pod2, pod3, pod4, pod5, pod6}
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods)
expected := []*v1.Pod{pod1, pod3, pod2, pod4, pod5, pod6}
for i := range expected {
if pods[i] != expected[i] {
@ -649,7 +648,7 @@ func TestOrderedbyDiskDisableLocalStorage(t *testing.T) {
return result, found
}
pods := []*v1.Pod{pod1, pod3, pod2, pod4, pod5, pod6}
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods)
expected := []*v1.Pod{pod5, pod3, pod1, pod6, pod4, pod2}
for i := range expected {
if pods[i] != expected[i] {
@ -778,7 +777,7 @@ func TestOrderedByPriorityDisk(t *testing.T) {
pods := []*v1.Pod{pod8, pod7, pod6, pod5, pod4, pod3, pod2, pod1}
expected := []*v1.Pod{pod1, pod2, pod3, pod4, pod5, pod6, pod7, pod8}
fsStatsToMeasure := []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}
orderedBy(exceedDiskRequests(statsFn, fsStatsToMeasure, resourceDisk), priority, disk(statsFn, fsStatsToMeasure, resourceDisk)).Sort(pods)
orderedBy(exceedDiskRequests(statsFn, fsStatsToMeasure, v1.ResourceEphemeralStorage), priority, disk(statsFn, fsStatsToMeasure, v1.ResourceEphemeralStorage)).Sort(pods)
for i := range expected {
if pods[i] != expected[i] {
t.Errorf("Expected pod[%d]: %s, but got: %s", i, expected[i].Name, pods[i].Name)
@ -930,6 +929,80 @@ func TestOrderedByPriorityMemory(t *testing.T) {
}
}
func TestSortByEvictionPriority(t *testing.T) {
for _, tc := range []struct {
name string
thresholds []evictionapi.Threshold
expected []evictionapi.Threshold
}{
{
name: "empty threshold list",
thresholds: []evictionapi.Threshold{},
expected: []evictionapi.Threshold{},
},
{
name: "memory first, PID last",
thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalPIDAvailable,
},
{
Signal: evictionapi.SignalNodeFsAvailable,
},
{
Signal: evictionapi.SignalMemoryAvailable,
},
},
expected: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
},
{
Signal: evictionapi.SignalNodeFsAvailable,
},
{
Signal: evictionapi.SignalPIDAvailable,
},
},
},
{
name: "allocatable memory first, PID last",
thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalPIDAvailable,
},
{
Signal: evictionapi.SignalNodeFsAvailable,
},
{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
},
},
expected: []evictionapi.Threshold{
{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
},
{
Signal: evictionapi.SignalNodeFsAvailable,
},
{
Signal: evictionapi.SignalPIDAvailable,
},
},
},
} {
t.Run(tc.name, func(t *testing.T) {
sort.Sort(byEvictionPriority(tc.thresholds))
for i := range tc.expected {
if tc.thresholds[i].Signal != tc.expected[i].Signal {
t.Errorf("At index %d, expected threshold with signal %s, but got %s", i, tc.expected[i].Signal, tc.thresholds[i].Signal)
}
}
})
}
}
type fakeSummaryProvider struct {
result *statsapi.Summary
}
@ -1620,47 +1693,6 @@ func TestHasNodeConditions(t *testing.T) {
}
}
func TestGetStarvedResources(t *testing.T) {
testCases := map[string]struct {
inputs []evictionapi.Threshold
result []v1.ResourceName
}{
"memory.available": {
inputs: []evictionapi.Threshold{
{Signal: evictionapi.SignalMemoryAvailable},
},
result: []v1.ResourceName{v1.ResourceMemory},
},
"imagefs.available": {
inputs: []evictionapi.Threshold{
{Signal: evictionapi.SignalImageFsAvailable},
},
result: []v1.ResourceName{resourceImageFs},
},
"nodefs.available": {
inputs: []evictionapi.Threshold{
{Signal: evictionapi.SignalNodeFsAvailable},
},
result: []v1.ResourceName{resourceNodeFs},
},
}
var internalResourceNames = func(in []v1.ResourceName) []api.ResourceName {
var out []api.ResourceName
for _, name := range in {
out = append(out, api.ResourceName(name))
}
return out
}
for testName, testCase := range testCases {
actual := getStarvedResources(testCase.inputs)
actualSet := quota.ToSet(internalResourceNames(actual))
expectedSet := quota.ToSet(internalResourceNames(testCase.result))
if !actualSet.Equal(expectedSet) {
t.Errorf("Test case: %s, expected: %v, actual: %v", testName, expectedSet, actualSet)
}
}
}
func TestParsePercentage(t *testing.T) {
testCases := map[string]struct {
hasError bool

View File

@ -0,0 +1,135 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction
import (
"fmt"
"time"
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/api/resource"
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/cm"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
const (
memoryUsageAttribute = "memory.usage_in_bytes"
// this prevents constantly updating the memcg notifier if synchronize
// is run frequently.
notifierRefreshInterval = 10 * time.Second
)
type memoryThresholdNotifier struct {
threshold evictionapi.Threshold
cgroupPath string
events chan struct{}
factory NotifierFactory
handler func(string)
notifier CgroupNotifier
}
var _ ThresholdNotifier = &memoryThresholdNotifier{}
// NewMemoryThresholdNotifier creates a ThresholdNotifier which is designed to respond to the given threshold.
// UpdateThreshold must be called once before the threshold will be active.
func NewMemoryThresholdNotifier(threshold evictionapi.Threshold, cgroupRoot string, factory NotifierFactory, handler func(string)) (ThresholdNotifier, error) {
cgroups, err := cm.GetCgroupSubsystems()
if err != nil {
return nil, err
}
cgpath, found := cgroups.MountPoints["memory"]
if !found || len(cgpath) == 0 {
return nil, fmt.Errorf("memory cgroup mount point not found")
}
if isAllocatableEvictionThreshold(threshold) {
// for allocatable thresholds, point the cgroup notifier at the allocatable cgroup
cgpath += cgroupRoot
}
return &memoryThresholdNotifier{
threshold: threshold,
cgroupPath: cgpath,
events: make(chan struct{}),
handler: handler,
factory: factory,
}, nil
}
func (m *memoryThresholdNotifier) Start() {
glog.Infof("eviction manager: created %s", m.Description())
for range m.events {
m.handler(fmt.Sprintf("eviction manager: %s crossed", m.Description()))
}
}
func (m *memoryThresholdNotifier) UpdateThreshold(summary *statsapi.Summary) error {
memoryStats := summary.Node.Memory
if isAllocatableEvictionThreshold(m.threshold) {
allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods)
if err != nil {
return err
}
memoryStats = allocatableContainer.Memory
}
if memoryStats == nil || memoryStats.UsageBytes == nil || memoryStats.WorkingSetBytes == nil || memoryStats.AvailableBytes == nil {
return fmt.Errorf("summary was incomplete. Expected MemoryStats and all subfields to be non-nil, but got %+v", memoryStats)
}
// Set threshold on usage to capacity - eviction_hard + inactive_file,
// since we want to be notified when working_set = capacity - eviction_hard
inactiveFile := resource.NewQuantity(int64(*memoryStats.UsageBytes-*memoryStats.WorkingSetBytes), resource.BinarySI)
capacity := resource.NewQuantity(int64(*memoryStats.AvailableBytes+*memoryStats.WorkingSetBytes), resource.BinarySI)
evictionThresholdQuantity := evictionapi.GetThresholdQuantity(m.threshold.Value, capacity)
memcgThreshold := capacity.DeepCopy()
memcgThreshold.Sub(*evictionThresholdQuantity)
memcgThreshold.Add(*inactiveFile)
glog.V(3).Infof("eviction manager: setting %s to %s\n", m.Description(), memcgThreshold.String())
if m.notifier != nil {
m.notifier.Stop()
}
newNotifier, err := m.factory.NewCgroupNotifier(m.cgroupPath, memoryUsageAttribute, memcgThreshold.Value())
if err != nil {
return err
}
m.notifier = newNotifier
go m.notifier.Start(m.events)
return nil
}
func (m *memoryThresholdNotifier) Description() string {
var hard, allocatable string
if isHardEvictionThreshold(m.threshold) {
hard = "hard "
} else {
hard = "soft "
}
if isAllocatableEvictionThreshold(m.threshold) {
allocatable = "allocatable "
}
return fmt.Sprintf("%s%smemory eviction threshold", hard, allocatable)
}
var _ NotifierFactory = &CgroupNotifierFactory{}
// CgroupNotifierFactory knows how to make CgroupNotifiers which integrate with the kernel
type CgroupNotifierFactory struct{}
// NewCgroupNotifier implements the NotifierFactory interface
func (n *CgroupNotifierFactory) NewCgroupNotifier(path, attribute string, threshold int64) (CgroupNotifier, error) {
return NewCgroupNotifier(path, attribute, threshold)
}

View File

@ -0,0 +1,270 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction
import (
"fmt"
"strings"
"sync"
"testing"
"time"
"k8s.io/apimachinery/pkg/api/resource"
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
const testCgroupPath = "/sys/fs/cgroups/memory"
func nodeSummary(available, workingSet, usage resource.Quantity, allocatable bool) *statsapi.Summary {
availableBytes := uint64(available.Value())
workingSetBytes := uint64(workingSet.Value())
usageBytes := uint64(usage.Value())
memoryStats := statsapi.MemoryStats{
AvailableBytes: &availableBytes,
WorkingSetBytes: &workingSetBytes,
UsageBytes: &usageBytes,
}
if allocatable {
return &statsapi.Summary{
Node: statsapi.NodeStats{
SystemContainers: []statsapi.ContainerStats{
{
Name: statsapi.SystemContainerPods,
Memory: &memoryStats,
},
},
},
}
}
return &statsapi.Summary{
Node: statsapi.NodeStats{
Memory: &memoryStats,
},
}
}
func newTestMemoryThresholdNotifier(threshold evictionapi.Threshold, factory NotifierFactory, handler func(string)) *memoryThresholdNotifier {
return &memoryThresholdNotifier{
threshold: threshold,
cgroupPath: testCgroupPath,
events: make(chan struct{}),
factory: factory,
handler: handler,
}
}
func TestUpdateThreshold(t *testing.T) {
testCases := []struct {
description string
available resource.Quantity
workingSet resource.Quantity
usage resource.Quantity
evictionThreshold evictionapi.Threshold
expectedThreshold resource.Quantity
updateThresholdErr error
expectErr bool
}{
{
description: "node level threshold",
available: resource.MustParse("3Gi"),
usage: resource.MustParse("2Gi"),
workingSet: resource.MustParse("1Gi"),
evictionThreshold: evictionapi.Threshold{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
expectedThreshold: resource.MustParse("4Gi"),
updateThresholdErr: nil,
expectErr: false,
},
{
description: "allocatable threshold",
available: resource.MustParse("4Gi"),
usage: resource.MustParse("3Gi"),
workingSet: resource.MustParse("1Gi"),
evictionThreshold: evictionapi.Threshold{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
expectedThreshold: resource.MustParse("6Gi"),
updateThresholdErr: nil,
expectErr: false,
},
{
description: "error updating node level threshold",
available: resource.MustParse("3Gi"),
usage: resource.MustParse("2Gi"),
workingSet: resource.MustParse("1Gi"),
evictionThreshold: evictionapi.Threshold{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
expectedThreshold: resource.MustParse("4Gi"),
updateThresholdErr: fmt.Errorf("unexpected error"),
expectErr: true,
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
notifierFactory := &MockNotifierFactory{}
notifier := &MockCgroupNotifier{}
m := newTestMemoryThresholdNotifier(tc.evictionThreshold, notifierFactory, nil)
notifierFactory.On("NewCgroupNotifier", testCgroupPath, memoryUsageAttribute, tc.expectedThreshold.Value()).Return(notifier, tc.updateThresholdErr)
var events chan<- struct{}
events = m.events
notifier.On("Start", events).Return()
err := m.UpdateThreshold(nodeSummary(tc.available, tc.workingSet, tc.usage, isAllocatableEvictionThreshold(tc.evictionThreshold)))
if err != nil && !tc.expectErr {
t.Errorf("Unexpected error updating threshold: %v", err)
} else if err == nil && tc.expectErr {
t.Errorf("Expected error updating threshold, but got nil")
}
if !tc.expectErr {
notifierFactory.AssertNumberOfCalls(t, "NewCgroupNotifier", 1)
}
})
}
}
func TestStart(t *testing.T) {
noResources := resource.MustParse("0")
threshold := evictionapi.Threshold{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: &noResources,
},
}
notifier := &MockCgroupNotifier{}
notifierFactory := &MockNotifierFactory{}
var wg sync.WaitGroup
wg.Add(4)
m := newTestMemoryThresholdNotifier(threshold, notifierFactory, func(string) {
wg.Done()
})
notifierFactory.On("NewCgroupNotifier", testCgroupPath, memoryUsageAttribute, int64(0)).Return(notifier, nil)
var events chan<- struct{}
events = m.events
notifier.On("Start", events).Return()
notifier.On("Stop").Return()
err := m.UpdateThreshold(nodeSummary(noResources, noResources, noResources, isAllocatableEvictionThreshold(threshold)))
if err != nil {
t.Errorf("Unexpected error updating threshold: %v", err)
}
notifierFactory.AssertNumberOfCalls(t, "NewCgroupNotifier", 1)
go m.Start()
for i := 0; i < 4; i++ {
m.events <- struct{}{}
}
wg.Wait()
}
func TestThresholdDescription(t *testing.T) {
testCases := []struct {
description string
evictionThreshold evictionapi.Threshold
expectedSubstrings []string
omittedSubstrings []string
}{
{
description: "hard node level threshold",
evictionThreshold: evictionapi.Threshold{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("2Gi"),
},
},
expectedSubstrings: []string{"hard"},
omittedSubstrings: []string{"allocatable", "soft"},
},
{
description: "soft node level threshold",
evictionThreshold: evictionapi.Threshold{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("2Gi"),
},
GracePeriod: time.Minute * 2,
},
expectedSubstrings: []string{"soft"},
omittedSubstrings: []string{"allocatable", "hard"},
},
{
description: "hard allocatable threshold",
evictionThreshold: evictionapi.Threshold{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("2Gi"),
},
GracePeriod: time.Minute * 2,
},
expectedSubstrings: []string{"soft", "allocatable"},
omittedSubstrings: []string{"hard"},
},
{
description: "soft allocatable threshold",
evictionThreshold: evictionapi.Threshold{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("2Gi"),
},
},
expectedSubstrings: []string{"hard", "allocatable"},
omittedSubstrings: []string{"soft"},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
m := &memoryThresholdNotifier{
notifier: &MockCgroupNotifier{},
threshold: tc.evictionThreshold,
cgroupPath: testCgroupPath,
}
desc := m.Description()
for _, expected := range tc.expectedSubstrings {
if !strings.Contains(desc, expected) {
t.Errorf("expected description for notifier with threshold %+v to contain %s, but it did not", tc.evictionThreshold, expected)
}
}
for _, omitted := range tc.omittedSubstrings {
if strings.Contains(desc, omitted) {
t.Errorf("expected description for notifier with threshold %+v NOT to contain %s, but it did", tc.evictionThreshold, omitted)
}
}
})
}
}

View File

@ -0,0 +1,98 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eviction
import (
mock "github.com/stretchr/testify/mock"
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
)
// MockCgroupNotifier is a mock implementation of the CgroupNotifier interface
type MockCgroupNotifier struct {
mock.Mock
}
// Start implements the NotifierFactory interface
func (m *MockCgroupNotifier) Start(a0 chan<- struct{}) {
m.Called(a0)
}
// Stop implements the NotifierFactory interface
func (m *MockCgroupNotifier) Stop() {
m.Called()
}
// MockNotifierFactory is a mock of the NotifierFactory interface
type MockNotifierFactory struct {
mock.Mock
}
// NewCgroupNotifier implements the NotifierFactory interface
func (m *MockNotifierFactory) NewCgroupNotifier(a0, a1 string, a2 int64) (CgroupNotifier, error) {
ret := m.Called(a0, a1, a2)
var r0 CgroupNotifier
if rf, ok := ret.Get(0).(func(string, string, int64) CgroupNotifier); ok {
r0 = rf(a0, a1, a2)
} else {
if ret.Get(0) != nil {
r0 = ret.Get(0).(CgroupNotifier)
}
}
var r1 error
if rf, ok := ret.Get(1).(func(string, string, int64) error); ok {
r1 = rf(a0, a1, a2)
} else {
r1 = ret.Error(1)
}
return r0, r1
}
// MockThresholdNotifier is a mock implementation of the ThresholdNotifier interface
type MockThresholdNotifier struct {
mock.Mock
}
// Start implements the ThresholdNotifier interface
func (m *MockThresholdNotifier) Start() {
m.Called()
}
// UpdateThreshold implements the ThresholdNotifier interface
func (m *MockThresholdNotifier) UpdateThreshold(a0 *statsapi.Summary) error {
ret := m.Called(a0)
var r0 error
if rf, ok := ret.Get(0).(func(*statsapi.Summary) error); ok {
r0 = rf(a0)
} else {
r0 = ret.Error(0)
}
return r0
}
// Description implements the ThresholdNotifier interface
func (m *MockThresholdNotifier) Description() string {
ret := m.Called()
var r0 string
if rf, ok := ret.Get(0).(func() string); ok {
r0 = rf()
} else {
r0 = ret.String(0)
}
return r0
}

View File

@ -18,43 +18,47 @@ package eviction
import (
"fmt"
"sync"
"time"
"github.com/golang/glog"
"golang.org/x/sys/unix"
)
type memcgThresholdNotifier struct {
watchfd int
controlfd int
eventfd int
handler thresholdNotifierHandlerFunc
description string
const (
// eventSize is the number of bytes returned by a successful read from an eventfd
// see http://man7.org/linux/man-pages/man2/eventfd.2.html for more information
eventSize = 8
// numFdEvents is the number of events we can record at once.
// If EpollWait finds more than this, they will be missed.
numFdEvents = 6
)
type linuxCgroupNotifier struct {
eventfd int
epfd int
stop chan struct{}
stopLock sync.Mutex
}
var _ ThresholdNotifier = &memcgThresholdNotifier{}
var _ CgroupNotifier = &linuxCgroupNotifier{}
// NewMemCGThresholdNotifier sends notifications when a cgroup threshold
// is crossed (in either direction) for a given cgroup attribute
func NewMemCGThresholdNotifier(path, attribute, threshold, description string, handler thresholdNotifierHandlerFunc) (ThresholdNotifier, error) {
watchfd, err := unix.Open(fmt.Sprintf("%s/%s", path, attribute), unix.O_RDONLY, 0)
// NewCgroupNotifier returns a linuxCgroupNotifier, which performs cgroup control operations required
// to receive notifications from the cgroup when the threshold is crossed in either direction.
func NewCgroupNotifier(path, attribute string, threshold int64) (CgroupNotifier, error) {
var watchfd, eventfd, epfd, controlfd int
var err error
watchfd, err = unix.Open(fmt.Sprintf("%s/%s", path, attribute), unix.O_RDONLY, 0)
if err != nil {
return nil, err
}
defer func() {
if err != nil {
unix.Close(watchfd)
}
}()
controlfd, err := unix.Open(fmt.Sprintf("%s/cgroup.event_control", path), unix.O_WRONLY, 0)
defer unix.Close(watchfd)
controlfd, err = unix.Open(fmt.Sprintf("%s/cgroup.event_control", path), unix.O_WRONLY, 0)
if err != nil {
return nil, err
}
defer func() {
if err != nil {
unix.Close(controlfd)
}
}()
eventfd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
defer unix.Close(controlfd)
eventfd, err = unix.Eventfd(0, unix.EFD_CLOEXEC)
if err != nil {
return nil, err
}
@ -63,55 +67,119 @@ func NewMemCGThresholdNotifier(path, attribute, threshold, description string, h
return nil, err
}
defer func() {
// Close eventfd if we get an error later in initialization
if err != nil {
unix.Close(eventfd)
}
}()
glog.V(2).Infof("eviction: setting notification threshold to %s", threshold)
config := fmt.Sprintf("%d %d %s", eventfd, watchfd, threshold)
epfd, err = unix.EpollCreate1(0)
if err != nil {
return nil, err
}
if epfd < 0 {
err = fmt.Errorf("EpollCreate1 call failed")
return nil, err
}
defer func() {
// Close epfd if we get an error later in initialization
if err != nil {
unix.Close(epfd)
}
}()
config := fmt.Sprintf("%d %d %d", eventfd, watchfd, threshold)
_, err = unix.Write(controlfd, []byte(config))
if err != nil {
return nil, err
}
return &memcgThresholdNotifier{
watchfd: watchfd,
controlfd: controlfd,
eventfd: eventfd,
handler: handler,
description: description,
return &linuxCgroupNotifier{
eventfd: eventfd,
epfd: epfd,
stop: make(chan struct{}),
}, nil
}
func getThresholdEvents(eventfd int, eventCh chan<- struct{}, stopCh <-chan struct{}) {
func (n *linuxCgroupNotifier) Start(eventCh chan<- struct{}) {
err := unix.EpollCtl(n.epfd, unix.EPOLL_CTL_ADD, n.eventfd, &unix.EpollEvent{
Fd: int32(n.eventfd),
Events: unix.EPOLLIN,
})
if err != nil {
glog.Warningf("eviction manager: error adding epoll eventfd: %v", err)
return
}
for {
buf := make([]byte, 8)
_, err := unix.Read(eventfd, buf)
select {
case <-n.stop:
return
default:
}
event, err := wait(n.epfd, n.eventfd, notifierRefreshInterval)
if err != nil {
glog.Warningf("eviction manager: error while waiting for memcg events: %v", err)
return
} else if !event {
// Timeout on wait. This is expected if the threshold was not crossed
continue
}
// Consume the event from the eventfd
buf := make([]byte, eventSize)
_, err = unix.Read(n.eventfd, buf)
if err != nil {
glog.Warningf("eviction manager: error reading memcg events: %v", err)
return
}
select {
case eventCh <- struct{}{}:
case <-stopCh:
return
}
eventCh <- struct{}{}
}
}
func (n *memcgThresholdNotifier) Start(stopCh <-chan struct{}) {
eventCh := make(chan struct{})
go getThresholdEvents(n.eventfd, eventCh, stopCh)
for {
select {
case <-stopCh:
glog.V(2).Infof("eviction: stopping threshold notifier")
unix.Close(n.watchfd)
unix.Close(n.controlfd)
unix.Close(n.eventfd)
return
case <-eventCh:
glog.V(2).Infof("eviction: threshold crossed")
n.handler(n.description)
// wait waits up to notifierRefreshInterval for an event on the Epoll FD for the
// eventfd we are concerned about. It returns an error if one occurrs, and true
// if the consumer should read from the eventfd.
func wait(epfd, eventfd int, timeout time.Duration) (bool, error) {
events := make([]unix.EpollEvent, numFdEvents+1)
timeoutMS := int(timeout / time.Millisecond)
n, err := unix.EpollWait(epfd, events, timeoutMS)
if n == -1 {
if err == unix.EINTR {
// Interrupt, ignore the error
return false, nil
}
return false, err
}
if n == 0 {
// Timeout
return false, nil
}
if n > numFdEvents {
return false, fmt.Errorf("epoll_wait returned more events than we know what to do with")
}
for _, event := range events[:n] {
if event.Fd == int32(eventfd) {
if event.Events&unix.EPOLLHUP != 0 || event.Events&unix.EPOLLERR != 0 || event.Events&unix.EPOLLIN != 0 {
// EPOLLHUP: should not happen, but if it does, treat it as a wakeup.
// EPOLLERR: If an error is waiting on the file descriptor, we should pretend
// something is ready to read, and let unix.Read pick up the error.
// EPOLLIN: There is data to read.
return true, nil
}
}
}
// An event occurred that we don't care about.
return false, nil
}
func (n *linuxCgroupNotifier) Stop() {
n.stopLock.Lock()
defer n.stopLock.Unlock()
select {
case <-n.stop:
// the linuxCgroupNotifier is already stopped
return
default:
}
unix.Close(n.eventfd)
unix.Close(n.epfd)
close(n.stop)
}

View File

@ -18,10 +18,16 @@ limitations under the License.
package eviction
import "fmt"
import "github.com/golang/glog"
// NewMemCGThresholdNotifier sends notifications when a cgroup threshold
// is crossed (in either direction) for a given cgroup attribute
func NewMemCGThresholdNotifier(path, attribute, threshold, description string, handler thresholdNotifierHandlerFunc) (ThresholdNotifier, error) {
return nil, fmt.Errorf("threshold notification not supported")
// NewCgroupNotifier creates a cgroup notifier that does nothing because cgroups do not exist on non-linux systems.
func NewCgroupNotifier(path, attribute string, threshold int64) (CgroupNotifier, error) {
glog.V(5).Infof("cgroup notifications not supported")
return &unsupportedThresholdNotifier{}, nil
}
type unsupportedThresholdNotifier struct{}
func (*unsupportedThresholdNotifier) Start(_ chan<- struct{}) {}
func (*unsupportedThresholdNotifier) Stop() {}

View File

@ -48,6 +48,8 @@ type Config struct {
Thresholds []evictionapi.Threshold
// KernelMemcgNotification if true will integrate with the kernel memcg notification to determine if memory thresholds are crossed.
KernelMemcgNotification bool
// PodCgroupRoot is the cgroup which contains all pods.
PodCgroupRoot string
}
// Manager evaluates when an eviction threshold for node stability has been met on the node.
@ -129,10 +131,30 @@ type nodeReclaimFunc func() error
// nodeReclaimFuncs is an ordered list of nodeReclaimFunc
type nodeReclaimFuncs []nodeReclaimFunc
// thresholdNotifierHandlerFunc is a function that takes action in response to a crossed threshold
type thresholdNotifierHandlerFunc func(thresholdDescription string)
// ThresholdNotifier notifies the user when an attribute crosses a threshold value
type ThresholdNotifier interface {
Start(stopCh <-chan struct{})
// CgroupNotifier generates events from cgroup events
type CgroupNotifier interface {
// Start causes the CgroupNotifier to begin notifying on the eventCh
Start(eventCh chan<- struct{})
// Stop stops all processes and cleans up file descriptors associated with the CgroupNotifier
Stop()
}
// NotifierFactory creates CgroupNotifer
type NotifierFactory interface {
// NewCgroupNotifier creates a CgroupNotifier that creates events when the threshold
// on the attribute in the cgroup specified by the path is crossed.
NewCgroupNotifier(path, attribute string, threshold int64) (CgroupNotifier, error)
}
// ThresholdNotifier manages CgroupNotifiers based on memory eviction thresholds, and performs a function
// when memory eviction thresholds are crossed
type ThresholdNotifier interface {
// Start calls the notifier function when the CgroupNotifier notifies the ThresholdNotifier that an event occurred
Start()
// UpdateThreshold updates the memory cgroup threshold based on the metrics provided.
// Calling UpdateThreshold with recent metrics allows the ThresholdNotifier to trigger at the
// eviction threshold more accurately
UpdateThreshold(summary *statsapi.Summary) error
// Description produces a relevant string describing the Memory Threshold Notifier
Description() string
}