vendor update for CSI 0.3.0

This commit is contained in:
gman
2018-07-18 16:47:22 +02:00
parent 6f484f92fc
commit 8ea659f0d5
6810 changed files with 438061 additions and 193861 deletions

View File

@ -91,9 +91,10 @@ go_library(
"//pkg/kubelet/eviction/api:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/status:go_default_library",
"//pkg/scheduler/schedulercache:go_default_library",
"//pkg/scheduler/cache:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
@ -140,7 +141,6 @@ go_library(
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/client-go/tools/record:go_default_library",
@ -188,6 +188,7 @@ go_test(
"container_manager_linux_test.go",
"helpers_linux_test.go",
"node_container_manager_test.go",
"pod_container_manager_linux_test.go",
],
"//conditions:default": [],
}),
@ -200,6 +201,7 @@ go_test(
"//vendor/github.com/stretchr/testify/require:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
],
"//conditions:default": [],
}),

View File

@ -53,74 +53,79 @@ const (
// which is what is expected when interacting with libcontainer
var hugePageSizeList = []string{"B", "kB", "MB", "GB", "TB", "PB"}
// ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name.
// For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice
// If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form.
// For example, it will return /Burstable.slice/Burstable-pod_123_456.slice in above scenario.
func ConvertCgroupNameToSystemd(cgroupName CgroupName, outputToCgroupFs bool) string {
name := string(cgroupName)
result := ""
if name != "" && name != "/" {
parts := strings.Split(name, "/")
results := []string{}
for _, part := range parts {
// ignore leading stuff
if part == "" {
continue
}
// detect if we are given a systemd style name.
// if so, we do not want to do double encoding.
if IsSystemdStyleName(part) {
part = strings.TrimSuffix(part, systemdSuffix)
separatorIndex := strings.LastIndex(part, "-")
if separatorIndex >= 0 && separatorIndex < len(part) {
part = part[separatorIndex+1:]
}
} else {
// systemd treats - as a step in the hierarchy, we convert all - to _
part = strings.Replace(part, "-", "_", -1)
}
results = append(results, part)
var RootCgroupName = CgroupName([]string{})
// NewCgroupName composes a new cgroup name.
// Use RootCgroupName as base to start at the root.
// This function does some basic checking for invalid characters in the name.
func NewCgroupName(base CgroupName, components ...string) CgroupName {
for _, component := range components {
// Forbid using "_" in internal names. When remapping internal
// names to systemd cgroup driver, we want to remap "-" => "_",
// so we forbid "_" so that we can always reverse the mapping.
if strings.Contains(component, "/") || strings.Contains(component, "_") {
panic(fmt.Errorf("invalid character in component [%q] of CgroupName", component))
}
// each part is appended with systemd style -
result = strings.Join(results, "-")
} else {
// root converts to -
result = "-"
}
// always have a .slice suffix
if !IsSystemdStyleName(result) {
result = result + systemdSuffix
return CgroupName(append(base, components...))
}
// escapeSystemdCgroupName escapes "-", which systemd treats as a hierarchy
// separator, to "_" so a component can live inside a single slice name.
func escapeSystemdCgroupName(part string) string {
return strings.Replace(part, "-", "_", -1)
}
// unescapeSystemdCgroupName reverses escapeSystemdCgroupName.
func unescapeSystemdCgroupName(part string) string {
return strings.Replace(part, "_", "-", -1)
}
// cgroupName.ToSystemd converts the internal cgroup name to a systemd name.
// For example, the name {"kubepods", "burstable", "pod1234-abcd-5678-efgh"} becomes
// "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd_5678_efgh.slice"
// This function always expands the systemd name into the cgroupfs form. If only
// the last part is needed, use path.Base(...) on it to discard the rest.
func (cgroupName CgroupName) ToSystemd() string {
if len(cgroupName) == 0 || (len(cgroupName) == 1 && cgroupName[0] == "") {
return "/"
}
newparts := []string{}
for _, part := range cgroupName {
part = escapeSystemdCgroupName(part)
newparts = append(newparts, part)
}
// if the caller desired the result in cgroupfs format...
if outputToCgroupFs {
var err error
result, err = cgroupsystemd.ExpandSlice(result)
if err != nil {
panic(fmt.Errorf("error adapting cgroup name, input: %v, err: %v", name, err))
}
result, err := cgroupsystemd.ExpandSlice(strings.Join(newparts, "-") + systemdSuffix)
if err != nil {
// Should never happen...
panic(fmt.Errorf("error converting cgroup name [%v] to systemd format: %v", cgroupName, err))
}
return result
}
// ConvertCgroupFsNameToSystemd converts an expanded cgroupfs name to its systemd name.
// For example, it will convert test.slice/test-a.slice/test-a-b.slice to become test-a-b.slice
// NOTE: this is public right now to allow its usage in dockermanager and dockershim, ideally both those
// code areas could use something from libcontainer if we get this style function upstream.
func ConvertCgroupFsNameToSystemd(cgroupfsName string) (string, error) {
// TODO: see if libcontainer systemd implementation could use something similar, and if so, move
// this function up to that library. At that time, it would most likely do validation specific to systemd
// above and beyond the simple assumption here that the base of the path encodes the hierarchy
// per systemd convention.
return path.Base(cgroupfsName), nil
// ParseSystemdToCgroupName converts a systemd-style name (a *.slice path or
// basename) back into the internal CgroupName form.
func ParseSystemdToCgroupName(name string) CgroupName {
driverName := path.Base(name)
driverName = strings.TrimSuffix(driverName, systemdSuffix)
parts := strings.Split(driverName, "-")
result := []string{}
for _, part := range parts {
result = append(result, unescapeSystemdCgroupName(part))
}
return CgroupName(result)
}
// ToCgroupfs renders the internal cgroup name as a plain cgroupfs path.
func (cgroupName CgroupName) ToCgroupfs() string {
return "/" + path.Join(cgroupName...)
}
// ParseCgroupfsToCgroupName converts a cgroupfs path back into the internal form.
func ParseCgroupfsToCgroupName(name string) CgroupName {
components := strings.Split(strings.TrimPrefix(name, "/"), "/")
if len(components) == 1 && components[0] == "" {
components = []string{}
}
return CgroupName(components)
}
func IsSystemdStyleName(name string) bool {
if strings.HasSuffix(name, systemdSuffix) {
return true
}
return false
return strings.HasSuffix(name, systemdSuffix)
}
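Taken together, these helpers replace the old string-based conversions with a typed CgroupName and two explicit renderings. A minimal round-trip sketch (a standalone illustration, not part of the commit; the pod component is made up):

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm"
)

func main() {
	// "_" is rejected in components so that the "-" => "_" systemd
	// escaping below stays reversible.
	name := cm.NewCgroupName(cm.RootCgroupName, "kubepods", "burstable", "pod1234-abcd")

	fmt.Println(name.ToCgroupfs())
	// /kubepods/burstable/pod1234-abcd
	fmt.Println(name.ToSystemd())
	// /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd.slice

	// ParseSystemdToCgroupName inverts ToSystemd, unescaping "_" back to "-".
	back := cm.ParseSystemdToCgroupName(name.ToSystemd())
	fmt.Println(back.ToCgroupfs())
	// /kubepods/burstable/pod1234-abcd
}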
// libcontainerAdapter provides a simplified interface to libcontainer based on libcontainer type.
@ -156,34 +161,6 @@ func (l *libcontainerAdapter) newManager(cgroups *libcontainerconfigs.Cgroup, pa
return nil, fmt.Errorf("invalid cgroup manager configuration")
}
func (l *libcontainerAdapter) revertName(name string) CgroupName {
if l.cgroupManagerType != libcontainerSystemd {
return CgroupName(name)
}
return CgroupName(RevertFromSystemdToCgroupStyleName(name))
}
func RevertFromSystemdToCgroupStyleName(name string) string {
driverName, err := ConvertCgroupFsNameToSystemd(name)
if err != nil {
panic(err)
}
driverName = strings.TrimSuffix(driverName, systemdSuffix)
driverName = strings.Replace(driverName, "-", "/", -1)
driverName = strings.Replace(driverName, "_", "-", -1)
return driverName
}
// adaptName converts a CgroupName identifier to a driver specific conversion value.
// if outputToCgroupFs is true, the result is returned in the cgroupfs format rather than the driver specific form.
func (l *libcontainerAdapter) adaptName(cgroupName CgroupName, outputToCgroupFs bool) string {
if l.cgroupManagerType != libcontainerSystemd {
name := string(cgroupName)
return name
}
return ConvertCgroupNameToSystemd(cgroupName, outputToCgroupFs)
}
// CgroupSubsystems holds information about the mounted cgroup subsystems
type CgroupSubsystems struct {
// Cgroup subsystem mounts.
@ -223,13 +200,22 @@ func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
}
// Name converts the cgroup to the driver specific value in cgroupfs form.
// This always returns a valid cgroupfs path even when systemd driver is in use!
func (m *cgroupManagerImpl) Name(name CgroupName) string {
return m.adapter.adaptName(name, true)
if m.adapter.cgroupManagerType == libcontainerSystemd {
return name.ToSystemd()
} else {
return name.ToCgroupfs()
}
}
// CgroupName converts the literal cgroupfs name on the host to an internal identifier.
func (m *cgroupManagerImpl) CgroupName(name string) CgroupName {
return m.adapter.revertName(name)
if m.adapter.cgroupManagerType == libcontainerSystemd {
return ParseSystemdToCgroupName(name)
} else {
return ParseCgroupfsToCgroupName(name)
}
}
// buildCgroupPaths builds a path to each cgroup subsystem for the specified name.
@ -242,6 +228,22 @@ func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string
return cgroupPaths
}
// TODO(filbranden): This logic belongs in libcontainer/cgroup/systemd instead.
// It should take a libcontainerconfigs.Cgroup.Path field (rather than Name and Parent)
// and split it appropriately, using essentially the logic below.
// This was done for cgroupfs in opencontainers/runc#497 but a counterpart
// for systemd was never introduced.
func updateSystemdCgroupInfo(cgroupConfig *libcontainerconfigs.Cgroup, cgroupName CgroupName) {
dir, base := path.Split(cgroupName.ToSystemd())
if dir == "/" {
dir = "-.slice"
} else {
dir = path.Base(dir)
}
cgroupConfig.Parent = dir
cgroupConfig.Name = base
}
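The Parent/Name split above is subtle enough to deserve a standalone sketch (this mirrors updateSystemdCgroupInfo with plain strings; an illustration, not the commit's code):

package main

import (
	"fmt"
	"path"
)

// splitSystemdPath derives the parent slice and leaf slice that libcontainer's
// systemd driver expects from a full systemd cgroup path.
func splitSystemdPath(systemdPath string) (parent, name string) {
	dir, base := path.Split(systemdPath)
	if dir == "/" {
		parent = "-.slice" // systemd's name for the root slice
	} else {
		parent = path.Base(dir)
	}
	return parent, base
}

func main() {
	fmt.Println(splitSystemdPath("/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod123.slice"))
	// kubepods-burstable.slice kubepods-burstable-pod123.slice
	fmt.Println(splitSystemdPath("/kubepods.slice"))
	// -.slice kubepods.slice
}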
// Exists checks if all subsystem cgroups already exist
func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
// Get map of all cgroup paths on the system for the particular cgroup
@ -278,23 +280,13 @@ func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error {
cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name)
// we take the location in traditional cgroupfs format.
abstractCgroupFsName := string(cgroupConfig.Name)
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
abstractName := CgroupName(path.Base(abstractCgroupFsName))
driverParent := m.adapter.adaptName(abstractParent, false)
driverName := m.adapter.adaptName(abstractName, false)
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{}
// libcontainer consumes a different field and expects a different syntax
// depending on the cgroup driver in use, so we need this conditional here.
if m.adapter.cgroupManagerType == libcontainerSystemd {
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
}
// Initialize libcontainer's cgroup config with driver specific naming.
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
Name: driverName,
Parent: driverParent,
updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name)
} else {
libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
}
manager, err := m.adapter.newManager(libcontainerCgroupConfig, cgroupPaths)
@ -418,26 +410,17 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name)
// we take the location in traditional cgroupfs format.
abstractCgroupFsName := string(cgroupConfig.Name)
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
abstractName := CgroupName(path.Base(abstractCgroupFsName))
driverParent := m.adapter.adaptName(abstractParent, false)
driverName := m.adapter.adaptName(abstractName, false)
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
if m.adapter.cgroupManagerType == libcontainerSystemd {
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
}
// Initialize libcontainer's cgroup config
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
Name: driverName,
Parent: driverParent,
Resources: resources,
Paths: cgroupPaths,
}
// libcontainer consumes a different field and expects a different syntax
// depending on the cgroup driver in use, so we need this conditional here.
if m.adapter.cgroupManagerType == libcontainerSystemd {
updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name)
} else {
libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters.PodPidsLimit != nil {
libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit
@ -456,25 +439,18 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error {
metrics.CgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start))
}()
// we take the location in traditional cgroupfs format.
abstractCgroupFsName := string(cgroupConfig.Name)
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
abstractName := CgroupName(path.Base(abstractCgroupFsName))
driverParent := m.adapter.adaptName(abstractParent, false)
driverName := m.adapter.adaptName(abstractName, false)
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
if m.adapter.cgroupManagerType == libcontainerSystemd {
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
}
resources := m.toResources(cgroupConfig.ResourceParameters)
// Initialize libcontainer's cgroup config with driver specific naming.
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
Name: driverName,
Parent: driverParent,
Resources: resources,
}
// libcontainer consumes a different field and expects a different syntax
// depending on the cgroup driver in use, so we need this conditional here.
if m.adapter.cgroupManagerType == libcontainerSystemd {
updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name)
} else {
libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters.PodPidsLimit != nil {
libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit

View File

@ -18,119 +18,105 @@ limitations under the License.
package cm
import "testing"
import (
"path"
"testing"
)
func TestLibcontainerAdapterAdaptToSystemd(t *testing.T) {
func TestCgroupNameToSystemdBasename(t *testing.T) {
testCases := []struct {
input string
input CgroupName
expected string
}{
{
input: "/",
expected: "-.slice",
input: RootCgroupName,
expected: "/",
},
{
input: "/system.slice",
input: NewCgroupName(RootCgroupName, "system"),
expected: "system.slice",
},
{
input: "/system.slice/Burstable",
input: NewCgroupName(RootCgroupName, "system", "Burstable"),
expected: "system-Burstable.slice",
},
{
input: "/Burstable.slice/Burstable-pod_123.slice",
input: NewCgroupName(RootCgroupName, "Burstable", "pod-123"),
expected: "Burstable-pod_123.slice",
},
{
input: "/test.slice/test-a.slice/test-a-b.slice",
input: NewCgroupName(RootCgroupName, "test", "a", "b"),
expected: "test-a-b.slice",
},
{
input: "/test.slice/test-a.slice/test-a-b.slice/Burstable",
input: NewCgroupName(RootCgroupName, "test", "a", "b", "Burstable"),
expected: "test-a-b-Burstable.slice",
},
{
input: "/Burstable",
input: NewCgroupName(RootCgroupName, "Burstable"),
expected: "Burstable.slice",
},
{
input: "/Burstable/pod_123",
expected: "Burstable-pod_123.slice",
},
{
input: "/BestEffort/pod_6c1a4e95-6bb6-11e6-bc26-28d2444e470d",
input: NewCgroupName(RootCgroupName, "BestEffort", "pod-6c1a4e95-6bb6-11e6-bc26-28d2444e470d"),
expected: "BestEffort-pod_6c1a4e95_6bb6_11e6_bc26_28d2444e470d.slice",
},
}
for _, testCase := range testCases {
f := newLibcontainerAdapter(libcontainerSystemd)
if actual := f.adaptName(CgroupName(testCase.input), false); actual != testCase.expected {
if actual := path.Base(testCase.input.ToSystemd()); actual != testCase.expected {
t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v", testCase.input, testCase.expected, actual)
}
}
}
func TestLibcontainerAdapterAdaptToSystemdAsCgroupFs(t *testing.T) {
func TestCgroupNameToSystemd(t *testing.T) {
testCases := []struct {
input string
input CgroupName
expected string
}{
{
input: "/",
input: RootCgroupName,
expected: "/",
},
{
input: "/Burstable",
input: NewCgroupName(RootCgroupName, "Burstable"),
expected: "/Burstable.slice",
},
{
input: "/Burstable/pod_123",
input: NewCgroupName(RootCgroupName, "Burstable", "pod-123"),
expected: "/Burstable.slice/Burstable-pod_123.slice",
},
{
input: "/BestEffort/pod_6c1a4e95-6bb6-11e6-bc26-28d2444e470d",
input: NewCgroupName(RootCgroupName, "BestEffort", "pod-6c1a4e95-6bb6-11e6-bc26-28d2444e470d"),
expected: "/BestEffort.slice/BestEffort-pod_6c1a4e95_6bb6_11e6_bc26_28d2444e470d.slice",
},
{
input: "/kubepods",
input: NewCgroupName(RootCgroupName, "kubepods"),
expected: "/kubepods.slice",
},
}
for _, testCase := range testCases {
f := newLibcontainerAdapter(libcontainerSystemd)
if actual := f.adaptName(CgroupName(testCase.input), true); actual != testCase.expected {
if actual := testCase.input.ToSystemd(); actual != testCase.expected {
t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v", testCase.input, testCase.expected, actual)
}
}
}
func TestLibcontainerAdapterNotAdaptToSystemd(t *testing.T) {
cgroupfs := newLibcontainerAdapter(libcontainerCgroupfs)
otherAdapter := newLibcontainerAdapter(libcontainerCgroupManagerType("test"))
func TestCgroupNameToCgroupfs(t *testing.T) {
testCases := []struct {
input string
input CgroupName
expected string
}{
{
input: "/",
input: RootCgroupName,
expected: "/",
},
{
input: "/Burstable",
input: NewCgroupName(RootCgroupName, "Burstable"),
expected: "/Burstable",
},
{
input: "",
expected: "",
},
}
for _, testCase := range testCases {
if actual := cgroupfs.adaptName(CgroupName(testCase.input), true); actual != testCase.expected {
t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v", testCase.input, testCase.expected, actual)
}
if actual := otherAdapter.adaptName(CgroupName(testCase.input), true); actual != testCase.expected {
if actual := testCase.input.ToCgroupfs(); actual != testCase.expected {
t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v", testCase.input, testCase.expected, actual)
}
}

View File

@ -63,25 +63,35 @@ func (m *unsupportedCgroupManager) Pids(_ CgroupName) []int {
}
func (m *unsupportedCgroupManager) CgroupName(name string) CgroupName {
return ""
return CgroupName([]string{})
}
func (m *unsupportedCgroupManager) ReduceCPULimits(cgroupName CgroupName) error {
return nil
}
func ConvertCgroupFsNameToSystemd(cgroupfsName string) (string, error) {
return "", nil
var RootCgroupName = CgroupName([]string{})
func NewCgroupName(base CgroupName, components ...string) CgroupName {
return CgroupName(append(base, components...))
}
func ConvertCgroupNameToSystemd(cgroupName CgroupName, outputToCgroupFs bool) string {
func (cgroupName CgroupName) ToSystemd() string {
return ""
}
func RevertFromSystemdToCgroupStyleName(name string) string {
func ParseSystemdToCgroupName(name string) CgroupName {
return nil
}
func (cgroupName CgroupName) ToCgroupfs() string {
return ""
}
func ParseCgroupfsToCgroupName(name string) CgroupName {
return nil
}
func IsSystemdStyleName(name string) bool {
return false
}

View File

@ -28,7 +28,7 @@ import (
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
"fmt"
"strconv"
@ -107,10 +107,11 @@ type NodeConfig struct {
KubeletRootDir string
ProtectKernelDefaults bool
NodeAllocatableConfig
ExperimentalQOSReserved map[v1.ResourceName]int64
QOSReserved map[v1.ResourceName]int64
ExperimentalCPUManagerPolicy string
ExperimentalCPUManagerReconcilePeriod time.Duration
ExperimentalPodPidsLimit int64
EnforceCPULimits bool
}
type NodeAllocatableConfig struct {

View File

@ -52,7 +52,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
utilfile "k8s.io/kubernetes/pkg/util/file"
"k8s.io/kubernetes/pkg/util/mount"
"k8s.io/kubernetes/pkg/util/oom"
@ -123,7 +123,7 @@ type containerManagerImpl struct {
capacity v1.ResourceList
// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
// This path includes a top level container for enforcing Node Allocatable.
cgroupRoot string
cgroupRoot CgroupName
// Event recorder interface.
recorder record.EventRecorder
// Interface for QoS cgroup management
@ -223,7 +223,8 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
}
capacity = cadvisor.CapacityFromMachineInfo(machineInfo)
cgroupRoot := nodeConfig.CgroupRoot
// Turn CgroupRoot from a string (in cgroupfs path format) to internal CgroupName
cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot)
cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
// Check if Cgroup-root actually exists on the node
if nodeConfig.CgroupsPerQOS {
@ -236,13 +237,13 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
// of note, we always use the cgroupfs driver when performing this check since
// the input is provided in that format.
// this is important because we do not want any name conversion to occur.
if !cgroupManager.Exists(CgroupName(cgroupRoot)) {
if !cgroupManager.Exists(cgroupRoot) {
return nil, fmt.Errorf("invalid configuration: cgroup-root %q doesn't exist: %v", cgroupRoot, err)
}
glog.Infof("container manager verified user specified cgroup-root exists: %v", cgroupRoot)
// Include the top level cgroup for enforcing node allocatable into cgroup-root.
// This way, all sub modules can avoid having to understand the concept of node allocatable.
cgroupRoot = path.Join(cgroupRoot, defaultNodeAllocatableCgroupName)
cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName)
}
glog.Infof("Creating Container Manager object based on Node Config: %+v", nodeConfig)
@ -301,10 +302,11 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
subsystems: cm.subsystems,
cgroupManager: cm.cgroupManager,
podPidsLimit: cm.ExperimentalPodPidsLimit,
enforceCPULimits: cm.EnforceCPULimits,
}
}
return &podContainerManagerNoop{
cgroupRoot: CgroupName(cm.cgroupRoot),
cgroupRoot: cm.cgroupRoot,
}
}
@ -502,7 +504,7 @@ func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
// GetPodCgroupRoot returns the literal cgroupfs value for the cgroup containing all pods.
func (cm *containerManagerImpl) GetPodCgroupRoot() string {
return cm.cgroupManager.Name(CgroupName(cm.cgroupRoot))
return cm.cgroupManager.Name(cm.cgroupRoot)
}
func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
@ -858,21 +860,6 @@ func isKernelPid(pid int) bool {
return err != nil
}
// Helper for getting the docker API version.
func getDockerAPIVersion(cadvisor cadvisor.Interface) *utilversion.Version {
versions, err := cadvisor.VersionInfo()
if err != nil {
glog.Errorf("Error requesting cAdvisor VersionInfo: %v", err)
return utilversion.MustParseSemantic("0.0")
}
dockerAPIVersion, err := utilversion.ParseGeneric(versions.DockerAPIVersion)
if err != nil {
glog.Errorf("Error parsing docker version %q: %v", versions.DockerVersion, err)
return utilversion.MustParseSemantic("0.0")
}
return dockerAPIVersion
}
func (cm *containerManagerImpl) GetCapacity() v1.ResourceList {
return cm.capacity
}

View File

@ -19,6 +19,7 @@ limitations under the License.
package cm
import (
"errors"
"fmt"
"io/ioutil"
"os"
@ -91,8 +92,36 @@ func (mi *fakeMountInterface) MakeFile(pathname string) error {
return nil
}
func (mi *fakeMountInterface) ExistsPath(pathname string) bool {
return true
func (mi *fakeMountInterface) ExistsPath(pathname string) (bool, error) {
return true, errors.New("not implemented")
}
func (mi *fakeMountInterface) PrepareSafeSubpath(subPath mount.Subpath) (newHostPath string, cleanupAction func(), err error) {
return "", nil, nil
}
func (mi *fakeMountInterface) CleanSubPaths(_, _ string) error {
return nil
}
func (mi *fakeMountInterface) SafeMakeDir(_, _ string, _ os.FileMode) error {
return nil
}
func (mi *fakeMountInterface) GetMountRefs(pathname string) ([]string, error) {
return nil, errors.New("not implemented")
}
func (mi *fakeMountInterface) GetFSGroup(pathname string) (int64, error) {
return -1, errors.New("not implemented")
}
func (mi *fakeMountInterface) GetSELinuxSupport(pathname string) (bool, error) {
return false, errors.New("not implemented")
}
func (mi *fakeMountInterface) GetMode(pathname string) (os.FileMode, error) {
return 0, errors.New("not implemented")
}
func fakeContainerMgrMountInt() mount.Interface {

View File

@ -20,13 +20,14 @@ import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
)
type containerManagerStub struct{}
@ -67,7 +68,12 @@ func (cm *containerManagerStub) GetNodeAllocatableReservation() v1.ResourceList
}
func (cm *containerManagerStub) GetCapacity() v1.ResourceList {
return nil
c := v1.ResourceList{
v1.ResourceEphemeralStorage: *resource.NewQuantity(
int64(0),
resource.BinarySI),
}
return c
}
func (cm *containerManagerStub) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {

View File

@ -18,7 +18,6 @@ package state
import (
"bytes"
"flag"
"fmt"
"io"
"io/ioutil"
@ -69,9 +68,6 @@ func stderrCapture(t *testing.T, f func() State) (bytes.Buffer, State) {
}
func TestFileStateTryRestore(t *testing.T) {
flag.Set("alsologtostderr", "true")
flag.Parse()
testCases := []struct {
description string
stateFileContent string
@ -292,9 +288,6 @@ func TestFileStateTryRestorePanic(t *testing.T) {
}
func TestUpdateStateFile(t *testing.T) {
flag.Set("alsologtostderr", "true")
flag.Parse()
testCases := []struct {
description string
expErr string

View File

@ -56,9 +56,6 @@ func (s *stateMemory) GetDefaultCPUSet() cpuset.CPUSet {
}
func (s *stateMemory) GetCPUSetOrDefault(containerID string) cpuset.CPUSet {
s.RLock()
defer s.RUnlock()
if res, ok := s.GetCPUSet(containerID); ok {
return res
}

View File

@ -15,15 +15,15 @@ go_library(
deps = [
"//pkg/apis/core/v1/helper:go_default_library",
"//pkg/kubelet/apis/deviceplugin/v1beta1:go_default_library",
"//pkg/kubelet/checkpointmanager:go_default_library",
"//pkg/kubelet/checkpointmanager/errors:go_default_library",
"//pkg/kubelet/cm/devicemanager/checkpoint:go_default_library",
"//pkg/kubelet/config:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//pkg/kubelet/util/store:go_default_library",
"//pkg/scheduler/schedulercache:go_default_library",
"//pkg/util/filesystem:go_default_library",
"//pkg/scheduler/cache:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/golang.org/x/net/context:go_default_library",
"//vendor/google.golang.org/grpc:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
@ -40,10 +40,9 @@ go_test(
embed = [":go_default_library"],
deps = [
"//pkg/kubelet/apis/deviceplugin/v1beta1:go_default_library",
"//pkg/kubelet/checkpointmanager:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/util/store:go_default_library",
"//pkg/scheduler/schedulercache:go_default_library",
"//pkg/util/filesystem:go_default_library",
"//pkg/scheduler/cache:go_default_library",
"//vendor/github.com/stretchr/testify/assert:go_default_library",
"//vendor/github.com/stretchr/testify/require:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
@ -63,7 +62,10 @@ filegroup(
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
srcs = [
":package-srcs",
"//pkg/kubelet/cm/devicemanager/checkpoint:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -0,0 +1,26 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["checkpoint.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint",
visibility = ["//visibility:public"],
deps = [
"//pkg/kubelet/checkpointmanager:go_default_library",
"//pkg/kubelet/checkpointmanager/checksum:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -0,0 +1,81 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checkpoint
import (
"encoding/json"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)
type DeviceManagerCheckpoint interface {
checkpointmanager.Checkpoint
GetData() ([]PodDevicesEntry, map[string][]string)
}
type PodDevicesEntry struct {
PodUID string
ContainerName string
ResourceName string
DeviceIDs []string
AllocResp []byte
}
// checkpointData struct is used to store pod to device allocation information
// in a checkpoint file.
// TODO: add version control when we need to change checkpoint format.
type checkpointData struct {
PodDeviceEntries []PodDevicesEntry
RegisteredDevices map[string][]string
}
type Data struct {
Data checkpointData
Checksum checksum.Checksum
}
// New returns an instance of DeviceManagerCheckpoint
func New(devEntries []PodDevicesEntry,
devices map[string][]string) DeviceManagerCheckpoint {
return &Data{
Data: checkpointData{
PodDeviceEntries: devEntries,
RegisteredDevices: devices,
},
}
}
// MarshalCheckpoint computes the checksum of the data and returns the marshalled checkpoint
func (cp *Data) MarshalCheckpoint() ([]byte, error) {
cp.Checksum = checksum.New(cp.Data)
return json.Marshal(*cp)
}
// UnmarshalCheckpoint unmarshals blob into the checkpoint data
func (cp *Data) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// VerifyChecksum verifies that the stored checksum matches the checksum calculated from the data
func (cp *Data) VerifyChecksum() error {
return cp.Checksum.Verify(cp.Data)
}
// GetData returns the pod device entries and the registered devices.
func (cp *Data) GetData() ([]PodDevicesEntry, map[string][]string) {
return cp.Data.PodDeviceEntries, cp.Data.RegisteredDevices
}
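A minimal usage sketch of the package as a whole (values are illustrative; error handling is condensed to panics):

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
)

func main() {
	entries := []checkpoint.PodDevicesEntry{{
		PodUID:        "pod-uid-1", // made-up values
		ContainerName: "ctr",
		ResourceName:  "vendor.com/gpu",
		DeviceIDs:     []string{"dev0"},
	}}
	registered := map[string][]string{"vendor.com/gpu": {"dev0", "dev1"}}

	// MarshalCheckpoint embeds a checksum over the payload.
	blob, err := checkpoint.New(entries, registered).MarshalCheckpoint()
	if err != nil {
		panic(err)
	}

	// After a reload, VerifyChecksum catches a corrupted checkpoint file.
	restored := checkpoint.New(nil, nil)
	if err := restored.UnmarshalCheckpoint(blob); err != nil {
		panic(err)
	}
	if err := restored.VerifyChecksum(); err != nil {
		panic(err)
	}
	pods, devs := restored.GetData()
	fmt.Println(len(pods), len(devs)) // 1 1
}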

View File

@ -17,13 +17,14 @@ limitations under the License.
package devicemanager
import (
"context"
"log"
"net"
"os"
"path"
"sync"
"time"
"golang.org/x/net/context"
"google.golang.org/grpc"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
@ -35,6 +36,7 @@ type Stub struct {
socket string
stop chan interface{}
wg sync.WaitGroup
update chan []*pluginapi.Device
server *grpc.Server
@ -70,7 +72,8 @@ func (m *Stub) SetAllocFunc(f stubAllocFunc) {
m.allocFunc = f
}
// Start starts the gRPC server of the device plugin
// Start starts the gRPC server of the device plugin. Can only
// be called once.
func (m *Stub) Start() error {
err := m.cleanup()
if err != nil {
@ -82,10 +85,14 @@ func (m *Stub) Start() error {
return err
}
m.wg.Add(1)
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterDevicePluginServer(m.server, m)
go m.server.Serve(sock)
go func() {
defer m.wg.Done()
m.server.Serve(sock)
}()
_, conn, err := dial(m.socket)
if err != nil {
return err
@ -96,18 +103,27 @@ func (m *Stub) Start() error {
return nil
}
// Stop stops the gRPC server
// Stop stops the gRPC server. Can be called without a prior Start
// and more than once. Not safe to be called concurrently by different
// goroutines!
func (m *Stub) Stop() error {
if m.server == nil {
return nil
}
m.server.Stop()
close(m.stop)
m.wg.Wait()
m.server = nil
close(m.stop) // This prevents re-starting the server.
return m.cleanup()
}
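A hedged sketch of the relaxed contract (socket path and device list are illustrative; assumes the NewDevicePluginStub constructor used elsewhere in this package):

devs := []*pluginapi.Device{{ID: "Dev1", Health: pluginapi.Healthy}}
p := NewDevicePluginStub(devs, "/tmp/device-plugin-stub.sock")
_ = p.Stop() // no prior Start: m.server is nil, returns nil
if err := p.Start(); err != nil {
	// handle the error
}
_ = p.Stop() // stops the server and waits on m.wg
_ = p.Stop() // second call is a no-op: m.server was reset to nil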
// Register registers the device plugin for the given resourceName with Kubelet.
func (m *Stub) Register(kubeletEndpoint, resourceName string, preStartContainerFlag bool) error {
conn, err := grpc.Dial(kubeletEndpoint, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithTimeout(10*time.Second),
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
conn, err := grpc.DialContext(ctx, kubeletEndpoint, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
}))
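Both this Register path and the manager-side dial helper now bound the connection attempt with a context deadline rather than the deprecated grpc.WithTimeout option. A self-contained sketch of the pattern against the vendored gRPC API of this vintage:

package main

import (
	"context"
	"net"
	"time"

	"google.golang.org/grpc"
)

// dialUnix connects to a local unix-domain gRPC endpoint, failing within
// the context deadline instead of relying on grpc.WithTimeout.
func dialUnix(socketPath string) (*grpc.ClientConn, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	return grpc.DialContext(ctx, socketPath,
		grpc.WithInsecure(), // plugin sockets are local; no TLS
		grpc.WithBlock(),    // make DialContext wait, so the deadline applies
		grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
			return net.DialTimeout("unix", addr, timeout)
		}))
}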

View File

@ -17,13 +17,13 @@ limitations under the License.
package devicemanager
import (
"context"
"fmt"
"net"
"sync"
"time"
"github.com/golang/glog"
"golang.org/x/net/context"
"google.golang.org/grpc"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
@ -39,6 +39,8 @@ type endpoint interface {
preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error)
getDevices() []pluginapi.Device
callback(resourceName string, added, updated, deleted []pluginapi.Device)
isStopped() bool
stopGracePeriodExpired() bool
}
type endpointImpl struct {
@ -47,6 +49,7 @@ type endpointImpl struct {
socketPath string
resourceName string
stopTime time.Time
devices map[string]pluginapi.Device
mutex sync.Mutex
@ -55,6 +58,7 @@ type endpointImpl struct {
}
// newEndpoint creates a new endpoint for the given resourceName.
// This is to be used during normal device plugin registration.
func newEndpointImpl(socketPath, resourceName string, devices map[string]pluginapi.Device, callback monitorCallback) (*endpointImpl, error) {
client, c, err := dial(socketPath)
if err != nil {
@ -74,6 +78,16 @@ func newEndpointImpl(socketPath, resourceName string, devices map[string]plugina
}, nil
}
// newStoppedEndpointImpl creates a new endpoint for the given resourceName with stopTime set.
// This is to be used during Kubelet restart, before the actual device plugin re-registers.
func newStoppedEndpointImpl(resourceName string, devices map[string]pluginapi.Device) *endpointImpl {
return &endpointImpl{
resourceName: resourceName,
devices: devices,
stopTime: time.Now(),
}
}
func (e *endpointImpl) callback(resourceName string, added, updated, deleted []pluginapi.Device) {
e.cb(resourceName, added, updated, deleted)
}
@ -176,8 +190,30 @@ func (e *endpointImpl) run() {
}
}
// isStopped reports whether stop() has been called on this endpoint.
func (e *endpointImpl) isStopped() bool {
e.mutex.Lock()
defer e.mutex.Unlock()
return !e.stopTime.IsZero()
}
// stopGracePeriodExpired reports whether the endpoint has been stopped for
// longer than endpointStopGracePeriod.
func (e *endpointImpl) stopGracePeriodExpired() bool {
e.mutex.Lock()
defer e.mutex.Unlock()
return !e.stopTime.IsZero() && time.Since(e.stopTime) > endpointStopGracePeriod
}
// used for testing only
func (e *endpointImpl) setStopTime(t time.Time) {
e.mutex.Lock()
defer e.mutex.Unlock()
e.stopTime = t
}
// allocate issues Allocate gRPC call to the device plugin.
func (e *endpointImpl) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
return e.client.Allocate(context.Background(), &pluginapi.AllocateRequest{
ContainerRequests: []*pluginapi.ContainerAllocateRequest{
{DevicesIDs: devs},
@ -187,6 +223,9 @@ func (e *endpointImpl) allocate(devs []string) (*pluginapi.AllocateResponse, err
// preStartContainer issues PreStartContainer gRPC call to the device plugin.
func (e *endpointImpl) preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
ctx, cancel := context.WithTimeout(context.Background(), pluginapi.KubeletPreStartContainerRPCTimeoutInSecs*time.Second)
defer cancel()
return e.client.PreStartContainer(ctx, &pluginapi.PreStartContainerRequest{
@ -195,13 +234,20 @@ func (e *endpointImpl) preStartContainer(devs []string) (*pluginapi.PreStartCont
}
func (e *endpointImpl) stop() {
e.clientConn.Close()
e.mutex.Lock()
defer e.mutex.Unlock()
if e.clientConn != nil {
e.clientConn.Close()
}
e.stopTime = time.Now()
}
// dial establishes the gRPC communication with the registered device plugin. https://godoc.org/google.golang.org/grpc#Dial
func dial(unixSocketPath string) (pluginapi.DevicePluginClient, *grpc.ClientConn, error) {
c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithTimeout(10*time.Second),
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
c, err := grpc.DialContext(ctx, unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
}),

View File

@ -17,7 +17,7 @@ limitations under the License.
package devicemanager
import (
"encoding/json"
"context"
"fmt"
"net"
"os"
@ -26,7 +26,6 @@ import (
"time"
"github.com/golang/glog"
"golang.org/x/net/context"
"google.golang.org/grpc"
"k8s.io/api/core/v1"
@ -34,12 +33,13 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
@ -59,6 +59,7 @@ type ManagerImpl struct {
mutex sync.Mutex
server *grpc.Server
wg sync.WaitGroup
// activePods is a method for listing active pods on the node
// so the amount of pluginResources requested by existing pods
@ -83,9 +84,9 @@ type ManagerImpl struct {
allocatedDevices map[string]sets.String
// podDevices contains pod to allocated device mapping.
podDevices podDevices
store utilstore.Store
pluginOpts map[string]*pluginapi.DevicePluginOptions
podDevices podDevices
pluginOpts map[string]*pluginapi.DevicePluginOptions
checkpointManager checkpointmanager.CheckpointManager
}
type sourcesReadyStub struct{}
@ -122,11 +123,11 @@ func newManagerImpl(socketPath string) (*ManagerImpl, error) {
// Before that, initializes them to perform no-op operations.
manager.activePods = func() []*v1.Pod { return []*v1.Pod{} }
manager.sourcesReady = &sourcesReadyStub{}
var err error
manager.store, err = utilstore.NewFileStore(dir, utilfs.DefaultFs{})
checkpointManager, err := checkpointmanager.NewCheckpointManager(dir)
if err != nil {
return nil, fmt.Errorf("failed to initialize device plugin checkpointing store: %+v", err)
return nil, fmt.Errorf("failed to initialize checkpoint manager: %+v", err)
}
manager.checkpointManager = checkpointManager
return manager, nil
}
@ -188,11 +189,6 @@ func (m *ManagerImpl) removeContents(dir string) error {
return nil
}
const (
// kubeletDeviceManagerCheckpoint is the file name of device plugin checkpoint
kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint"
)
// checkpointFile returns device plugin checkpoint file path.
func (m *ManagerImpl) checkpointFile() string {
return filepath.Join(m.socketdir, kubeletDeviceManagerCheckpoint)
@ -229,10 +225,14 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
return err
}
m.wg.Add(1)
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterRegistrationServer(m.server, m)
go m.server.Serve(s)
go func() {
defer m.wg.Done()
m.server.Serve(s)
}()
glog.V(2).Infof("Serving device plugin registration server on %q", socketPath)
@ -318,6 +318,8 @@ func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest
}
// Stop is the function that can stop the gRPC server.
// Can be called concurrently, more than once, and is safe to call
// without a prior Start.
func (m *ManagerImpl) Stop() error {
m.mutex.Lock()
defer m.mutex.Unlock()
@ -325,7 +327,12 @@ func (m *ManagerImpl) Stop() error {
e.stop()
}
if m.server == nil {
return nil
}
m.server.Stop()
m.wg.Wait()
m.server = nil
return nil
}
@ -338,6 +345,7 @@ func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
// to avoid potential orphaned devices upon re-registration
devices := make(map[string]pluginapi.Device)
for _, device := range old.getDevices() {
device.Health = pluginapi.Unhealthy
devices[device.ID] = device
}
existingDevs = devices
@ -377,18 +385,28 @@ func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
go func() {
e.run()
e.stop()
m.mutex.Lock()
if old, ok := m.endpoints[r.ResourceName]; ok && old == e {
glog.V(2).Infof("Delete resource for endpoint %v", e)
delete(m.endpoints, r.ResourceName)
m.markResourceUnhealthy(r.ResourceName)
}
glog.V(2).Infof("Unregistered endpoint %v", e)
m.mutex.Unlock()
}()
}
func (m *ManagerImpl) markResourceUnhealthy(resourceName string) {
glog.V(2).Infof("Mark all resources Unhealthy for resource %s", resourceName)
healthyDevices := sets.NewString()
if _, ok := m.healthyDevices[resourceName]; ok {
healthyDevices = m.healthyDevices[resourceName]
m.healthyDevices[resourceName] = sets.NewString()
}
if _, ok := m.unhealthyDevices[resourceName]; !ok {
m.unhealthyDevices[resourceName] = sets.NewString()
}
m.unhealthyDevices[resourceName] = m.unhealthyDevices[resourceName].Union(healthyDevices)
}
// GetCapacity is expected to be called when Kubelet updates its node status.
// The first returned variable contains the registered device plugin resource capacity.
// The second returned variable contains the registered device plugin resource allocatable.
@ -405,12 +423,20 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
needsUpdateCheckpoint := false
var capacity = v1.ResourceList{}
var allocatable = v1.ResourceList{}
var deletedResources []string
deletedResources := sets.NewString()
m.mutex.Lock()
for resourceName, devices := range m.healthyDevices {
if _, ok := m.endpoints[resourceName]; !ok {
e, ok := m.endpoints[resourceName]
if (ok && e.stopGracePeriodExpired()) || !ok {
// The resources contained in endpoints and (un)healthyDevices
// should always be consistent. Otherwise, we run the risk of
// failing to garbage collect non-existing resources or devices.
if !ok {
glog.Errorf("unexpected: healthyDevices and endpoints are out of sync")
}
delete(m.endpoints, resourceName)
delete(m.healthyDevices, resourceName)
deletedResources = append(deletedResources, resourceName)
deletedResources.Insert(resourceName)
needsUpdateCheckpoint = true
} else {
capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
@ -418,17 +444,14 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
}
}
for resourceName, devices := range m.unhealthyDevices {
if _, ok := m.endpoints[resourceName]; !ok {
e, ok := m.endpoints[resourceName]
if (ok && e.stopGracePeriodExpired()) || !ok {
if !ok {
glog.Errorf("unexpected: unhealthyDevices and endpoints are out of sync")
}
delete(m.endpoints, resourceName)
delete(m.unhealthyDevices, resourceName)
alreadyDeleted := false
for _, name := range deletedResources {
if name == resourceName {
alreadyDeleted = true
}
}
if !alreadyDeleted {
deletedResources = append(deletedResources, resourceName)
}
deletedResources.Insert(resourceName)
needsUpdateCheckpoint = true
} else {
capacityCount := capacity[v1.ResourceName(resourceName)]
@ -441,36 +464,22 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
if needsUpdateCheckpoint {
m.writeCheckpoint()
}
return capacity, allocatable, deletedResources
}
// checkpointData struct is used to store pod to device allocation information
// and registered device information in a checkpoint file.
// TODO: add version control when we need to change checkpoint format.
type checkpointData struct {
PodDeviceEntries []podDevicesCheckpointEntry
RegisteredDevices map[string][]string
return capacity, allocatable, deletedResources.UnsortedList()
}
// Checkpoints device to container allocation information to disk.
func (m *ManagerImpl) writeCheckpoint() error {
m.mutex.Lock()
data := checkpointData{
PodDeviceEntries: m.podDevices.toCheckpointData(),
RegisteredDevices: make(map[string][]string),
}
registeredDevs := make(map[string][]string)
for resource, devices := range m.healthyDevices {
data.RegisteredDevices[resource] = devices.UnsortedList()
registeredDevs[resource] = devices.UnsortedList()
}
data := checkpoint.New(m.podDevices.toCheckpointData(),
registeredDevs)
m.mutex.Unlock()
dataJSON, err := json.Marshal(data)
err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
if err != nil {
return err
}
err = m.store.Write(kubeletDeviceManagerCheckpoint, dataJSON)
if err != nil {
return fmt.Errorf("failed to write deviceplugin checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
return fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
}
return nil
}
@ -478,29 +487,28 @@ func (m *ManagerImpl) writeCheckpoint() error {
// Reads device to container allocation information from disk, and populates
// m.allocatedDevices accordingly.
func (m *ManagerImpl) readCheckpoint() error {
content, err := m.store.Read(kubeletDeviceManagerCheckpoint)
registeredDevs := make(map[string][]string)
devEntries := make([]checkpoint.PodDevicesEntry, 0)
cp := checkpoint.New(devEntries, registeredDevs)
err := m.checkpointManager.GetCheckpoint(kubeletDeviceManagerCheckpoint, cp)
if err != nil {
if err == utilstore.ErrKeyNotFound {
if err == errors.ErrCheckpointNotFound {
glog.Warningf("Failed to retrieve checkpoint for %q: %v", kubeletDeviceManagerCheckpoint, err)
return nil
}
return fmt.Errorf("failed to read checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
return err
}
glog.V(4).Infof("Read checkpoint file %s\n", kubeletDeviceManagerCheckpoint)
var data checkpointData
if err := json.Unmarshal(content, &data); err != nil {
return fmt.Errorf("failed to unmarshal deviceplugin checkpoint data: %v", err)
}
m.mutex.Lock()
defer m.mutex.Unlock()
m.podDevices.fromCheckpointData(data.PodDeviceEntries)
podDevices, registeredDevs := cp.GetData()
m.podDevices.fromCheckpointData(podDevices)
m.allocatedDevices = m.podDevices.devices()
for resource, devices := range data.RegisteredDevices {
// TODO: Support Checkpointing for unhealthy devices as well
for resource := range registeredDevs {
// During start up, creates empty healthyDevices list so that the resource capacity
// will stay zero till the corresponding device plugin re-registers.
m.healthyDevices[resource] = sets.NewString()
for _, dev := range devices {
m.healthyDevices[resource].Insert(dev)
}
m.unhealthyDevices[resource] = sets.NewString()
m.endpoints[resource] = newStoppedEndpointImpl(resource, make(map[string]pluginapi.Device))
}
return nil
}
@ -688,6 +696,8 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
return m.podDevices.deviceRunContainerOptions(string(pod.UID), container.Name), nil
}
// callPreStartContainerIfNeeded issues PreStartContainer grpc call for device plugin resource
// with PreStartRequired option set.
func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource string) error {
m.mutex.Lock()
opts, ok := m.pluginOpts[resource]

View File

@ -21,7 +21,7 @@ import (
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
)
// ManagerStub provides a simple stub implementation for the Device Manager.

View File

@ -17,7 +17,6 @@ limitations under the License.
package devicemanager
import (
"flag"
"fmt"
"io/ioutil"
"os"
@ -34,10 +33,9 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
)
const (
@ -69,6 +67,29 @@ func TestNewManagerImplStart(t *testing.T) {
defer os.RemoveAll(socketDir)
m, p := setup(t, []*pluginapi.Device{}, func(n string, a, u, r []pluginapi.Device) {}, socketName, pluginSocketName)
cleanup(t, m, p)
// Stop should tolerate being called more than once.
cleanup(t, m, p)
}
func TestNewManagerImplStop(t *testing.T) {
socketDir, socketName, pluginSocketName, err := tmpSocketDir()
require.NoError(t, err)
defer os.RemoveAll(socketDir)
m, err := newManagerImpl(socketName)
require.NoError(t, err)
// No prior Start, but that should be okay.
err = m.Stop()
require.NoError(t, err)
devs := []*pluginapi.Device{
{ID: "Dev1", Health: pluginapi.Healthy},
{ID: "Dev2", Health: pluginapi.Healthy},
}
p := NewDevicePluginStub(devs, pluginSocketName)
// Same here.
err = p.Stop()
require.NoError(t, err)
}
// Tests that the device plugin manager correctly handles registration and re-registration by
@ -192,7 +213,8 @@ func TestUpdateCapacityAllocatable(t *testing.T) {
// Adds three devices for resource1, two healthy and one unhealthy.
// Expects capacity for resource1 to be 2.
resourceName1 := "domain1.com/resource1"
testManager.endpoints[resourceName1] = &endpointImpl{devices: make(map[string]pluginapi.Device)}
e1 := &endpointImpl{devices: make(map[string]pluginapi.Device)}
testManager.endpoints[resourceName1] = e1
callback(resourceName1, devs, []pluginapi.Device{}, []pluginapi.Device{})
capacity, allocatable, removedResources := testManager.GetCapacity()
resource1Capacity, ok := capacity[v1.ResourceName(resourceName1)]
@ -240,7 +262,8 @@ func TestUpdateCapacityAllocatable(t *testing.T) {
// Tests adding another resource.
resourceName2 := "resource2"
testManager.endpoints[resourceName2] = &endpointImpl{devices: make(map[string]pluginapi.Device)}
e2 := &endpointImpl{devices: make(map[string]pluginapi.Device)}
testManager.endpoints[resourceName2] = e2
callback(resourceName2, devs, []pluginapi.Device{}, []pluginapi.Device{})
capacity, allocatable, removedResources = testManager.GetCapacity()
as.Equal(2, len(capacity))
@ -252,9 +275,9 @@ func TestUpdateCapacityAllocatable(t *testing.T) {
as.Equal(int64(2), resource2Allocatable.Value())
as.Equal(0, len(removedResources))
// Removes resourceName1 endpoint. Verifies testManager.GetCapacity() reports that resourceName1
// Expires resourceName1 endpoint. Verifies testManager.GetCapacity() reports that resourceName1
// is removed from capacity and it no longer exists in healthyDevices after the call.
delete(testManager.endpoints, resourceName1)
e1.setStopTime(time.Now().Add(-1*endpointStopGracePeriod - time.Duration(10)*time.Second))
capacity, allocatable, removed := testManager.GetCapacity()
as.Equal([]string{resourceName1}, removed)
_, ok = capacity[v1.ResourceName(resourceName1)]
@ -266,9 +289,49 @@ func TestUpdateCapacityAllocatable(t *testing.T) {
as.False(ok)
_, ok = testManager.unhealthyDevices[resourceName1]
as.False(ok)
fmt.Println("removed: ", removed)
as.Equal(1, len(removed))
_, ok = testManager.endpoints[resourceName1]
as.False(ok)
as.Equal(1, len(testManager.endpoints))
// Stops resourceName2 endpoint. Verifies its stopTime is set, allocate and
// preStartContainer calls return errors.
e2.stop()
as.False(e2.stopTime.IsZero())
_, err = e2.allocate([]string{"Device1"})
as.True(reflect.DeepEqual(err, fmt.Errorf(errEndpointStopped, e2)))
_, err = e2.preStartContainer([]string{"Device1"})
as.True(reflect.DeepEqual(err, fmt.Errorf(errEndpointStopped, e2)))
// Marks resourceName2 unhealthy and verifies its capacity/allocatable are
// correctly updated.
testManager.markResourceUnhealthy(resourceName2)
capacity, allocatable, removed = testManager.GetCapacity()
val, ok = capacity[v1.ResourceName(resourceName2)]
as.True(ok)
as.Equal(int64(3), val.Value())
val, ok = allocatable[v1.ResourceName(resourceName2)]
as.True(ok)
as.Equal(int64(0), val.Value())
as.Empty(removed)
// Writes and re-reads checkpoints. Verifies we create a stopped endpoint
// for resourceName2, its capacity is set to zero, and we still consider
// it as a DevicePlugin resource. This makes sure any pod that was scheduled
// during the time of propagating capacity change to the scheduler will be
// properly rejected instead of being incorrectly started.
err = testManager.writeCheckpoint()
as.Nil(err)
testManager.healthyDevices = make(map[string]sets.String)
testManager.unhealthyDevices = make(map[string]sets.String)
err = testManager.readCheckpoint()
as.Nil(err)
as.Equal(1, len(testManager.endpoints))
_, ok = testManager.endpoints[resourceName2]
as.True(ok)
capacity, allocatable, removed = testManager.GetCapacity()
val, ok = capacity[v1.ResourceName(resourceName2)]
as.True(ok)
as.Equal(int64(0), val.Value())
as.Empty(removed)
as.True(testManager.isDevicePluginResource(resourceName2))
}
func constructDevices(devices []string) sets.String {
@ -305,18 +368,19 @@ func constructAllocResp(devices, mounts, envs map[string]string) *pluginapi.Cont
func TestCheckpoint(t *testing.T) {
resourceName1 := "domain1.com/resource1"
resourceName2 := "domain2.com/resource2"
as := assert.New(t)
tmpDir, err := ioutil.TempDir("", "checkpoint")
as.Nil(err)
defer os.RemoveAll(tmpDir)
ckm, err := checkpointmanager.NewCheckpointManager(tmpDir)
as.Nil(err)
testManager := &ManagerImpl{
socketdir: tmpDir,
healthyDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
endpoints: make(map[string]endpoint),
healthyDevices: make(map[string]sets.String),
unhealthyDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
checkpointManager: ckm,
}
testManager.store, _ = utilstore.NewFileStore("/tmp/", utilfs.DefaultFs{})
testManager.podDevices.insert("pod1", "con1", resourceName1,
constructDevices([]string{"dev1", "dev2"}),
@ -414,6 +478,10 @@ func (m *MockEndpoint) allocate(devs []string) (*pluginapi.AllocateResponse, err
return nil, nil
}
func (m *MockEndpoint) isStopped() bool { return false }
func (m *MockEndpoint) stopGracePeriodExpired() bool { return false }
func makePod(limits v1.ResourceList) *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
@ -431,20 +499,25 @@ func makePod(limits v1.ResourceList) *v1.Pod {
}
}
func getTestManager(tmpDir string, activePods ActivePodsFunc, testRes []TestResource, opts map[string]*pluginapi.DevicePluginOptions) *ManagerImpl {
func getTestManager(tmpDir string, activePods ActivePodsFunc, testRes []TestResource, opts map[string]*pluginapi.DevicePluginOptions) (*ManagerImpl, error) {
monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {}
testManager := &ManagerImpl{
socketdir: tmpDir,
callback: monitorCallback,
healthyDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
endpoints: make(map[string]endpoint),
pluginOpts: opts,
podDevices: make(podDevices),
activePods: activePods,
sourcesReady: &sourcesReadyStub{},
ckm, err := checkpointmanager.NewCheckpointManager(tmpDir)
if err != nil {
return nil, err
}
testManager := &ManagerImpl{
socketdir: tmpDir,
callback: monitorCallback,
healthyDevices: make(map[string]sets.String),
unhealthyDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
endpoints: make(map[string]endpoint),
pluginOpts: opts,
podDevices: make(podDevices),
activePods: activePods,
sourcesReady: &sourcesReadyStub{},
checkpointManager: ckm,
}
testManager.store, _ = utilstore.NewFileStore("/tmp/", utilfs.DefaultFs{})
for _, res := range testRes {
testManager.healthyDevices[res.resourceName] = sets.NewString()
for _, dev := range res.devs {
@ -476,7 +549,7 @@ func getTestManager(tmpDir string, activePods ActivePodsFunc, testRes []TestReso
}
}
}
return testManager
return testManager, nil
}
func getTestNodeInfo(allocatable v1.ResourceList) *schedulercache.NodeInfo {
@ -497,7 +570,6 @@ type TestResource struct {
}
func TestPodContainerDeviceAllocation(t *testing.T) {
flag.Set("alsologtostderr", fmt.Sprintf("%t", true))
res1 := TestResource{
resourceName: "domain1.com/resource1",
resourceQuantity: *resource.NewQuantity(int64(2), resource.DecimalSI),
@ -520,7 +592,8 @@ func TestPodContainerDeviceAllocation(t *testing.T) {
defer os.RemoveAll(tmpDir)
nodeInfo := getTestNodeInfo(v1.ResourceList{})
pluginOpts := make(map[string]*pluginapi.DevicePluginOptions)
testManager := getTestManager(tmpDir, podsStub.getActivePods, testResources, pluginOpts)
testManager, err := getTestManager(tmpDir, podsStub.getActivePods, testResources, pluginOpts)
as.Nil(err)
testPods := []*v1.Pod{
makePod(v1.ResourceList{
@ -615,7 +688,8 @@ func TestInitContainerDeviceAllocation(t *testing.T) {
as.Nil(err)
defer os.RemoveAll(tmpDir)
pluginOpts := make(map[string]*pluginapi.DevicePluginOptions)
testManager := getTestManager(tmpDir, podsStub.getActivePods, testResources, pluginOpts)
testManager, err := getTestManager(tmpDir, podsStub.getActivePods, testResources, pluginOpts)
as.Nil(err)
podWithPluginResourcesInInitContainers := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
@ -693,14 +767,18 @@ func TestSanitizeNodeAllocatable(t *testing.T) {
as := assert.New(t)
monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {}
tmpDir, err := ioutil.TempDir("", "checkpoint")
as.Nil(err)
ckm, err := checkpointmanager.NewCheckpointManager(tmpDir)
as.Nil(err)
testManager := &ManagerImpl{
callback: monitorCallback,
healthyDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
callback: monitorCallback,
allocatedDevices: make(map[string]sets.String),
healthyDevices: make(map[string]sets.String),
podDevices: make(podDevices),
checkpointManager: ckm,
}
testManager.store, _ = utilstore.NewFileStore("/tmp/", utilfs.DefaultFs{})
// require one of resource1 and one of resource2
testManager.allocatedDevices[resourceName1] = sets.NewString()
testManager.allocatedDevices[resourceName1].Insert(devID1)
@ -747,7 +825,8 @@ func TestDevicePreStartContainer(t *testing.T) {
pluginOpts := make(map[string]*pluginapi.DevicePluginOptions)
pluginOpts[res1.resourceName] = &pluginapi.DevicePluginOptions{PreStartRequired: true}
testManager := getTestManager(tmpDir, podsStub.getActivePods, []TestResource{res1}, pluginOpts)
testManager, err := getTestManager(tmpDir, podsStub.getActivePods, []TestResource{res1}, pluginOpts)
as.Nil(err)
ch := make(chan []string, 1)
testManager.endpoints[res1.resourceName] = &MockEndpoint{

View File

@ -21,6 +21,7 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
@ -126,18 +127,9 @@ func (pdev podDevices) devices() map[string]sets.String {
return ret
}
// podDevicesCheckpointEntry is used to record <pod, container> to device allocation information.
type podDevicesCheckpointEntry struct {
PodUID string
ContainerName string
ResourceName string
DeviceIDs []string
AllocResp []byte
}
// Turns podDevices into checkpointData.
func (pdev podDevices) toCheckpointData() []podDevicesCheckpointEntry {
var data []podDevicesCheckpointEntry
func (pdev podDevices) toCheckpointData() []checkpoint.PodDevicesEntry {
var data []checkpoint.PodDevicesEntry
for podUID, containerDevices := range pdev {
for conName, resources := range containerDevices {
for resource, devices := range resources {
@ -152,7 +144,12 @@ func (pdev podDevices) toCheckpointData() []podDevicesCheckpointEntry {
glog.Errorf("Can't marshal allocResp for %v %v %v: %v", podUID, conName, resource, err)
continue
}
data = append(data, podDevicesCheckpointEntry{podUID, conName, resource, devIds, allocResp})
data = append(data, checkpoint.PodDevicesEntry{
PodUID: podUID,
ContainerName: conName,
ResourceName: resource,
DeviceIDs: devIds,
AllocResp: allocResp})
}
}
}
@ -160,7 +157,7 @@ func (pdev podDevices) toCheckpointData() []podDevicesCheckpointEntry {
}
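// For orientation, a minimal round-trip sketch using the checkpoint
// conversion methods in this file (illustrative only; pdev is any populated
// podDevices value):
//
//	entries := pdev.toCheckpointData()   // flatten to []checkpoint.PodDevicesEntry
//	restored := make(podDevices)
//	restored.fromCheckpointData(entries) // rebuild the pod -> container -> resource map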
// Populates podDevices from the passed-in checkpointData.
func (pdev podDevices) fromCheckpointData(data []podDevicesCheckpointEntry) {
func (pdev podDevices) fromCheckpointData(data []checkpoint.PodDevicesEntry) {
for _, entry := range data {
glog.V(2).Infof("Get checkpoint entry: %v %v %v %v %v\n",
entry.PodUID, entry.ContainerName, entry.ResourceName, entry.DeviceIDs, entry.AllocResp)

View File

@ -17,12 +17,14 @@ limitations under the License.
package devicemanager
import (
"time"
"k8s.io/api/core/v1"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
)
// Manager manages all the Device Plugins running on a node.
@ -86,6 +88,8 @@ const (
errInvalidResourceName = "the ResourceName %q is invalid"
// errEmptyResourceName is the error raised when the resource name field is empty
errEmptyResourceName = "invalid Empty ResourceName"
// errEndpointStopped indicates that the endpoint has been stopped
errEndpointStopped = "endpoint %v has been stopped"
// errBadSocket is the error raised when the registry socket path is not absolute
errBadSocket = "bad socketPath, must be an absolute path:"
@ -96,3 +100,12 @@ const (
// errListAndWatch is the error raised when ListAndWatch ended unsuccessfully
errListAndWatch = "listAndWatch ended unexpectedly for device plugin %s with error %v"
)
// endpointStopGracePeriod indicates the grace period after an endpoint is stopped
// because its device plugin fails. DeviceManager keeps the stopped endpoint in its
// cache during this grace period to cover the time gap for the capacity change to
// take effect.
const endpointStopGracePeriod = time.Duration(5) * time.Minute
// kubeletDeviceManagerCheckpoint is the file name of device plugin checkpoint
const kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint"
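// A minimal sketch of how that grace period could be consulted (hypothetical
// helper, not part of this commit; stopTime would be recorded when the device
// plugin connection drops):
func stopGracePeriodExpired(stopTime time.Time) bool {
	// Expired only if a stop time was recorded and it is older than the grace period.
	return !stopTime.IsZero() && time.Since(stopTime) > endpointStopGracePeriod
}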

View File

@ -103,7 +103,7 @@ func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool) *ResourceConfig {
// sum requests and limits.
reqs, limits := resource.PodRequestsAndLimits(pod)
@ -146,6 +146,11 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
}
}
// quota is not capped when cfs quota is disabled
if !enforceCPULimits {
cpuQuota = int64(-1)
}
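// For illustration (values follow from MilliCPUToQuota): with a 200m CPU
// limit, enforceCPULimits == true yields cpuQuota == 20000 (µs per 100ms
// period), while enforceCPULimits == false yields cpuQuota == -1 (no CFS
// cap); CPU shares are unaffected either way.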
// determine the qos class
qosClass := v1qos.GetPodQOS(pod)

View File

@ -24,6 +24,7 @@ import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"strconv"
)
// getResourceList returns a ResourceList with the
@ -57,10 +58,12 @@ func TestResourceConfigForPod(t *testing.T) {
guaranteedShares := MilliCPUToShares(100)
guaranteedQuota, guaranteedPeriod := MilliCPUToQuota(100)
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
testCases := map[string]struct {
pod *v1.Pod
expected *ResourceConfig
pod *v1.Pod
expected *ResourceConfig
enforceCPULimits bool
}{
"besteffort": {
pod: &v1.Pod{
@ -72,7 +75,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
},
expected: &ResourceConfig{CpuShares: &minShares},
enforceCPULimits: true,
expected: &ResourceConfig{CpuShares: &minShares},
},
"burstable-no-limits": {
pod: &v1.Pod{
@ -84,7 +88,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
},
expected: &ResourceConfig{CpuShares: &burstableShares},
enforceCPULimits: true,
expected: &ResourceConfig{CpuShares: &burstableShares},
},
"burstable-with-limits": {
pod: &v1.Pod{
@ -96,7 +101,21 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
},
expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &burstablePeriod, Memory: &burstableMemory},
enforceCPULimits: true,
expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &burstablePeriod, Memory: &burstableMemory},
},
"burstable-with-limits-no-cpu-enforcement": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
},
},
},
enforceCPULimits: false,
expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &cpuNoLimit, CpuPeriod: &burstablePeriod, Memory: &burstableMemory},
},
"burstable-partial-limits": {
pod: &v1.Pod{
@ -111,7 +130,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
},
expected: &ResourceConfig{CpuShares: &burstablePartialShares},
enforceCPULimits: true,
expected: &ResourceConfig{CpuShares: &burstablePartialShares},
},
"guaranteed": {
pod: &v1.Pod{
@ -123,11 +143,25 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
},
expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedPeriod, Memory: &guaranteedMemory},
enforceCPULimits: true,
expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
},
},
},
},
enforceCPULimits: false,
expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedPeriod, Memory: &guaranteedMemory},
},
}
for testName, testCase := range testCases {
actual := ResourceConfigForPod(testCase.pod)
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits)
if !reflect.DeepEqual(actual.CpuPeriod, testCase.expected.CpuPeriod) {
t.Errorf("unexpected result, test: %v, cpu period not as expected", testName)
}
@ -197,3 +231,82 @@ func TestMilliCPUToQuota(t *testing.T) {
}
}
}
func TestHugePageLimits(t *testing.T) {
Mi := int64(1024 * 1024)
type inputStruct struct {
key string
input string
}
testCases := []struct {
name string
inputs []inputStruct
expected map[int64]int64
}{
{
name: "no valid hugepages",
inputs: []inputStruct{
{
key: "2Mi",
input: "128",
},
},
expected: map[int64]int64{},
},
{
name: "2Mi only",
inputs: []inputStruct{
{
key: v1.ResourceHugePagesPrefix + "2Mi",
input: "128",
},
},
expected: map[int64]int64{2 * Mi: 128},
},
{
name: "2Mi and 4Mi",
inputs: []inputStruct{
{
key: v1.ResourceHugePagesPrefix + "2Mi",
input: "128",
},
{
key: v1.ResourceHugePagesPrefix + strconv.FormatInt(2*Mi, 10),
input: "256",
},
{
key: v1.ResourceHugePagesPrefix + "4Mi",
input: "512",
},
{
key: "4Mi",
input: "1024",
},
},
expected: map[int64]int64{2 * Mi: 384, 4 * Mi: 512},
},
}
for _, testcase := range testCases {
t.Run(testcase.name, func(t *testing.T) {
resourceList := v1.ResourceList{}
for _, input := range testcase.inputs {
value, err := resource.ParseQuantity(input.input)
if err != nil {
t.Fatalf("error in parsing hugepages, value: %s", input.input)
} else {
resourceList[v1.ResourceName(input.key)] = value
}
}
resultValue := HugePageLimits(resourceList)
if !reflect.DeepEqual(testcase.expected, resultValue) {
t.Errorf("unexpected result, expected: %v, actual: %v", testcase.expected, resultValue)
}
})
}
}

View File

@ -43,7 +43,7 @@ func MilliCPUToShares(milliCPU int64) int64 {
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool) *ResourceConfig {
return nil
}

View File

@ -39,9 +39,10 @@ const (
defaultNodeAllocatableCgroupName = "kubepods"
)
// createNodeAllocatableCgroups creates Node Allocatable Cgroup when CgroupsPerQOS flag is specified as true
func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
cgroupConfig := &CgroupConfig{
Name: CgroupName(cm.cgroupRoot),
Name: cm.cgroupRoot,
// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
ResourceParameters: getCgroupConfig(cm.capacity),
}
@ -70,7 +71,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
glog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc)
cgroupConfig := &CgroupConfig{
Name: CgroupName(cm.cgroupRoot),
Name: cm.cgroupRoot,
ResourceParameters: getCgroupConfig(nodeAllocatable),
}
@ -83,11 +84,12 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
}
// If Node Allocatable is enforced on a node that has not been drained or is updated on an existing node to a lower value,
// existing memory usage across pods might be higher that current Node Allocatable Memory Limits.
// existing memory usage across pods might be higher than current Node Allocatable Memory Limits.
// Pod Evictions are expected to bring down memory usage to below Node Allocatable limits.
// Until evictions happen retry cgroup updates.
// Update limits on the non-root cgroup-root to be safe since the default limits for CPU can be too low.
if cm.cgroupRoot != "/" {
// Check if cgroupRoot is set to a non-empty value (empty would be the root container)
if len(cm.cgroupRoot) > 0 {
go func() {
for {
err := cm.cgroupManager.Update(cgroupConfig)
@ -104,7 +106,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
// Now apply kube reserved and system reserved limits if required.
if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
glog.V(2).Infof("Enforcing System reserved on cgroup %q with limits: %+v", nc.SystemReservedCgroupName, nc.SystemReserved)
if err := enforceExistingCgroup(cm.cgroupManager, nc.SystemReservedCgroupName, nc.SystemReserved); err != nil {
if err := enforceExistingCgroup(cm.cgroupManager, ParseCgroupfsToCgroupName(nc.SystemReservedCgroupName), nc.SystemReserved); err != nil {
message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return fmt.Errorf(message)
@ -113,7 +115,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
}
if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
glog.V(2).Infof("Enforcing kube reserved on cgroup %q with limits: %+v", nc.KubeReservedCgroupName, nc.KubeReserved)
if err := enforceExistingCgroup(cm.cgroupManager, nc.KubeReservedCgroupName, nc.KubeReserved); err != nil {
if err := enforceExistingCgroup(cm.cgroupManager, ParseCgroupfsToCgroupName(nc.KubeReservedCgroupName), nc.KubeReserved); err != nil {
message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return fmt.Errorf(message)
@ -124,9 +126,9 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
}
// enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
func enforceExistingCgroup(cgroupManager CgroupManager, cName string, rl v1.ResourceList) error {
func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error {
cgroupConfig := &CgroupConfig{
Name: CgroupName(cName),
Name: cName,
ResourceParameters: getCgroupConfig(rl),
}
glog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares and %d bytes of memory", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory)

View File

@ -49,6 +49,8 @@ type podContainerManagerImpl struct {
cgroupManager CgroupManager
// Maximum number of pids in a pod
podPidsLimit int64
// enforceCPULimits controls whether cfs quota is enforced or not
enforceCPULimits bool
}
// Make sure that podContainerManagerImpl implements the PodContainerManager interface
@ -79,7 +81,7 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
// Create the pod container
containerConfig := &CgroupConfig{
Name: podContainerName,
ResourceParameters: ResourceConfigForPod(pod),
ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits),
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && m.podPidsLimit > 0 {
containerConfig.ResourceParameters.PodPidsLimit = &m.podPidsLimit
@ -102,7 +104,7 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
podQOS := v1qos.GetPodQOS(pod)
// Get the parent QOS container name
var parentContainer string
var parentContainer CgroupName
switch podQOS {
case v1.PodQOSGuaranteed:
parentContainer = m.qosContainersInfo.Guaranteed
@ -114,13 +116,33 @@ func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName,
podContainer := GetPodCgroupNameSuffix(pod.UID)
// Get the absolute path of the cgroup
cgroupName := (CgroupName)(path.Join(parentContainer, podContainer))
cgroupName := NewCgroupName(parentContainer, podContainer)
// Get the literal cgroupfs name
cgroupfsName := m.cgroupManager.Name(cgroupName)
return cgroupName, cgroupfsName
}
// Kill one process ID
func (m *podContainerManagerImpl) killOnePid(pid int) error {
// os.FindProcess never returns an error on POSIX
// https://go-review.googlesource.com/c/go/+/19093
p, _ := os.FindProcess(pid)
if err := p.Kill(); err != nil {
// If the process already exited, that's fine.
if strings.Contains(err.Error(), "process already finished") {
// Hate parsing strings, but
// vendor/github.com/opencontainers/runc/libcontainer/
// also does this.
glog.V(3).Infof("process with pid %v no longer exists", pid)
return nil
} else {
return err
}
}
return nil
}
// Scan through the whole cgroup directory and kill all processes either
// attached to the pod cgroup or to a container cgroup under the pod cgroup
func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error {
@ -139,13 +161,8 @@ func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName
}
errlist = []error{}
for _, pid := range pidsToKill {
p, err := os.FindProcess(pid)
if err != nil {
// Process not running anymore, do nothing
continue
}
glog.V(3).Infof("Attempt to kill process with pid: %v", pid)
if err := p.Kill(); err != nil {
if err := m.killOnePid(pid); err != nil {
glog.V(3).Infof("failed to kill process with pid: %v", pid)
errlist = append(errlist, err)
}
@ -182,12 +199,37 @@ func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error {
return m.cgroupManager.ReduceCPULimits(podCgroup)
}
// IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod
func (m *podContainerManagerImpl) IsPodCgroup(cgroupfs string) (bool, types.UID) {
// convert the literal cgroupfs form to the driver specific value
cgroupName := m.cgroupManager.CgroupName(cgroupfs)
qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
basePath := ""
for _, qosContainerName := range qosContainersList {
// a pod cgroup is a direct child of a qos node, so check if it's a match
if len(cgroupName) == len(qosContainerName)+1 {
basePath = cgroupName[len(qosContainerName)]
}
}
if basePath == "" {
return false, types.UID("")
}
if !strings.HasPrefix(basePath, podCgroupNamePrefix) {
return false, types.UID("")
}
parts := strings.Split(basePath, podCgroupNamePrefix)
if len(parts) != 2 {
return false, types.UID("")
}
return true, types.UID(parts[1])
}
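// Usage sketch (mirrors the unit test added in this commit; assumes the
// cgroupfs driver and qosContainersInfo.Burstable == CgroupName{"kubepods", "burstable"}):
//
//	ok, uid := pcm.IsPodCgroup("/kubepods/burstable/pod8dbc5577")
//	// ok == true, uid == types.UID("8dbc5577")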
// GetAllPodsFromCgroups scans through all the subsystems of pod cgroups
// and returns the list of pods whose cgroups still exist on the cgroup mounts
func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
// Map for storing all the found pods on the disk
foundPods := make(map[types.UID]CgroupName)
qosContainersList := [3]string{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
// Scan through all the subsystem mounts
// and through each QoS cgroup directory for each subsystem mount
// If a pod cgroup exists in even a single subsystem mount
@ -195,7 +237,7 @@ func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupN
for _, val := range m.subsystems.MountPoints {
for _, qosContainerName := range qosContainersList {
// get the subsystems QoS cgroup absolute name
qcConversion := m.cgroupManager.Name(CgroupName(qosContainerName))
qcConversion := m.cgroupManager.Name(qosContainerName)
qc := path.Join(val, qcConversion)
dirInfo, err := ioutil.ReadDir(qc)
if err != nil {
@ -217,7 +259,7 @@ func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupN
internalPath := m.cgroupManager.CgroupName(cgroupfsPath)
// we only care about the base segment of the converted path since that
// is all we need to read to know whether it is a pod or not.
basePath := path.Base(string(internalPath))
basePath := internalPath[len(internalPath)-1]
if !strings.Contains(basePath, podCgroupNamePrefix) {
continue
}
@ -257,7 +299,7 @@ func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error {
}
func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return m.cgroupRoot, string(m.cgroupRoot)
return m.cgroupRoot, m.cgroupRoot.ToCgroupfs()
}
func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string {
@ -276,3 +318,7 @@ func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error {
func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}
func (m *podContainerManagerNoop) IsPodCgroup(cgroupfs string) (bool, types.UID) {
return false, types.UID("")
}

View File

@ -0,0 +1,125 @@
// +build linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"strings"
"testing"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
func TestIsCgroupPod(t *testing.T) {
qosContainersInfo := QOSContainersInfo{
Guaranteed: RootCgroupName,
Burstable: NewCgroupName(RootCgroupName, strings.ToLower(string(v1.PodQOSBurstable))),
BestEffort: NewCgroupName(RootCgroupName, strings.ToLower(string(v1.PodQOSBestEffort))),
}
podUID := types.UID("123")
testCases := []struct {
input CgroupName
expectedResult bool
expectedUID types.UID
}{
{
input: RootCgroupName,
expectedResult: false,
expectedUID: types.UID(""),
},
{
input: NewCgroupName(qosContainersInfo.Guaranteed),
expectedResult: false,
expectedUID: types.UID(""),
},
{
input: NewCgroupName(qosContainersInfo.Guaranteed, GetPodCgroupNameSuffix(podUID)),
expectedResult: true,
expectedUID: podUID,
},
{
input: NewCgroupName(qosContainersInfo.Guaranteed, GetPodCgroupNameSuffix(podUID), "container.scope"),
expectedResult: false,
expectedUID: types.UID(""),
},
{
input: NewCgroupName(qosContainersInfo.Burstable),
expectedResult: false,
expectedUID: types.UID(""),
},
{
input: NewCgroupName(qosContainersInfo.Burstable, GetPodCgroupNameSuffix(podUID)),
expectedResult: true,
expectedUID: podUID,
},
{
input: NewCgroupName(qosContainersInfo.Burstable, GetPodCgroupNameSuffix(podUID), "container.scope"),
expectedResult: false,
expectedUID: types.UID(""),
},
{
input: NewCgroupName(qosContainersInfo.BestEffort),
expectedResult: false,
expectedUID: types.UID(""),
},
{
input: NewCgroupName(qosContainersInfo.BestEffort, GetPodCgroupNameSuffix(podUID)),
expectedResult: true,
expectedUID: podUID,
},
{
input: NewCgroupName(qosContainersInfo.BestEffort, GetPodCgroupNameSuffix(podUID), "container.scope"),
expectedResult: false,
expectedUID: types.UID(""),
},
{
input: NewCgroupName(RootCgroupName, "system"),
expectedResult: false,
expectedUID: types.UID(""),
},
{
input: NewCgroupName(RootCgroupName, "system", "kubelet"),
expectedResult: false,
expectedUID: types.UID(""),
},
}
for _, cgroupDriver := range []string{"cgroupfs", "systemd"} {
pcm := &podContainerManagerImpl{
cgroupManager: NewCgroupManager(nil, cgroupDriver),
enforceCPULimits: true,
qosContainersInfo: qosContainersInfo,
}
for _, testCase := range testCases {
// give the right cgroup structure based on driver
cgroupfs := testCase.input.ToCgroupfs()
if cgroupDriver == "systemd" {
cgroupfs = testCase.input.ToSystemd()
}
// check if this is a pod or not with the literal cgroupfs input
result, resultUID := pcm.IsPodCgroup(cgroupfs)
if result != testCase.expectedResult {
t.Errorf("Unexpected result for driver: %v, input: %v, expected: %v, actual: %v", cgroupDriver, testCase.input, testCase.expectedResult, result)
}
if resultUID != testCase.expectedUID {
t.Errorf("Unexpected result for driver: %v, input: %v, expected: %v, actual: %v", cgroupDriver, testCase.input, testCase.expectedUID, resultUID)
}
}
}
}
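// For reference, the two drivers render the same abstract name differently,
// e.g. (assuming the ToCgroupfs/ToSystemd conversions exercised above):
//
//	name := NewCgroupName(RootCgroupName, "burstable", "pod123")
//	fs := name.ToCgroupfs()  // "/burstable/pod123"
//	sd := name.ToSystemd()   // "/burstable.slice/burstable-pod123.slice"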

View File

@ -35,7 +35,7 @@ func (m *podContainerManagerStub) EnsureExists(_ *v1.Pod) error {
}
func (m *podContainerManagerStub) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return "", ""
return nil, ""
}
func (m *podContainerManagerStub) Destroy(_ CgroupName) error {
@ -49,3 +49,7 @@ func (m *podContainerManagerStub) ReduceCPULimits(_ CgroupName) error {
func (m *podContainerManagerStub) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}
func (m *podContainerManagerStub) IsPodCgroup(cgroupfs string) (bool, types.UID) {
return false, types.UID("")
}

View File

@ -18,7 +18,6 @@ package cm
import (
"fmt"
"path"
"strings"
"sync"
"time"
@ -60,18 +59,18 @@ type qosContainerManagerImpl struct {
qosReserved map[v1.ResourceName]int64
}
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
if !nodeConfig.CgroupsPerQOS {
return &qosContainerManagerNoop{
cgroupRoot: CgroupName(cgroupRoot),
cgroupRoot: cgroupRoot,
}, nil
}
return &qosContainerManagerImpl{
subsystems: subsystems,
cgroupManager: cgroupManager,
cgroupRoot: CgroupName(cgroupRoot),
qosReserved: nodeConfig.ExperimentalQOSReserved,
cgroupRoot: cgroupRoot,
qosReserved: nodeConfig.QOSReserved,
}, nil
}
@ -81,23 +80,20 @@ func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
cm := m.cgroupManager
rootContainer := string(m.cgroupRoot)
if !cm.Exists(CgroupName(rootContainer)) {
return fmt.Errorf("root container %s doesn't exist", rootContainer)
rootContainer := m.cgroupRoot
if !cm.Exists(rootContainer) {
return fmt.Errorf("root container %v doesn't exist", rootContainer)
}
	// Top-level QoS containers are created only for the Burstable
	// and Best Effort classes
qosClasses := map[v1.PodQOSClass]string{
v1.PodQOSBurstable: path.Join(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
v1.PodQOSBestEffort: path.Join(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
qosClasses := map[v1.PodQOSClass]CgroupName{
v1.PodQOSBurstable: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
}
// Create containers for both qos classes
for qosClass, containerName := range qosClasses {
// get the container's absolute name
absoluteContainerName := CgroupName(containerName)
resourceParameters := &ResourceConfig{}
// the BestEffort QoS class has a statically configured minShares value
if qosClass == v1.PodQOSBestEffort {
@ -107,7 +103,7 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
// containerConfig object stores the cgroup specifications
containerConfig := &CgroupConfig{
Name: absoluteContainerName,
Name: containerName,
ResourceParameters: resourceParameters,
}
@ -117,7 +113,7 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
}
// check if it exists
if !cm.Exists(absoluteContainerName) {
if !cm.Exists(containerName) {
if err := cm.Create(containerConfig); err != nil {
return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
}
@ -279,11 +275,11 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
v1.PodQOSBurstable: {
Name: CgroupName(m.qosContainersInfo.Burstable),
Name: m.qosContainersInfo.Burstable,
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBestEffort: {
Name: CgroupName(m.qosContainersInfo.BestEffort),
Name: m.qosContainersInfo.BestEffort,
ResourceParameters: &ResourceConfig{},
},
}
@ -300,31 +296,34 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
}
}
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.setMemoryReserve(qosConfigs, percentReserve)
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.setMemoryReserve(qosConfigs, percentReserve)
}
}
}
updateSuccess := true
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
updateSuccess = false
}
}
if updateSuccess {
glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
return nil
}
// If the resource can adjust the ResourceConfig to increase likelihood of
// success, call the adjustment function here. Otherwise, the Update() will
// be called again with the same values.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.retrySetMemoryReserve(qosConfigs, percentReserve)
updateSuccess := true
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
updateSuccess = false
}
}
if updateSuccess {
glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
return nil
}
// If the resource can adjust the ResourceConfig to increase likelihood of
// success, call the adjustment function here. Otherwise, the Update() will
// be called again with the same values.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.retrySetMemoryReserve(qosConfigs, percentReserve)
}
}
}
@ -336,7 +335,7 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
}
}
glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration on retry")
glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
return nil
}

View File

@ -38,7 +38,9 @@ type ResourceConfig struct {
}
// CgroupName is the abstract name of a cgroup prior to any driver specific conversion.
type CgroupName string
// It is specified as a list of strings from its individual components, such as:
// {"kubepods", "burstable", "pod1234-abcd-5678-efgh"}
type CgroupName []string
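// For example (illustrative), a pod cgroup under the Burstable QoS node is
// built from components rather than by joining path strings:
//
//	podCgroup := NewCgroupName(RootCgroupName, "kubepods", "burstable", "pod1234-abcd-5678-efgh")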
// CgroupConfig holds the cgroup configuration information.
// This is common object which is used to specify
@ -78,7 +80,7 @@ type CgroupManager interface {
Exists(name CgroupName) bool
// Name returns the literal cgroupfs name on the host after any driver specific conversions.
// We would expect systemd implementation to make appropriate name conversion.
// For example, if we pass /foo/bar
// For example, if we pass {"foo", "bar"}
// then systemd should convert the name to something like
// foo.slice/foo-bar.slice
Name(name CgroupName) string
@ -94,9 +96,9 @@ type CgroupManager interface {
// QOSContainersInfo stores the names of containers per qos
type QOSContainersInfo struct {
Guaranteed string
BestEffort string
Burstable string
Guaranteed CgroupName
BestEffort CgroupName
Burstable CgroupName
}
// PodContainerManager stores and manages pod level containers
@ -122,4 +124,7 @@ type PodContainerManager interface {
// GetAllPodsFromCgroups enumerates the set of pod uids to their associated cgroup based on state of cgroupfs system.
GetAllPodsFromCgroups() (map[types.UID]CgroupName, error)
// IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod
IsPodCgroup(cgroupfs string) (bool, types.UID)
}