vendor files

Serguei Bezverkhi
2018-01-09 13:57:14 -05:00
parent 558bc6c02a
commit 7b24313bd6
16547 changed files with 4527373 additions and 0 deletions

vendor/k8s.io/kubernetes/pkg/kubelet/cm/BUILD generated vendored Normal file

@@ -0,0 +1,126 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = [
"cgroup_manager_unsupported.go",
"container_manager.go",
"container_manager_stub.go",
"container_manager_unsupported.go",
"fake_internal_container_lifecycle.go",
"helpers_unsupported.go",
"internal_container_lifecycle.go",
"pod_container_manager_stub.go",
"pod_container_manager_unsupported.go",
"types.go",
] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"cgroup_manager_linux.go",
"container_manager_linux.go",
"helpers_linux.go",
"node_container_manager.go",
"pod_container_manager_linux.go",
"qos_container_manager_linux.go",
],
"@io_bazel_rules_go//go/platform:windows_amd64": [
"container_manager_windows.go",
],
"//conditions:default": [],
}),
importpath = "k8s.io/kubernetes/pkg/kubelet/cm",
visibility = ["//visibility:public"],
deps = [
"//pkg/features:go_default_library",
"//pkg/kubelet/apis/cri:go_default_library",
"//pkg/kubelet/cadvisor:go_default_library",
"//pkg/kubelet/cm/cpumanager:go_default_library",
"//pkg/kubelet/config:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/eviction/api:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/status:go_default_library",
"//pkg/util/mount:go_default_library",
"//plugin/pkg/scheduler/schedulercache:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//vendor/k8s.io/client-go/tools/record:go_default_library",
] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"//pkg/api/v1/resource:go_default_library",
"//pkg/apis/core/v1/helper:go_default_library",
"//pkg/apis/core/v1/helper/qos:go_default_library",
"//pkg/kubelet/cm/deviceplugin:go_default_library",
"//pkg/kubelet/cm/util:go_default_library",
"//pkg/kubelet/events:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//pkg/kubelet/qos:go_default_library",
"//pkg/util/file:go_default_library",
"//pkg/util/oom:go_default_library",
"//pkg/util/procfs:go_default_library",
"//pkg/util/sysctl:go_default_library",
"//pkg/util/version:go_default_library",
"//vendor/github.com/docker/go-units:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
],
"//conditions:default": [],
}),
)
go_test(
name = "go_default_test",
srcs = [
"container_manager_unsupported_test.go",
] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"cgroup_manager_linux_test.go",
"cgroup_manager_test.go",
"container_manager_linux_test.go",
"helpers_linux_test.go",
"node_container_manager_test.go",
],
"//conditions:default": [],
}),
importpath = "k8s.io/kubernetes/pkg/kubelet/cm",
library = ":go_default_library",
deps = [
"//pkg/util/mount:go_default_library",
] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"//pkg/kubelet/eviction/api:go_default_library",
"//vendor/github.com/stretchr/testify/assert:go_default_library",
"//vendor/github.com/stretchr/testify/require:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
],
"//conditions:default": [],
}),
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/kubelet/cm/cpumanager:all-srcs",
"//pkg/kubelet/cm/cpuset:all-srcs",
"//pkg/kubelet/cm/deviceplugin:all-srcs",
"//pkg/kubelet/cm/util:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)


@@ -0,0 +1,582 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"os"
"path"
"path/filepath"
"strings"
"time"
units "github.com/docker/go-units"
"github.com/golang/glog"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
// libcontainerCgroupManagerType defines how to interface with libcontainer
type libcontainerCgroupManagerType string
const (
// libcontainerCgroupfs means use libcontainer with cgroupfs
libcontainerCgroupfs libcontainerCgroupManagerType = "cgroupfs"
// libcontainerSystemd means use libcontainer with systemd
libcontainerSystemd libcontainerCgroupManagerType = "systemd"
// systemdSuffix is the cgroup name suffix for systemd
systemdSuffix string = ".slice"
)
// hugePageSizeList is useful for converting to the hugetlb canonical unit
// which is what is expected when interacting with libcontainer
var hugePageSizeList = []string{"B", "kB", "MB", "GB", "TB", "PB"}
// ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name.
// For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice.
// If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form.
// For example, it will return /Burstable.slice/Burstable-pod_123_456.slice in the above scenario.
func ConvertCgroupNameToSystemd(cgroupName CgroupName, outputToCgroupFs bool) string {
name := string(cgroupName)
result := ""
if name != "" && name != "/" {
parts := strings.Split(name, "/")
results := []string{}
for _, part := range parts {
// ignore leading stuff
if part == "" {
continue
}
// detect if we are given a systemd style name.
// if so, we do not want to do double encoding.
if IsSystemdStyleName(part) {
part = strings.TrimSuffix(part, systemdSuffix)
separatorIndex := strings.LastIndex(part, "-")
if separatorIndex >= 0 && separatorIndex < len(part) {
part = part[separatorIndex+1:]
}
} else {
// systemd treats - as a step in the hierarchy, we convert all - to _
part = strings.Replace(part, "-", "_", -1)
}
results = append(results, part)
}
// each part is appended with systemd style -
result = strings.Join(results, "-")
} else {
// root converts to -
result = "-"
}
// always have a .slice suffix
if !IsSystemdStyleName(result) {
result = result + systemdSuffix
}
// if the caller desired the result in cgroupfs format...
if outputToCgroupFs {
var err error
result, err = cgroupsystemd.ExpandSlice(result)
if err != nil {
panic(fmt.Errorf("error adapting cgroup name, input: %v, err: %v", name, err))
}
}
return result
}
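
For illustration, a minimal standalone sketch of the naming rules described above (drop empty segments, turn dashes into underscores, join segments with dashes, append the .slice suffix); it is not the vendored function and skips the systemd-name detection and the cgroupfs expansion:

package main

import (
	"fmt"
	"strings"
)

// toSystemdName applies the basic conversion rules from the comments above.
func toSystemdName(cgroupfsName string) string {
	parts := []string{}
	for _, part := range strings.Split(cgroupfsName, "/") {
		if part == "" {
			continue
		}
		// systemd treats "-" as a hierarchy separator, so dashes become underscores.
		parts = append(parts, strings.Replace(part, "-", "_", -1))
	}
	if len(parts) == 0 {
		// the root cgroup converts to "-"
		return "-.slice"
	}
	return strings.Join(parts, "-") + ".slice"
}

func main() {
	fmt.Println(toSystemdName("/"))                      // -.slice
	fmt.Println(toSystemdName("/Burstable/pod_123-456")) // Burstable-pod_123_456.slice
}
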
// ConvertCgroupFsNameToSystemd converts an expanded cgroupfs name to its systemd name.
// For example, it will convert test.slice/test-a.slice/test-a-b.slice to become test-a-b.slice
// NOTE: this is public right now to allow its usage in dockermanager and dockershim, ideally both those
// code areas could use something from libcontainer if we get this style function upstream.
func ConvertCgroupFsNameToSystemd(cgroupfsName string) (string, error) {
// TODO: see if libcontainer systemd implementation could use something similar, and if so, move
// this function up to that library. At that time, it would most likely do validation specific to systemd
// above and beyond the simple assumption here that the base of the path encodes the hierarchy
// per systemd convention.
return path.Base(cgroupfsName), nil
}
func IsSystemdStyleName(name string) bool {
if strings.HasSuffix(name, systemdSuffix) {
return true
}
return false
}
// libcontainerAdapter provides a simplified interface to libcontainer based on libcontainer type.
type libcontainerAdapter struct {
// cgroupManagerType defines how to interface with libcontainer
cgroupManagerType libcontainerCgroupManagerType
}
// newLibcontainerAdapter returns a configured libcontainerAdapter for the specified manager.
// It does any initialization required by that manager to function.
func newLibcontainerAdapter(cgroupManagerType libcontainerCgroupManagerType) *libcontainerAdapter {
return &libcontainerAdapter{cgroupManagerType: cgroupManagerType}
}
// newManager returns an implementation of cgroups.Manager
func (l *libcontainerAdapter) newManager(cgroups *libcontainerconfigs.Cgroup, paths map[string]string) (libcontainercgroups.Manager, error) {
switch l.cgroupManagerType {
case libcontainerCgroupfs:
return &cgroupfs.Manager{
Cgroups: cgroups,
Paths: paths,
}, nil
case libcontainerSystemd:
// this means you asked systemd to manage cgroups, but systemd was not on the host, so all you can do is panic...
if !cgroupsystemd.UseSystemd() {
panic("systemd cgroup manager not available")
}
return &cgroupsystemd.Manager{
Cgroups: cgroups,
Paths: paths,
}, nil
}
return nil, fmt.Errorf("invalid cgroup manager configuration")
}
func (l *libcontainerAdapter) revertName(name string) CgroupName {
if l.cgroupManagerType != libcontainerSystemd {
return CgroupName(name)
}
return CgroupName(RevertFromSystemdToCgroupStyleName(name))
}
func RevertFromSystemdToCgroupStyleName(name string) string {
driverName, err := ConvertCgroupFsNameToSystemd(name)
if err != nil {
panic(err)
}
driverName = strings.TrimSuffix(driverName, systemdSuffix)
driverName = strings.Replace(driverName, "-", "/", -1)
driverName = strings.Replace(driverName, "_", "-", -1)
return driverName
}
// adaptName converts a CgroupName identifier to a driver specific conversion value.
// if outputToCgroupFs is true, the result is returned in the cgroupfs format rather than the driver specific form.
func (l *libcontainerAdapter) adaptName(cgroupName CgroupName, outputToCgroupFs bool) string {
if l.cgroupManagerType != libcontainerSystemd {
name := string(cgroupName)
return name
}
return ConvertCgroupNameToSystemd(cgroupName, outputToCgroupFs)
}
// CgroupSubsystems holds information about the mounted cgroup subsystems
type CgroupSubsystems struct {
// Cgroup subsystem mounts.
// e.g.: "/sys/fs/cgroup/cpu" -> ["cpu", "cpuacct"]
Mounts []libcontainercgroups.Mount
// Cgroup subsystem to their mount location.
// e.g.: "cpu" -> "/sys/fs/cgroup/cpu"
MountPoints map[string]string
}
// cgroupManagerImpl implements the CgroupManager interface.
// It is a stateless object which can be used to
// update, create, or delete any number of cgroups.
// It uses the Libcontainer raw fs cgroup manager for cgroup management.
type cgroupManagerImpl struct {
// subsystems holds information about all the
// mounted cgroup subsystems on the node
subsystems *CgroupSubsystems
// simplifies interaction with libcontainer and its cgroup managers
adapter *libcontainerAdapter
}
// Make sure that cgroupManagerImpl implements the CgroupManager interface
var _ CgroupManager = &cgroupManagerImpl{}
// NewCgroupManager is a factory method that returns a CgroupManager
func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
managerType := libcontainerCgroupfs
if cgroupDriver == string(libcontainerSystemd) {
managerType = libcontainerSystemd
}
return &cgroupManagerImpl{
subsystems: cs,
adapter: newLibcontainerAdapter(managerType),
}
}
// Name converts the cgroup to the driver specific value in cgroupfs form.
func (m *cgroupManagerImpl) Name(name CgroupName) string {
return m.adapter.adaptName(name, true)
}
// CgroupName converts the literal cgroupfs name on the host to an internal identifier.
func (m *cgroupManagerImpl) CgroupName(name string) CgroupName {
return m.adapter.revertName(name)
}
// buildCgroupPaths builds a path to each cgroup subsystem for the specified name.
func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string {
cgroupFsAdaptedName := m.Name(name)
cgroupPaths := make(map[string]string, len(m.subsystems.MountPoints))
for key, val := range m.subsystems.MountPoints {
cgroupPaths[key] = path.Join(val, cgroupFsAdaptedName)
}
return cgroupPaths
}
// Exists checks if all subsystem cgroups already exist
func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
// Get map of all cgroup paths on the system for the particular cgroup
cgroupPaths := m.buildCgroupPaths(name)
// the presence of alternative control groups not known to runc confuses
// the kubelet existence checks.
// ideally, we would have a mechanism in runc to support Exists() logic
// scoped to the set control groups it understands. this is being discussed
// in https://github.com/opencontainers/runc/issues/1440
// once resolved, we can remove this code.
whitelistControllers := sets.NewString("cpu", "cpuacct", "cpuset", "memory", "systemd")
// If even one cgroup path doesn't exist, then the cgroup doesn't exist.
for controller, path := range cgroupPaths {
// ignore mounts we don't care about
if !whitelistControllers.Has(controller) {
continue
}
if !libcontainercgroups.PathExists(path) {
return false
}
}
return true
}
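
As a rough standalone illustration of buildCgroupPaths and Exists above, assuming hypothetical subsystem mount points rather than ones discovered from the host:

package main

import (
	"fmt"
	"os"
	"path"
)

func main() {
	// Hypothetical subsystem mount points; the real code discovers them via
	// GetCgroupSubsystems and only checks a whitelist of controllers.
	mountPoints := map[string]string{
		"cpu":    "/sys/fs/cgroup/cpu",
		"memory": "/sys/fs/cgroup/memory",
	}
	cgroupFsName := "/kubepods"
	exists := true
	for subsystem, mount := range mountPoints {
		p := path.Join(mount, cgroupFsName)
		fmt.Printf("%s -> %s\n", subsystem, p)
		if _, err := os.Stat(p); err != nil {
			exists = false
		}
	}
	fmt.Println("cgroup exists in all subsystems:", exists)
}
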
// Destroy destroys the specified cgroup
func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInMicroseconds(start))
}()
cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name)
// we take the location in traditional cgroupfs format.
abstractCgroupFsName := string(cgroupConfig.Name)
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
abstractName := CgroupName(path.Base(abstractCgroupFsName))
driverParent := m.adapter.adaptName(abstractParent, false)
driverName := m.adapter.adaptName(abstractName, false)
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
if m.adapter.cgroupManagerType == libcontainerSystemd {
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
}
// Initialize libcontainer's cgroup config with driver specific naming.
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
Name: driverName,
Parent: driverParent,
}
manager, err := m.adapter.newManager(libcontainerCgroupConfig, cgroupPaths)
if err != nil {
return err
}
// Delete cgroups using the libcontainer Manager's Destroy() method
if err = manager.Destroy(); err != nil {
return fmt.Errorf("Unable to destroy cgroup paths for cgroup %v : %v", cgroupConfig.Name, err)
}
return nil
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Set the cgroup represented by cgroup.
Set(path string, cgroup *libcontainerconfigs.Cgroup) error
// GetStats returns the statistics associated with the cgroup
GetStats(path string, stats *libcontainercgroups.Stats) error
}
// getSupportedSubsystems returns the list of supported subsystems
func getSupportedSubsystems() []subsystem {
supportedSubsystems := []subsystem{
&cgroupfs.MemoryGroup{},
&cgroupfs.CpuGroup{},
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
supportedSubsystems = append(supportedSubsystems, &cgroupfs.HugetlbGroup{})
}
return supportedSubsystems
}
// setSupportedSubsystems sets cgroup resource limits only on the supported
// subsystems, i.e. cpu and memory. We don't use libcontainer's cgroup/fs/Set()
// method as it doesn't allow us to skip updates on the devices cgroup.
// Allowing or denying all devices by writing 'a' to devices.allow or devices.deny is
// not possible once the devices cgroup has children. Once the pod level cgroups are
// created under the QoS level cgroup we cannot update the QoS level device cgroup.
// We would like to skip setting any values on the device cgroup in this case,
// but this is not possible with libcontainer's Set() method.
// See https://github.com/opencontainers/runc/issues/932
func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error {
for _, sys := range getSupportedSubsystems() {
if _, ok := cgroupConfig.Paths[sys.Name()]; !ok {
return fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
}
if err := sys.Set(cgroupConfig.Paths[sys.Name()], cgroupConfig); err != nil {
return fmt.Errorf("Failed to set config for supported subsystems : %v", err)
}
}
return nil
}
func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
resources := &libcontainerconfigs.Resources{}
if resourceConfig == nil {
return resources
}
if resourceConfig.Memory != nil {
resources.Memory = *resourceConfig.Memory
}
if resourceConfig.CpuShares != nil {
resources.CpuShares = *resourceConfig.CpuShares
}
if resourceConfig.CpuQuota != nil {
resources.CpuQuota = *resourceConfig.CpuQuota
}
if resourceConfig.CpuPeriod != nil {
resources.CpuPeriod = *resourceConfig.CpuPeriod
}
// if huge pages are enabled, we set them in libcontainer
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
// for each page size enumerated, set that value
pageSizes := sets.NewString()
for pageSize, limit := range resourceConfig.HugePageLimit {
sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList)
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: sizeString,
Limit: uint64(limit),
})
pageSizes.Insert(sizeString)
}
// for each page size omitted, limit to 0
for _, pageSize := range cgroupfs.HugePageSizes {
if pageSizes.Has(pageSize) {
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: pageSize,
Limit: uint64(0),
})
}
}
return resources
}
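
A small standalone sketch of the hugetlb unit conversion used above: go-units renders a page size in bytes into the canonical strings from hugePageSizeList, so a 2Mi page size becomes "2MB" and a 1Gi page size becomes "1GB".

package main

import (
	"fmt"

	units "github.com/docker/go-units"
)

func main() {
	// Same unit list as hugePageSizeList above.
	sizes := []string{"B", "kB", "MB", "GB", "TB", "PB"}
	for _, pageSize := range []int64{2 * 1024 * 1024, 1024 * 1024 * 1024} {
		fmt.Println(units.CustomSize("%g%s", float64(pageSize), 1024.0, sizes))
	}
	// Output:
	// 2MB
	// 1GB
}
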
// Update updates the cgroup with the specified Cgroup Configuration
func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInMicroseconds(start))
}()
// Extract the cgroup resource parameters
resourceConfig := cgroupConfig.ResourceParameters
resources := m.toResources(resourceConfig)
cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name)
// we take the location in traditional cgroupfs format.
abstractCgroupFsName := string(cgroupConfig.Name)
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
abstractName := CgroupName(path.Base(abstractCgroupFsName))
driverParent := m.adapter.adaptName(abstractParent, false)
driverName := m.adapter.adaptName(abstractName, false)
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
if m.adapter.cgroupManagerType == libcontainerSystemd {
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
}
// Initialize libcontainer's cgroup config
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
Name: driverName,
Parent: driverParent,
Resources: resources,
Paths: cgroupPaths,
}
if err := setSupportedSubsystems(libcontainerCgroupConfig); err != nil {
return fmt.Errorf("failed to set supported cgroup subsystems for cgroup %v: %v", cgroupConfig.Name, err)
}
return nil
}
// Create creates the specified cgroup
func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start))
}()
// we take the location in traditional cgroupfs format.
abstractCgroupFsName := string(cgroupConfig.Name)
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
abstractName := CgroupName(path.Base(abstractCgroupFsName))
driverParent := m.adapter.adaptName(abstractParent, false)
driverName := m.adapter.adaptName(abstractName, false)
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
if m.adapter.cgroupManagerType == libcontainerSystemd {
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
}
resources := m.toResources(cgroupConfig.ResourceParameters)
// Initialize libcontainer's cgroup config with driver specific naming.
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
Name: driverName,
Parent: driverParent,
Resources: resources,
}
// get the manager with the specified cgroup configuration
manager, err := m.adapter.newManager(libcontainerCgroupConfig, nil)
if err != nil {
return err
}
// Apply(-1) is a hack to create the cgroup directories for each resource
// subsystem. The function [cgroups.Manager.apply()] applies cgroup
// configuration to the process with the specified pid.
// It creates cgroup files for each subsystems and writes the pid
// in the tasks file. We use the function to create all the required
// cgroup files but not attach any "real" pid to the cgroup.
if err := manager.Apply(-1); err != nil {
return err
}
// It may seem confusing that we call Update (set) right after Apply, but runc
// follows a similar pattern; it is needed to ensure the cpu quota is set properly.
m.Update(cgroupConfig)
return nil
}
// Scans through all subsystems to find pids associated with the specified cgroup.
func (m *cgroupManagerImpl) Pids(name CgroupName) []int {
// we need the driver specific name
cgroupFsName := m.Name(name)
// Get a list of processes that we need to kill
pidsToKill := sets.NewInt()
var pids []int
for _, val := range m.subsystems.MountPoints {
dir := path.Join(val, cgroupFsName)
_, err := os.Stat(dir)
if os.IsNotExist(err) {
// The subsystem pod cgroup is already deleted
// do nothing, continue
continue
}
// Get a list of pids that are still charged to the pod's cgroup
pids, err = getCgroupProcs(dir)
if err != nil {
continue
}
pidsToKill.Insert(pids...)
// WalkFunc which is called for each file and directory in the pod cgroup dir
visitor := func(path string, info os.FileInfo, err error) error {
if err != nil {
glog.V(4).Infof("cgroup manager encountered error scanning cgroup path %q: %v", path, err)
return filepath.SkipDir
}
if !info.IsDir() {
return nil
}
pids, err = getCgroupProcs(path)
if err != nil {
glog.V(4).Infof("cgroup manager encountered error getting procs for cgroup path %q: %v", path, err)
return filepath.SkipDir
}
pidsToKill.Insert(pids...)
return nil
}
// Walk through the pod cgroup directory to check if
// container cgroups haven't been GCed yet. Get attached processes to
// all such unwanted containers under the pod cgroup
if err = filepath.Walk(dir, visitor); err != nil {
glog.V(4).Infof("cgroup manager encountered error scanning pids for directory: %q: %v", dir, err)
}
}
return pidsToKill.List()
}
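
The getCgroupProcs helper used above is not shown in this diff; a simplified standalone stand-in that reads pids from a cgroup.procs file (the path below is hypothetical) could look like this:

package main

import (
	"fmt"
	"io/ioutil"
	"path/filepath"
	"strconv"
	"strings"
)

// readCgroupProcs parses the cgroup.procs file of a cgroup directory into pids.
func readCgroupProcs(dir string) ([]int, error) {
	data, err := ioutil.ReadFile(filepath.Join(dir, "cgroup.procs"))
	if err != nil {
		return nil, err
	}
	var pids []int
	for _, field := range strings.Fields(string(data)) {
		pid, err := strconv.Atoi(field)
		if err != nil {
			return nil, err
		}
		pids = append(pids, pid)
	}
	return pids, nil
}

func main() {
	// Hypothetical cgroup v1 directory.
	pids, err := readCgroupProcs("/sys/fs/cgroup/memory")
	fmt.Println(pids, err)
}
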
// ReduceCPULimits reduces the cgroup's cpu shares to the lowest possible value
func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
// Set lowest possible CpuShares value for the cgroup
minimumCPUShares := uint64(MinShares)
resources := &ResourceConfig{
CpuShares: &minimumCPUShares,
}
containerConfig := &CgroupConfig{
Name: cgroupName,
ResourceParameters: resources,
}
return m.Update(containerConfig)
}
func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) {
stats := libcontainercgroups.NewStats()
for _, sys := range getSupportedSubsystems() {
if _, ok := cgroupPaths[sys.Name()]; !ok {
return nil, fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
}
if err := sys.GetStats(cgroupPaths[sys.Name()], stats); err != nil {
return nil, fmt.Errorf("Failed to get stats for supported subsystems : %v", err)
}
}
return stats, nil
}
func toResourceStats(stats *libcontainercgroups.Stats) *ResourceStats {
return &ResourceStats{
MemoryStats: &MemoryStats{
Usage: int64(stats.MemoryStats.Usage.Usage),
},
}
}
// Get sets the ResourceParameters of the specified cgroup as read from the cgroup fs
func (m *cgroupManagerImpl) GetResourceStats(name CgroupName) (*ResourceStats, error) {
cgroupPaths := m.buildCgroupPaths(name)
stats, err := getStatsSupportedSubsystems(cgroupPaths)
if err != nil {
return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err)
}
return toResourceStats(stats), nil
}


@@ -0,0 +1,101 @@
// +build linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import "testing"
func TestLibcontainerAdapterAdaptToSystemd(t *testing.T) {
testCases := []struct {
input string
expected string
}{
{
input: "/",
expected: "-.slice",
},
{
input: "/system.slice",
expected: "system.slice",
},
{
input: "/system.slice/Burstable",
expected: "system-Burstable.slice",
},
{
input: "/Burstable.slice/Burstable-pod_123.slice",
expected: "Burstable-pod_123.slice",
},
{
input: "/test.slice/test-a.slice/test-a-b.slice",
expected: "test-a-b.slice",
},
{
input: "/test.slice/test-a.slice/test-a-b.slice/Burstable",
expected: "test-a-b-Burstable.slice",
},
{
input: "/Burstable",
expected: "Burstable.slice",
},
{
input: "/Burstable/pod_123",
expected: "Burstable-pod_123.slice",
},
{
input: "/BestEffort/pod_6c1a4e95-6bb6-11e6-bc26-28d2444e470d",
expected: "BestEffort-pod_6c1a4e95_6bb6_11e6_bc26_28d2444e470d.slice",
},
}
for _, testCase := range testCases {
f := newLibcontainerAdapter(libcontainerSystemd)
if actual := f.adaptName(CgroupName(testCase.input), false); actual != testCase.expected {
t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v", testCase.input, testCase.expected, actual)
}
}
}
func TestLibcontainerAdapterAdaptToSystemdAsCgroupFs(t *testing.T) {
testCases := []struct {
input string
expected string
}{
{
input: "/",
expected: "/",
},
{
input: "/Burstable",
expected: "Burstable.slice/",
},
{
input: "/Burstable/pod_123",
expected: "Burstable.slice/Burstable-pod_123.slice/",
},
{
input: "/BestEffort/pod_6c1a4e95-6bb6-11e6-bc26-28d2444e470d",
expected: "BestEffort.slice/BestEffort-pod_6c1a4e95_6bb6_11e6_bc26_28d2444e470d.slice/",
},
}
for _, testCase := range testCases {
f := newLibcontainerAdapter(libcontainerSystemd)
if actual := f.adaptName(CgroupName(testCase.input), true); actual != testCase.expected {
t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v", testCase.input, testCase.expected, actual)
}
}
}


@@ -0,0 +1,80 @@
// +build linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"reflect"
"testing"
"k8s.io/api/core/v1"
)
func Test(t *testing.T) {
tests := []struct {
input map[string]string
expected *map[v1.ResourceName]int64
}{
{
input: map[string]string{"memory": ""},
expected: nil,
},
{
input: map[string]string{"memory": "a"},
expected: nil,
},
{
input: map[string]string{"memory": "a%"},
expected: nil,
},
{
input: map[string]string{"memory": "200%"},
expected: nil,
},
{
input: map[string]string{"memory": "0%"},
expected: &map[v1.ResourceName]int64{
v1.ResourceMemory: 0,
},
},
{
input: map[string]string{"memory": "100%"},
expected: &map[v1.ResourceName]int64{
v1.ResourceMemory: 100,
},
},
{
// need to change this when CPU is added as a supported resource
input: map[string]string{"memory": "100%", "cpu": "50%"},
expected: nil,
},
}
for _, test := range tests {
actual, err := ParseQOSReserved(test.input)
if actual != nil && test.expected == nil {
t.Errorf("Unexpected success, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
}
if actual == nil && test.expected != nil {
t.Errorf("Unexpected failure, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
}
if (actual == nil && test.expected == nil) || reflect.DeepEqual(*actual, *test.expected) {
continue
}
t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
}
}


@@ -0,0 +1,87 @@
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import "fmt"
type unsupportedCgroupManager struct{}
// Make sure that unsupportedCgroupManager implements the CgroupManager interface
var _ CgroupManager = &unsupportedCgroupManager{}
type CgroupSubsystems struct {
Mounts []interface{}
MountPoints map[string]string
}
func NewCgroupManager(_ interface{}) CgroupManager {
return &unsupportedCgroupManager{}
}
func (m *unsupportedCgroupManager) Name(_ CgroupName) string {
return ""
}
func (m *unsupportedCgroupManager) Exists(_ CgroupName) bool {
return false
}
func (m *unsupportedCgroupManager) Destroy(_ *CgroupConfig) error {
return nil
}
func (m *unsupportedCgroupManager) Update(_ *CgroupConfig) error {
return nil
}
func (m *unsupportedCgroupManager) Create(_ *CgroupConfig) error {
return fmt.Errorf("Cgroup Manager is not supported in this build")
}
func (m *unsupportedCgroupManager) GetResourceStats(name CgroupName) (*ResourceStats, error) {
return nil, fmt.Errorf("Cgroup Manager is not supported in this build")
}
func (m *unsupportedCgroupManager) Pids(_ CgroupName) []int {
return nil
}
func (m *unsupportedCgroupManager) CgroupName(name string) CgroupName {
return ""
}
func (m *unsupportedCgroupManager) ReduceCPULimits(cgroupName CgroupName) error {
return nil
}
func ConvertCgroupFsNameToSystemd(cgroupfsName string) (string, error) {
return "", nil
}
func ConvertCgroupNameToSystemd(cgroupName CgroupName, outputToCgroupFs bool) string {
return ""
}
func RevertFromSystemdToCgroupStyleName(name string) string {
return ""
}
func IsSystemdStyleName(name string) bool {
return false
}


@@ -0,0 +1,168 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"time"
"k8s.io/apimachinery/pkg/util/sets"
// TODO: Migrate kubelet to either use its own internal objects or client library.
"k8s.io/api/core/v1"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
"fmt"
"strconv"
"strings"
)
type ActivePodsFunc func() []*v1.Pod
// Manages the containers running on a machine.
type ContainerManager interface {
// Runs the container manager's housekeeping.
// - Ensures that the Docker daemon is in a container.
// - Creates the system container where all non-containerized processes run.
Start(*v1.Node, ActivePodsFunc, config.SourcesReady, status.PodStatusProvider, internalapi.RuntimeService) error
// SystemCgroupsLimit returns resources allocated to system cgroups in the machine.
// These cgroups include the system and Kubernetes services.
SystemCgroupsLimit() v1.ResourceList
// GetNodeConfig returns a NodeConfig that is being used by the container manager.
GetNodeConfig() NodeConfig
// Status returns internal Status.
Status() Status
// NewPodContainerManager is a factory method which returns a podContainerManager object
// Returns a noop implementation if qos cgroup hierarchy is not enabled
NewPodContainerManager() PodContainerManager
// GetMountedSubsystems returns the mounted cgroup subsystems on the node
GetMountedSubsystems() *CgroupSubsystems
// GetQOSContainersInfo returns the names of top level QoS containers
GetQOSContainersInfo() QOSContainersInfo
// GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved from scheduling.
GetNodeAllocatableReservation() v1.ResourceList
// GetCapacity returns the amount of compute resources tracked by container manager available on the node.
GetCapacity() v1.ResourceList
// GetDevicePluginResourceCapacity returns the amount of device plugin resources available on the node
// and inactive device plugin resources previously registered on the node.
GetDevicePluginResourceCapacity() (v1.ResourceList, []string)
// UpdateQOSCgroups performs housekeeping updates to ensure that the top
// level QoS containers have their desired state in a thread-safe way
UpdateQOSCgroups() error
// GetResources returns RunContainerOptions with devices, mounts, and env fields populated for
// extended resources required by container.
GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error)
// UpdatePluginResources calls Allocate of device plugin handler for potential
// requests for device plugin resources, and returns an error if fails.
// Otherwise, it updates allocatableResource in nodeInfo if necessary,
// to make sure it is at least equal to the pod's requested capacity for
// any registered device plugin resource
UpdatePluginResources(*schedulercache.NodeInfo, *lifecycle.PodAdmitAttributes) error
InternalContainerLifecycle() InternalContainerLifecycle
}
type NodeConfig struct {
RuntimeCgroupsName string
SystemCgroupsName string
KubeletCgroupsName string
ContainerRuntime string
CgroupsPerQOS bool
CgroupRoot string
CgroupDriver string
KubeletRootDir string
ProtectKernelDefaults bool
NodeAllocatableConfig
ExperimentalQOSReserved map[v1.ResourceName]int64
ExperimentalCPUManagerPolicy string
ExperimentalCPUManagerReconcilePeriod time.Duration
}
type NodeAllocatableConfig struct {
KubeReservedCgroupName string
SystemReservedCgroupName string
EnforceNodeAllocatable sets.String
KubeReserved v1.ResourceList
SystemReserved v1.ResourceList
HardEvictionThresholds []evictionapi.Threshold
}
type Status struct {
// Any soft requirements that were unsatisfied.
SoftRequirements error
}
const (
// User visible keys for managing node allocatable enforcement on the node.
NodeAllocatableEnforcementKey = "pods"
SystemReservedEnforcementKey = "system-reserved"
KubeReservedEnforcementKey = "kube-reserved"
)
// containerManager for the kubelet is currently an injected dependency.
// We need to parse the --qos-reserve-requests option in
// cmd/kubelet/app/server.go and there isn't really a good place to put
// the code. If/When the kubelet dependency injection gets worked out,
// maybe there will be a better place for it.
func parsePercentage(v string) (int64, error) {
if !strings.HasSuffix(v, "%") {
return 0, fmt.Errorf("percentage expected, got '%s'", v)
}
percentage, err := strconv.ParseInt(strings.TrimRight(v, "%"), 10, 0)
if err != nil {
return 0, fmt.Errorf("invalid number in percentage '%s'", v)
}
if percentage < 0 || percentage > 100 {
return 0, fmt.Errorf("percentage must be between 0 and 100")
}
return percentage, nil
}
// ParseQOSReserved parses the --qos-reserve-requests option
func ParseQOSReserved(m map[string]string) (*map[v1.ResourceName]int64, error) {
reservations := make(map[v1.ResourceName]int64)
for k, v := range m {
switch v1.ResourceName(k) {
// Only memory resources are supported.
case v1.ResourceMemory:
q, err := parsePercentage(v)
if err != nil {
return nil, err
}
reservations[v1.ResourceName(k)] = q
default:
return nil, fmt.Errorf("cannot reserve %q resource", k)
}
}
return &reservations, nil
}
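
To illustrate the accepted format, a standalone sketch of the same percentage parsing applied to --qos-reserve-requests style values; it mirrors parsePercentage above rather than calling the vendored function:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parsePercentage accepts an integer percentage with a trailing "%" in the range 0-100.
func parsePercentage(v string) (int64, error) {
	if !strings.HasSuffix(v, "%") {
		return 0, fmt.Errorf("percentage expected, got '%s'", v)
	}
	percentage, err := strconv.ParseInt(strings.TrimRight(v, "%"), 10, 0)
	if err != nil {
		return 0, fmt.Errorf("invalid number in percentage '%s'", v)
	}
	if percentage < 0 || percentage > 100 {
		return 0, fmt.Errorf("percentage must be between 0 and 100")
	}
	return percentage, nil
}

func main() {
	// e.g. --qos-reserve-requests=memory=50%
	for _, v := range []string{"50%", "0%", "200%", "abc"} {
		p, err := parsePercentage(v)
		fmt.Println(v, "->", p, err)
	}
}
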


@@ -0,0 +1,892 @@
// +build linux
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path"
"strconv"
"strings"
"sync"
"time"
"github.com/golang/glog"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/record"
kubefeatures "k8s.io/kubernetes/pkg/features"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/deviceplugin"
cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/status"
utilfile "k8s.io/kubernetes/pkg/util/file"
"k8s.io/kubernetes/pkg/util/mount"
"k8s.io/kubernetes/pkg/util/oom"
"k8s.io/kubernetes/pkg/util/procfs"
utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
utilversion "k8s.io/kubernetes/pkg/util/version"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
const (
// The percent of the machine memory capacity. The value is used to calculate
// the docker memory resource container's hard limit to work around the docker
// memory leakage issue. Please see kubernetes/issues/9881 for more detail.
DockerMemoryLimitThresholdPercent = 70
// The minimum memory limit allocated to docker container: 150Mi
MinDockerMemoryLimit = 150 * 1024 * 1024
dockerProcessName = "docker"
dockerPidFile = "/var/run/docker.pid"
containerdProcessName = "docker-containerd"
containerdPidFile = "/run/docker/libcontainerd/docker-containerd.pid"
)
var (
// The docker version in which containerd was introduced.
containerdAPIVersion = utilversion.MustParseGeneric("1.23")
)
// A non-user container tracked by the Kubelet.
type systemContainer struct {
// Absolute name of the container.
name string
// CPU limit in millicores.
cpuMillicores int64
// Function that ensures the state of the container.
// m is the cgroup manager for the specified container.
ensureStateFunc func(m *fs.Manager) error
// Manager for the cgroups of the external container.
manager *fs.Manager
}
func newSystemCgroups(containerName string) *systemContainer {
return &systemContainer{
name: containerName,
manager: createManager(containerName),
}
}
type containerManagerImpl struct {
sync.RWMutex
cadvisorInterface cadvisor.Interface
mountUtil mount.Interface
NodeConfig
status Status
// External containers being managed.
systemContainers []*systemContainer
qosContainers QOSContainersInfo
// Tasks that are run periodically
periodicTasks []func()
// Holds all the mounted cgroup subsystems
subsystems *CgroupSubsystems
nodeInfo *v1.Node
// Interface for cgroup management
cgroupManager CgroupManager
// Capacity of this node.
capacity v1.ResourceList
// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
// This path includes a top level container for enforcing Node Allocatable.
cgroupRoot string
// Event recorder interface.
recorder record.EventRecorder
// Interface for QoS cgroup management
qosContainerManager QOSContainerManager
// Interface for exporting and allocating devices reported by device plugins.
devicePluginManager deviceplugin.Manager
// Interface for CPU affinity management.
cpuManager cpumanager.Manager
}
type features struct {
cpuHardcapping bool
}
var _ ContainerManager = &containerManagerImpl{}
// checks if the required cgroups subsystems are mounted.
// As of now, only 'cpu' and 'memory' are required.
// cpu quota is a soft requirement.
func validateSystemRequirements(mountUtil mount.Interface) (features, error) {
const (
cgroupMountType = "cgroup"
localErr = "system validation failed"
)
var (
cpuMountPoint string
f features
)
mountPoints, err := mountUtil.List()
if err != nil {
return f, fmt.Errorf("%s - %v", localErr, err)
}
expectedCgroups := sets.NewString("cpu", "cpuacct", "cpuset", "memory")
for _, mountPoint := range mountPoints {
if mountPoint.Type == cgroupMountType {
for _, opt := range mountPoint.Opts {
if expectedCgroups.Has(opt) {
expectedCgroups.Delete(opt)
}
if opt == "cpu" {
cpuMountPoint = mountPoint.Path
}
}
}
}
if expectedCgroups.Len() > 0 {
return f, fmt.Errorf("%s - Following Cgroup subsystem not mounted: %v", localErr, expectedCgroups.List())
}
// Check if cpu quota is available.
// The CPU cgroup is required and so it is expected to be mounted at this point.
periodExists, err := utilfile.FileExists(path.Join(cpuMountPoint, "cpu.cfs_period_us"))
if err != nil {
glog.Errorf("failed to detect if CPU cgroup cpu.cfs_period_us is available - %v", err)
}
quotaExists, err := utilfile.FileExists(path.Join(cpuMountPoint, "cpu.cfs_quota_us"))
if err != nil {
glog.Errorf("failed to detect if CPU cgroup cpu.cfs_quota_us is available - %v", err)
}
if quotaExists && periodExists {
f.cpuHardcapping = true
}
return f, nil
}
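
A simplified standalone stand-in for the mount scan above, listing cgroup v1 mount points and their controller options directly from /proc/mounts instead of going through the mount utility interface:

package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

func main() {
	f, err := os.Open("/proc/mounts")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer f.Close()
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		// /proc/mounts format: device mountpoint fstype options dump pass
		fields := strings.Fields(scanner.Text())
		if len(fields) >= 4 && fields[2] == "cgroup" {
			fmt.Printf("%s -> %s\n", fields[1], fields[3])
		}
	}
}
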
// TODO(vmarmol): Add limits to the system containers.
// Takes the absolute name of the specified containers.
// Empty container name disables use of the specified container.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, devicePluginEnabled bool, recorder record.EventRecorder) (ContainerManager, error) {
subsystems, err := GetCgroupSubsystems()
if err != nil {
return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
}
// Check whether swap is enabled. The Kubelet does not support running with swap enabled.
swapData, err := ioutil.ReadFile("/proc/swaps")
if err != nil {
return nil, err
}
swapData = bytes.TrimSpace(swapData) // extra trailing \n
swapLines := strings.Split(string(swapData), "\n")
// If there is more than one line (table headers) in /proc/swaps, swap is enabled and we should
// error out unless --fail-swap-on is set to false.
if failSwapOn && len(swapLines) > 1 {
return nil, fmt.Errorf("Running with swap on is not supported, please disable swap! or set --fail-swap-on flag to false. /proc/swaps contained: %v", swapLines)
}
var capacity = v1.ResourceList{}
// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
// machine info is computed and cached once as part of cAdvisor object creation.
// But `RootFsInfo` and `ImagesFsInfo` are not available at this moment so they will be called later during manager starts
machineInfo, err := cadvisorInterface.MachineInfo()
if err != nil {
return nil, err
}
capacity = cadvisor.CapacityFromMachineInfo(machineInfo)
cgroupRoot := nodeConfig.CgroupRoot
cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
// Check if Cgroup-root actually exists on the node
if nodeConfig.CgroupsPerQOS {
// this does default to / when enabled, but this tests against regressions.
if nodeConfig.CgroupRoot == "" {
return nil, fmt.Errorf("invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root")
}
// we need to check that the cgroup root actually exists for each subsystem
// of note, we always use the cgroupfs driver when performing this check since
// the input is provided in that format.
// this is important because we do not want any name conversion to occur.
if !cgroupManager.Exists(CgroupName(cgroupRoot)) {
return nil, fmt.Errorf("invalid configuration: cgroup-root %q doesn't exist: %v", cgroupRoot, err)
}
glog.Infof("container manager verified user specified cgroup-root exists: %v", cgroupRoot)
// Include the top level cgroup for enforcing node allocatable into cgroup-root.
// This way, all sub modules can avoid having to understand the concept of node allocatable.
cgroupRoot = path.Join(cgroupRoot, defaultNodeAllocatableCgroupName)
}
glog.Infof("Creating Container Manager object based on Node Config: %+v", nodeConfig)
qosContainerManager, err := NewQOSContainerManager(subsystems, cgroupRoot, nodeConfig, cgroupManager)
if err != nil {
return nil, err
}
cm := &containerManagerImpl{
cadvisorInterface: cadvisorInterface,
mountUtil: mountUtil,
NodeConfig: nodeConfig,
subsystems: subsystems,
cgroupManager: cgroupManager,
capacity: capacity,
cgroupRoot: cgroupRoot,
recorder: recorder,
qosContainerManager: qosContainerManager,
}
glog.Infof("Creating device plugin manager: %t", devicePluginEnabled)
if devicePluginEnabled {
cm.devicePluginManager, err = deviceplugin.NewManagerImpl()
} else {
cm.devicePluginManager, err = deviceplugin.NewManagerStub()
}
if err != nil {
return nil, err
}
// Initialize CPU manager
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
cm.cpuManager, err = cpumanager.NewManager(
nodeConfig.ExperimentalCPUManagerPolicy,
nodeConfig.ExperimentalCPUManagerReconcilePeriod,
machineInfo,
cm.GetNodeAllocatableReservation(),
nodeConfig.KubeletRootDir,
)
if err != nil {
glog.Errorf("failed to initialize cpu manager: %v", err)
return nil, err
}
}
return cm, nil
}
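
The swap check in NewContainerManager boils down to counting lines in /proc/swaps; a standalone sketch of the same idea:

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"strings"
)

func main() {
	// /proc/swaps always contains a header line, so more than one line
	// means at least one swap device or file is active.
	data, err := ioutil.ReadFile("/proc/swaps")
	if err != nil {
		fmt.Println(err)
		return
	}
	lines := strings.Split(string(bytes.TrimSpace(data)), "\n")
	fmt.Println("swap enabled:", len(lines) > 1)
}
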
// NewPodContainerManager is a factory method that returns a PodContainerManager object.
// If QoS cgroups are enabled it returns the general pod container manager,
// otherwise it returns a no-op manager which essentially does nothing.
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
if cm.NodeConfig.CgroupsPerQOS {
return &podContainerManagerImpl{
qosContainersInfo: cm.GetQOSContainersInfo(),
subsystems: cm.subsystems,
cgroupManager: cm.cgroupManager,
}
}
return &podContainerManagerNoop{
cgroupRoot: CgroupName(cm.cgroupRoot),
}
}
func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cm.cpuManager}
}
// Create a cgroup container manager.
func createManager(containerName string) *fs.Manager {
allowAllDevices := true
return &fs.Manager{
Cgroups: &configs.Cgroup{
Parent: "/",
Name: containerName,
Resources: &configs.Resources{
AllowAllDevices: &allowAllDevices,
},
},
}
}
type KernelTunableBehavior string
const (
KernelTunableWarn KernelTunableBehavior = "warn"
KernelTunableError KernelTunableBehavior = "error"
KernelTunableModify KernelTunableBehavior = "modify"
)
// setupKernelTunables validates kernel tunable flags are set as expected
// depending upon the specified option, it will either warn, error, or modify the kernel tunable flags
func setupKernelTunables(option KernelTunableBehavior) error {
desiredState := map[string]int{
utilsysctl.VmOvercommitMemory: utilsysctl.VmOvercommitMemoryAlways,
utilsysctl.VmPanicOnOOM: utilsysctl.VmPanicOnOOMInvokeOOMKiller,
utilsysctl.KernelPanic: utilsysctl.KernelPanicRebootTimeout,
utilsysctl.KernelPanicOnOops: utilsysctl.KernelPanicOnOopsAlways,
utilsysctl.RootMaxKeys: utilsysctl.RootMaxKeysSetting,
utilsysctl.RootMaxBytes: utilsysctl.RootMaxBytesSetting,
}
sysctl := utilsysctl.New()
errList := []error{}
for flag, expectedValue := range desiredState {
val, err := sysctl.GetSysctl(flag)
if err != nil {
errList = append(errList, err)
continue
}
if val == expectedValue {
continue
}
switch option {
case KernelTunableError:
errList = append(errList, fmt.Errorf("Invalid kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val))
case KernelTunableWarn:
glog.V(2).Infof("Invalid kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val)
case KernelTunableModify:
glog.V(2).Infof("Updating kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val)
err = sysctl.SetSysctl(flag, expectedValue)
if err != nil {
errList = append(errList, err)
}
}
}
return utilerrors.NewAggregate(errList)
}
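
The sysctl helpers used above ultimately map flag names such as vm.overcommit_memory to files under /proc/sys; a minimal standalone read of one of those flags (writing them would require root):

package main

import (
	"fmt"
	"io/ioutil"
	"strings"
)

func main() {
	// vm.overcommit_memory corresponds to /proc/sys/vm/overcommit_memory.
	data, err := ioutil.ReadFile("/proc/sys/vm/overcommit_memory")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("vm.overcommit_memory =", strings.TrimSpace(string(data)))
}
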
func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
f, err := validateSystemRequirements(cm.mountUtil)
if err != nil {
return err
}
if !f.cpuHardcapping {
cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported")
}
b := KernelTunableModify
if cm.GetNodeConfig().ProtectKernelDefaults {
b = KernelTunableError
}
if err := setupKernelTunables(b); err != nil {
return err
}
// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
if cm.NodeConfig.CgroupsPerQOS {
if err := cm.createNodeAllocatableCgroups(); err != nil {
return err
}
err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
if err != nil {
return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
}
}
// Enforce Node Allocatable (if required)
if err := cm.enforceNodeAllocatableCgroups(); err != nil {
return err
}
systemContainers := []*systemContainer{}
if cm.ContainerRuntime == "docker" {
// With the docker-CRI integration, dockershim will manage the cgroups
// and oom score for the docker processes.
// In the future, NodeSpec should mandate the cgroup that the
// runtime processes need to be in. For now, we still check the
// cgroup for docker periodically, so that kubelet can recognize
// the cgroup for docker and serve stats for the runtime.
// TODO(#27097): Fix this after NodeSpec is clearly defined.
cm.periodicTasks = append(cm.periodicTasks, func() {
glog.V(4).Infof("[ContainerManager]: Adding periodic tasks for docker CRI integration")
cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
if err != nil {
glog.Error(err)
return
}
glog.V(2).Infof("[ContainerManager]: Discovered runtime cgroups name: %s", cont)
cm.Lock()
defer cm.Unlock()
cm.RuntimeCgroupsName = cont
})
}
if cm.SystemCgroupsName != "" {
if cm.SystemCgroupsName == "/" {
return fmt.Errorf("system container cannot be root (\"/\")")
}
cont := newSystemCgroups(cm.SystemCgroupsName)
cont.ensureStateFunc = func(manager *fs.Manager) error {
return ensureSystemCgroups("/", manager)
}
systemContainers = append(systemContainers, cont)
}
if cm.KubeletCgroupsName != "" {
cont := newSystemCgroups(cm.KubeletCgroupsName)
allowAllDevices := true
manager := fs.Manager{
Cgroups: &configs.Cgroup{
Parent: "/",
Name: cm.KubeletCgroupsName,
Resources: &configs.Resources{
AllowAllDevices: &allowAllDevices,
},
},
}
cont.ensureStateFunc = func(_ *fs.Manager) error {
return ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, &manager)
}
systemContainers = append(systemContainers, cont)
} else {
cm.periodicTasks = append(cm.periodicTasks, func() {
if err := ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, nil); err != nil {
glog.Error(err)
return
}
cont, err := getContainer(os.Getpid())
if err != nil {
glog.Errorf("failed to find cgroups of kubelet - %v", err)
return
}
cm.Lock()
defer cm.Unlock()
cm.KubeletCgroupsName = cont
})
}
cm.systemContainers = systemContainers
return nil
}
func getContainerNameForProcess(name, pidFile string) (string, error) {
pids, err := getPidsForProcess(name, pidFile)
if err != nil {
return "", fmt.Errorf("failed to detect process id for %q - %v", name, err)
}
if len(pids) == 0 {
return "", nil
}
cont, err := getContainer(pids[0])
if err != nil {
return "", err
}
return cont, nil
}
func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
cm.RLock()
defer cm.RUnlock()
return cm.NodeConfig
}
func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
return cm.subsystems
}
func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
return cm.qosContainerManager.GetQOSContainersInfo()
}
func (cm *containerManagerImpl) UpdateQOSCgroups() error {
return cm.qosContainerManager.UpdateCgroups()
}
func (cm *containerManagerImpl) Status() Status {
cm.RLock()
defer cm.RUnlock()
return cm.status
}
func (cm *containerManagerImpl) Start(node *v1.Node,
activePods ActivePodsFunc,
sourcesReady config.SourcesReady,
podStatusProvider status.PodStatusProvider,
runtimeService internalapi.RuntimeService) error {
// Initialize CPU manager
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), podStatusProvider, runtimeService)
}
// cache the node Info including resource capacity and
// allocatable of the node
cm.nodeInfo = node
// Ensure that node allocatable configuration is valid.
if err := cm.validateNodeAllocatable(); err != nil {
return err
}
// Setup the node
if err := cm.setupNode(activePods); err != nil {
return err
}
// Don't run a background thread if there are no ensureStateFuncs.
hasEnsureStateFuncs := false
for _, cont := range cm.systemContainers {
if cont.ensureStateFunc != nil {
hasEnsureStateFuncs = true
break
}
}
if hasEnsureStateFuncs {
// Run ensure state functions every minute.
go wait.Until(func() {
for _, cont := range cm.systemContainers {
if cont.ensureStateFunc != nil {
if err := cont.ensureStateFunc(cont.manager); err != nil {
glog.Warningf("[ContainerManager] Failed to ensure state of %q: %v", cont.name, err)
}
}
}
}, time.Minute, wait.NeverStop)
}
if len(cm.periodicTasks) > 0 {
go wait.Until(func() {
for _, task := range cm.periodicTasks {
if task != nil {
task()
}
}
}, 5*time.Minute, wait.NeverStop)
}
// Local storage filesystem information from `RootFsInfo` and `ImagesFsInfo` is available at a later time
// depending on the time when cadvisor manager updates container stats. Therefore use a go routine to keep
// retrieving the information until it is available.
stopChan := make(chan struct{})
go wait.Until(func() {
if err := cm.setFsCapacity(); err != nil {
glog.Errorf("[ContainerManager]: %v", err)
return
}
close(stopChan)
}, time.Second, stopChan)
// Starts device plugin manager.
if err := cm.devicePluginManager.Start(deviceplugin.ActivePodsFunc(activePods), sourcesReady); err != nil {
return err
}
return nil
}
func (cm *containerManagerImpl) setFsCapacity() error {
rootfs, err := cm.cadvisorInterface.RootFsInfo()
if err != nil {
return fmt.Errorf("Fail to get rootfs information %v", err)
}
cm.Lock()
for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
cm.capacity[rName] = rCap
}
cm.Unlock()
return nil
}
// TODO: move the GetResources logic to PodContainerManager.
func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
opts := &kubecontainer.RunContainerOptions{}
// Allocate should already be called during predicateAdmitHandler.Admit(),
// just try to fetch device runtime information from cached state here
devOpts := cm.devicePluginManager.GetDeviceRunContainerOptions(pod, container)
if devOpts == nil {
return opts, nil
}
opts.Devices = append(opts.Devices, devOpts.Devices...)
opts.Mounts = append(opts.Mounts, devOpts.Mounts...)
opts.Envs = append(opts.Envs, devOpts.Envs...)
return opts, nil
}
func (cm *containerManagerImpl) UpdatePluginResources(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return cm.devicePluginManager.Allocate(node, attrs)
}
func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {
cpuLimit := int64(0)
// Sum up resources of all external containers.
for _, cont := range cm.systemContainers {
cpuLimit += cont.cpuMillicores
}
return v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(
cpuLimit,
resource.DecimalSI),
}
}
func isProcessRunningInHost(pid int) (bool, error) {
// Get init pid namespace.
initPidNs, err := os.Readlink("/proc/1/ns/pid")
if err != nil {
return false, fmt.Errorf("failed to find pid namespace of init process")
}
glog.V(10).Infof("init pid ns is %q", initPidNs)
processPidNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", pid))
if err != nil {
return false, fmt.Errorf("failed to find pid namespace of process %q", pid)
}
glog.V(10).Infof("Pid %d pid ns is %q", pid, processPidNs)
return initPidNs == processPidNs, nil
}
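// For illustration: the readlink targets compared above have the form
// "pid:[4026531836]"; two processes are in the same pid namespace exactly
// when these strings are equal.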
func getPidFromPidFile(pidFile string) (int, error) {
file, err := os.Open(pidFile)
if err != nil {
return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, err)
}
defer file.Close()
data, err := ioutil.ReadAll(file)
if err != nil {
return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, err)
}
pid, err := strconv.Atoi(string(data))
if err != nil {
return 0, fmt.Errorf("error parsing %s as a number: %v", string(data), err)
}
return pid, nil
}
func getPidsForProcess(name, pidFile string) ([]int, error) {
if len(pidFile) == 0 {
return procfs.PidOf(name)
}
pid, err := getPidFromPidFile(pidFile)
if err == nil {
return []int{pid}, nil
}
// Try to look up the pid by process name.
pids, err2 := procfs.PidOf(name)
if err2 == nil {
return pids, nil
}
// Return error from getPidFromPidFile since that should have worked
// and is the real source of the problem.
glog.V(4).Infof("unable to get pid from %s: %v", pidFile, err)
return []int{}, err
}
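// Usage sketch (hypothetical process name and path): prefer the pidfile and
// fall back to a name lookup via procfs:
//
//	pids, err := getPidsForProcess("dockerd", "/var/run/docker.pid")
//	if err != nil {
//		// neither the pidfile nor the process name yielded any pids
//	}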
// EnsureDockerInContainer ensures that the Docker daemon is in the desired container.
// It is temporarily exported for use by dockershim.
// TODO(yujuhong): Move this function to dockershim once kubelet migrates to
// dockershim as the default.
func EnsureDockerInContainer(dockerAPIVersion *utilversion.Version, oomScoreAdj int, manager *fs.Manager) error {
type process struct{ name, file string }
dockerProcs := []process{{dockerProcessName, dockerPidFile}}
if dockerAPIVersion.AtLeast(containerdAPIVersion) {
dockerProcs = append(dockerProcs, process{containerdProcessName, containerdPidFile})
}
var errs []error
for _, proc := range dockerProcs {
pids, err := getPidsForProcess(proc.name, proc.file)
if err != nil {
errs = append(errs, fmt.Errorf("failed to get pids for %q: %v", proc.name, err))
continue
}
// Move if the pid is not already in the desired container.
for _, pid := range pids {
if err := ensureProcessInContainerWithOOMScore(pid, oomScoreAdj, manager); err != nil {
errs = append(errs, fmt.Errorf("errors moving %q pid: %v", proc.name, err))
}
}
}
return utilerrors.NewAggregate(errs)
}
func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager *fs.Manager) error {
if runningInHost, err := isProcessRunningInHost(pid); err != nil {
// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
return err
} else if !runningInHost {
// Process is running inside a container. Don't touch that.
glog.V(2).Infof("pid %d is not running in the host namespaces", pid)
return nil
}
var errs []error
if manager != nil {
cont, err := getContainer(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
}
if cont != manager.Cgroups.Name {
err = manager.Apply(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
}
}
}
// Also apply oom-score-adj to processes
oomAdjuster := oom.NewOOMAdjuster()
glog.V(5).Infof("attempting to apply oom_score_adj of %d to pid %d", oomScoreAdj, pid)
if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
glog.V(3).Infof("Failed to apply oom_score_adj %d for pid %d: %v", oomScoreAdj, pid, err)
errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
}
return utilerrors.NewAggregate(errs)
}
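// For context: the kubelet typically invokes this path with a strongly negative
// oomScoreAdj (qos.DockerOOMScoreAdj, -999) so that the runtime daemon is very
// unlikely to be OOM-killed before user pods.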
// getContainer returns the cgroup associated with the specified pid.
// It enforces a unified hierarchy for memory and cpu cgroups.
// On systemd environments, it uses the name=systemd cgroup for the specified pid.
func getContainer(pid int) (string, error) {
cgs, err := cgroups.ParseCgroupFile(fmt.Sprintf("/proc/%d/cgroup", pid))
if err != nil {
return "", err
}
cpu, found := cgs["cpu"]
if !found {
return "", cgroups.NewNotFoundError("cpu")
}
memory, found := cgs["memory"]
if !found {
return "", cgroups.NewNotFoundError("memory")
}
// since we use this container for accounting, we need to ensure it's a unified hierarchy.
if cpu != memory {
return "", fmt.Errorf("cpu and memory cgroup hierarchy not unified. cpu: %s, memory: %s", cpu, memory)
}
// on systemd, every pid is in a unified cgroup hierarchy (name=systemd as seen in systemd-cgls)
// cpu and memory accounting is off by default, users may choose to enable it per unit or globally.
// users could enable CPU and memory accounting globally via /etc/systemd/system.conf (DefaultCPUAccounting=true DefaultMemoryAccounting=true).
// users could also enable CPU and memory accounting per unit via CPUAccounting=true and MemoryAccounting=true
// we only warn if accounting is not enabled for CPU or memory so as to not break local development flows where kubelet is launched in a terminal.
// for example, the cgroup for the user session will be something like /user.slice/user-X.slice/session-X.scope, but the cpu and memory
// cgroup will be the closest ancestor where accounting is performed (most likely /) on systems that launch docker containers.
// as a result, on those systems, you will not get cpu or memory accounting statistics for kubelet.
// in addition, you would not get memory or cpu accounting for the runtime unless accounting was enabled on its unit (or globally).
if systemd, found := cgs["name=systemd"]; found {
if systemd != cpu {
glog.Warningf("CPUAccounting not enabled for pid: %d", pid)
}
if systemd != memory {
glog.Warningf("MemoryAccounting not enabled for pid: %d", pid)
}
return systemd, nil
}
return cpu, nil
}
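// For illustration, on a systemd host /proc/<pid>/cgroup typically parses into
// a map like the following (hypothetical values), in which case the
// name=systemd entry is returned:
//
//	map[string]string{
//		"cpu":          "/system.slice/docker.service",
//		"memory":       "/system.slice/docker.service",
//		"name=systemd": "/system.slice/docker.service",
//	}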
// ensureSystemCgroups ensures that the system container is created and that all
// non-kernel threads and process 1 without a container are moved to it.
//
// The reason for leaving kernel threads in the root cgroup is that we don't want to tie the
// execution of these threads to a to-be-defined /system quota, which could create priority inversions.
//
func ensureSystemCgroups(rootCgroupPath string, manager *fs.Manager) error {
// Move non-kernel PIDs to the system container.
attemptsRemaining := 10
var errs []error
for attemptsRemaining >= 0 {
// Only keep errors on latest attempt.
errs = []error{}
attemptsRemaining--
allPids, err := cmutil.GetPids(rootCgroupPath)
if err != nil {
errs = append(errs, fmt.Errorf("failed to list PIDs for root: %v", err))
continue
}
// Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers)
pids := make([]int, 0, len(allPids))
for _, pid := range allPids {
if pid == 1 || isKernelPid(pid) {
continue
}
pids = append(pids, pid)
}
glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids))
// Check if we have moved all the non-kernel PIDs.
if len(pids) == 0 {
break
}
glog.Infof("Moving non-kernel processes: %v", pids)
for _, pid := range pids {
err := manager.Apply(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, manager.Cgroups.Name, err))
}
}
}
if attemptsRemaining < 0 {
errs = append(errs, fmt.Errorf("ran out of attempts to create system containers %q", manager.Cgroups.Name))
}
return utilerrors.NewAggregate(errs)
}
// Determines whether the specified PID is a kernel PID.
func isKernelPid(pid int) bool {
// Kernel threads have no associated executable.
_, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid))
return err != nil
}
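// Example: pid 2 is normally the kernel's kthreadd, whose /proc/2/exe link does
// not resolve, so isKernelPid(2) reports true on a typical Linux host.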
// Helper for getting the docker API version.
func getDockerAPIVersion(cadvisor cadvisor.Interface) *utilversion.Version {
versions, err := cadvisor.VersionInfo()
if err != nil {
glog.Errorf("Error requesting cAdvisor VersionInfo: %v", err)
return utilversion.MustParseSemantic("0.0")
}
dockerAPIVersion, err := utilversion.ParseGeneric(versions.DockerAPIVersion)
if err != nil {
glog.Errorf("Error parsing docker version %q: %v", versions.DockerVersion, err)
return utilversion.MustParseSemantic("0.0")
}
return dockerAPIVersion
}
func (cm *containerManagerImpl) GetCapacity() v1.ResourceList {
cm.RLock()
defer cm.RUnlock()
return cm.capacity
}
func (cm *containerManagerImpl) GetDevicePluginResourceCapacity() (v1.ResourceList, []string) {
return cm.devicePluginManager.GetCapacity()
}

@ -0,0 +1,209 @@
// +build linux
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"io/ioutil"
"os"
"path"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"k8s.io/kubernetes/pkg/util/mount"
)
type fakeMountInterface struct {
mountPoints []mount.MountPoint
}
func (mi *fakeMountInterface) Mount(source string, target string, fstype string, options []string) error {
return fmt.Errorf("unsupported")
}
func (mi *fakeMountInterface) Unmount(target string) error {
return fmt.Errorf("unsupported")
}
func (mi *fakeMountInterface) List() ([]mount.MountPoint, error) {
return mi.mountPoints, nil
}
func (mi *fakeMountInterface) IsMountPointMatch(mp mount.MountPoint, dir string) bool {
return (mp.Path == dir)
}
func (mi *fakeMountInterface) IsNotMountPoint(dir string) (bool, error) {
return false, fmt.Errorf("unsupported")
}
func (mi *fakeMountInterface) IsLikelyNotMountPoint(file string) (bool, error) {
return false, fmt.Errorf("unsupported")
}
func (mi *fakeMountInterface) GetDeviceNameFromMount(mountPath, pluginDir string) (string, error) {
return "", nil
}
func (mi *fakeMountInterface) DeviceOpened(pathname string) (bool, error) {
for _, mp := range mi.mountPoints {
if mp.Device == pathname {
return true, nil
}
}
return false, nil
}
func (mi *fakeMountInterface) PathIsDevice(pathname string) (bool, error) {
return true, nil
}
func (mi *fakeMountInterface) MakeRShared(path string) error {
return nil
}
func (mi *fakeMountInterface) GetFileType(pathname string) (mount.FileType, error) {
return mount.FileType("fake"), nil
}
func (mi *fakeMountInterface) MakeDir(pathname string) error {
return nil
}
func (mi *fakeMountInterface) MakeFile(pathname string) error {
return nil
}
func (mi *fakeMountInterface) ExistsPath(pathname string) bool {
return true
}
func fakeContainerMgrMountInt() mount.Interface {
return &fakeMountInterface{
[]mount.MountPoint{
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuset"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpu"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuacct"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "memory"},
},
},
}
}
func TestCgroupMountValidationSuccess(t *testing.T) {
f, err := validateSystemRequirements(fakeContainerMgrMountInt())
assert.Nil(t, err)
assert.False(t, f.cpuHardcapping, "cpu hardcapping is expected to be disabled")
}
func TestCgroupMountValidationMemoryMissing(t *testing.T) {
mountInt := &fakeMountInterface{
[]mount.MountPoint{
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuset"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpu"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuacct"},
},
},
}
_, err := validateSystemRequirements(mountInt)
assert.Error(t, err)
}
func TestCgroupMountValidationMultipleSubsystem(t *testing.T) {
mountInt := &fakeMountInterface{
[]mount.MountPoint{
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuset", "memory"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpu"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuacct"},
},
},
}
_, err := validateSystemRequirements(mountInt)
assert.Nil(t, err)
}
func TestSoftRequirementsValidationSuccess(t *testing.T) {
req := require.New(t)
tempDir, err := ioutil.TempDir("", "")
req.NoError(err)
defer os.RemoveAll(tempDir)
req.NoError(ioutil.WriteFile(path.Join(tempDir, "cpu.cfs_period_us"), []byte("0"), os.ModePerm))
req.NoError(ioutil.WriteFile(path.Join(tempDir, "cpu.cfs_quota_us"), []byte("0"), os.ModePerm))
mountInt := &fakeMountInterface{
[]mount.MountPoint{
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuset"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpu"},
Path: tempDir,
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuacct", "memory"},
},
},
}
f, err := validateSystemRequirements(mountInt)
assert.NoError(t, err)
assert.True(t, f.cpuHardcapping, "cpu hardcapping is expected to be enabled")
}

@ -0,0 +1,95 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
type containerManagerStub struct{}
var _ ContainerManager = &containerManagerStub{}
func (cm *containerManagerStub) Start(_ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService) error {
glog.V(2).Infof("Starting stub container manager")
return nil
}
func (cm *containerManagerStub) SystemCgroupsLimit() v1.ResourceList {
return v1.ResourceList{}
}
func (cm *containerManagerStub) GetNodeConfig() NodeConfig {
return NodeConfig{}
}
func (cm *containerManagerStub) GetMountedSubsystems() *CgroupSubsystems {
return &CgroupSubsystems{}
}
func (cm *containerManagerStub) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (cm *containerManagerStub) UpdateQOSCgroups() error {
return nil
}
func (cm *containerManagerStub) Status() Status {
return Status{}
}
func (cm *containerManagerStub) GetNodeAllocatableReservation() v1.ResourceList {
return nil
}
func (cm *containerManagerStub) GetCapacity() v1.ResourceList {
return nil
}
func (cm *containerManagerStub) GetDevicePluginResourceCapacity() (v1.ResourceList, []string) {
return nil, []string{}
}
func (cm *containerManagerStub) NewPodContainerManager() PodContainerManager {
return &podContainerManagerStub{}
}
func (cm *containerManagerStub) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
return &kubecontainer.RunContainerOptions{}, nil
}
func (cm *containerManagerStub) UpdatePluginResources(*schedulercache.NodeInfo, *lifecycle.PodAdmitAttributes) error {
return nil
}
func (cm *containerManagerStub) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cpumanager.NewFakeManager()}
}
func NewStubContainerManager() ContainerManager {
return &containerManagerStub{}
}

@ -0,0 +1,100 @@
// +build !linux,!windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/util/mount"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
type unsupportedContainerManager struct {
}
var _ ContainerManager = &unsupportedContainerManager{}
func (unsupportedContainerManager) Start(_ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService) error {
return fmt.Errorf("Container Manager is unsupported in this build")
}
func (unsupportedContainerManager) SystemCgroupsLimit() v1.ResourceList {
return v1.ResourceList{}
}
func (unsupportedContainerManager) GetNodeConfig() NodeConfig {
return NodeConfig{}
}
func (unsupportedContainerManager) GetMountedSubsystems() *CgroupSubsystems {
return &CgroupSubsystems{}
}
func (unsupportedContainerManager) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (unsupportedContainerManager) UpdateQOSCgroups() error {
return nil
}
func (cm *unsupportedContainerManager) Status() Status {
return Status{}
}
func (cm *unsupportedContainerManager) GetNodeAllocatableReservation() v1.ResourceList {
return nil
}
func (cm *unsupportedContainerManager) GetCapacity() v1.ResourceList {
return nil
}
func (cm *unsupportedContainerManager) GetDevicePluginResourceCapacity() (v1.ResourceList, []string) {
return nil, []string{}
}
func (cm *unsupportedContainerManager) NewPodContainerManager() PodContainerManager {
return &unsupportedPodContainerManager{}
}
func (cm *unsupportedContainerManager) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
return &kubecontainer.RunContainerOptions{}, nil
}
func (cm *unsupportedContainerManager) UpdatePluginResources(*schedulercache.NodeInfo, *lifecycle.PodAdmitAttributes) error {
return nil
}
func (cm *unsupportedContainerManager) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cpumanager.NewFakeManager()}
}
func NewContainerManager(_ mount.Interface, _ cadvisor.Interface, _ NodeConfig, failSwapOn bool, devicePluginEnabled bool, recorder record.EventRecorder) (ContainerManager, error) {
return &unsupportedContainerManager{}, nil
}

@ -0,0 +1,116 @@
// +build !linux,!windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"k8s.io/kubernetes/pkg/util/mount"
)
type fakeMountInterface struct {
mountPoints []mount.MountPoint
}
func (mi *fakeMountInterface) Mount(source string, target string, fstype string, options []string) error {
return fmt.Errorf("unsupported")
}
func (mi *fakeMountInterface) Unmount(target string) error {
return fmt.Errorf("unsupported")
}
func (mi *fakeMountInterface) List() ([]mount.MountPoint, error) {
return mi.mountPoints, nil
}
func (mi *fakeMountInterface) IsMountPointMatch(mp mount.MountPoint, dir string) bool {
return (mp.Path == dir)
}
func (mi *fakeMountInterface) IsNotMountPoint(dir string) (bool, error) {
return false, fmt.Errorf("unsupported")
}
func (mi *fakeMountInterface) IsLikelyNotMountPoint(file string) (bool, error) {
return false, fmt.Errorf("unsupported")
}
func (mi *fakeMountInterface) GetDeviceNameFromMount(mountPath, pluginDir string) (string, error) {
return "", nil
}
func (mi *fakeMountInterface) DeviceOpened(pathname string) (bool, error) {
for _, mp := range mi.mountPoints {
if mp.Device == pathname {
return true, nil
}
}
return false, nil
}
func (mi *fakeMountInterface) PathIsDevice(pathname string) (bool, error) {
return true, nil
}
func (mi *fakeMountInterface) MakeRShared(path string) error {
return nil
}
func (mi *fakeMountInterface) GetFileType(pathname string) (mount.FileType, error) {
return mount.FileType("fake"), nil
}
func (mi *fakeMountInterface) MakeDir(pathname string) error {
return nil
}
func (mi *fakeMountInterface) MakeFile(pathname string) error {
return nil
}
func (mi *fakeMountInterface) ExistsPath(pathname string) bool {
return true
}
func fakeContainerMgrMountInt() mount.Interface {
return &fakeMountInterface{
[]mount.MountPoint{
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuset"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpu"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "cpuacct"},
},
{
Device: "cgroup",
Type: "cgroup",
Opts: []string{"rw", "relatime", "memory"},
},
},
}
}

@ -0,0 +1,46 @@
// +build windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/util/mount"
)
type containerManagerImpl struct {
containerManagerStub
}
var _ ContainerManager = &containerManagerImpl{}
func (cm *containerManagerImpl) Start(_ *v1.Node, _ ActivePodsFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService) error {
glog.V(2).Infof("Starting Windows stub container manager")
return nil
}
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, devicePluginEnabled bool, recorder record.EventRecorder) (ContainerManager, error) {
return &containerManagerImpl{}, nil
}

@ -0,0 +1,71 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = [
"cpu_assignment.go",
"cpu_manager.go",
"fake_cpu_manager.go",
"policy.go",
"policy_none.go",
"policy_static.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager",
visibility = ["//visibility:public"],
deps = [
"//pkg/apis/core/v1/helper/qos:go_default_library",
"//pkg/kubelet/apis/cri/v1alpha1/runtime:go_default_library",
"//pkg/kubelet/cm/cpumanager/state:go_default_library",
"//pkg/kubelet/cm/cpumanager/topology:go_default_library",
"//pkg/kubelet/cm/cpuset:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/status:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
],
)
go_test(
name = "go_default_test",
srcs = [
"cpu_assignment_test.go",
"cpu_manager_test.go",
"policy_none_test.go",
"policy_static_test.go",
"policy_test.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager",
library = ":go_default_library",
deps = [
"//pkg/kubelet/apis/cri/v1alpha1/runtime:go_default_library",
"//pkg/kubelet/cm/cpumanager/state:go_default_library",
"//pkg/kubelet/cm/cpumanager/topology:go_default_library",
"//pkg/kubelet/cm/cpuset:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/kubelet/cm/cpumanager/state:all-srcs",
"//pkg/kubelet/cm/cpumanager/topology:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

@ -0,0 +1,6 @@
approvers:
- derekwaynecarr
- vishh
- ConnorDoyle
- sjenning
- balajismaniam

@ -0,0 +1,197 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"sort"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
type cpuAccumulator struct {
topo *topology.CPUTopology
details topology.CPUDetails
numCPUsNeeded int
result cpuset.CPUSet
}
func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
return &cpuAccumulator{
topo: topo,
details: topo.CPUDetails.KeepOnly(availableCPUs),
numCPUsNeeded: numCPUs,
result: cpuset.NewCPUSet(),
}
}
func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
a.result = a.result.Union(cpus)
a.details = a.details.KeepOnly(a.details.CPUs().Difference(a.result))
a.numCPUsNeeded -= cpus.Size()
}
// Returns true if the supplied socket is fully available in a.details.
func (a *cpuAccumulator) isSocketFree(socketID int) bool {
return a.details.CPUsInSocket(socketID).Size() == a.topo.CPUsPerSocket()
}
// Returns true if the supplied core is fully available in a.details.
func (a *cpuAccumulator) isCoreFree(coreID int) bool {
return a.details.CPUsInCore(coreID).Size() == a.topo.CPUsPerCore()
}
// Returns free socket IDs as a slice sorted by:
// - socket ID, ascending.
func (a *cpuAccumulator) freeSockets() []int {
return a.details.Sockets().Filter(a.isSocketFree).ToSlice()
}
// Returns core IDs as a slice sorted by:
// - the number of whole available cores on the socket, ascending
// - socket ID, ascending
// - core ID, ascending
func (a *cpuAccumulator) freeCores() []int {
socketIDs := a.details.Sockets().ToSlice()
sort.Slice(socketIDs,
func(i, j int) bool {
iCores := a.details.CoresInSocket(socketIDs[i]).Filter(a.isCoreFree)
jCores := a.details.CoresInSocket(socketIDs[j]).Filter(a.isCoreFree)
// Compare the sort keys one at a time; chaining them with || would
// not yield a consistent less-than relation.
if iCores.Size() != jCores.Size() {
return iCores.Size() < jCores.Size()
}
return socketIDs[i] < socketIDs[j]
})
coreIDs := []int{}
for _, s := range socketIDs {
coreIDs = append(coreIDs, a.details.CoresInSocket(s).Filter(a.isCoreFree).ToSlice()...)
}
return coreIDs
}
// Returns CPU IDs as a slice sorted by:
// - socket affinity with result
// - number of CPUs available on the same socket
// - number of CPUs available on the same core
// - socket ID, ascending
// - core ID, ascending
func (a *cpuAccumulator) freeCPUs() []int {
result := []int{}
cores := a.details.Cores().ToSlice()
sort.Slice(
cores,
func(i, j int) bool {
iCore := cores[i]
jCore := cores[j]
iCPUs := a.topo.CPUDetails.CPUsInCore(iCore).ToSlice()
jCPUs := a.topo.CPUDetails.CPUsInCore(jCore).ToSlice()
iSocket := a.topo.CPUDetails[iCPUs[0]].SocketID
jSocket := a.topo.CPUDetails[jCPUs[0]].SocketID
// Compute the number of CPUs in the result that reside on the same
// socket as each core.
iSocketColoScore := a.topo.CPUDetails.CPUsInSocket(iSocket).Intersection(a.result).Size()
jSocketColoScore := a.topo.CPUDetails.CPUsInSocket(jSocket).Intersection(a.result).Size()
// Compute the number of CPUs available on the same socket
// as each core.
iSocketFreeScore := a.details.CPUsInSocket(iSocket).Size()
jSocketFreeScore := a.details.CPUsInSocket(jSocket).Size()
// Compute the number of available CPUs on each core.
iCoreFreeScore := a.details.CPUsInCore(iCore).Size()
jCoreFreeScore := a.details.CPUsInCore(jCore).Size()
// Compare the sort keys one at a time; chaining them with || would
// not yield a consistent less-than relation.
if iSocketColoScore != jSocketColoScore {
return iSocketColoScore > jSocketColoScore
}
if iSocketFreeScore != jSocketFreeScore {
return iSocketFreeScore < jSocketFreeScore
}
if iCoreFreeScore != jCoreFreeScore {
return iCoreFreeScore < jCoreFreeScore
}
if iSocket != jSocket {
return iSocket < jSocket
}
return iCore < jCore
})
// For each core, append sorted CPU IDs to result.
for _, core := range cores {
result = append(result, a.details.CPUsInCore(core).ToSlice()...)
}
return result
}
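// For example, with the dual-socket HT fixture used in this package's tests and
// every CPU still free, freeCPUs returns [0 6 2 8 4 10 1 7 3 9 5 11]:
// socket 0's cores first, with thread siblings (i, i+6) kept adjacent.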
func (a *cpuAccumulator) needs(n int) bool {
return a.numCPUsNeeded >= n
}
func (a *cpuAccumulator) isSatisfied() bool {
return a.numCPUsNeeded < 1
}
func (a *cpuAccumulator) isFailed() bool {
return a.numCPUsNeeded > a.details.CPUs().Size()
}
func takeByTopology(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
if acc.isSatisfied() {
return acc.result, nil
}
if acc.isFailed() {
return cpuset.NewCPUSet(), fmt.Errorf("not enough cpus available to satisfy request")
}
// Algorithm: topology-aware best-fit
// 1. Acquire whole sockets, if available and the container requires at
// least a socket's-worth of CPUs.
for _, s := range acc.freeSockets() {
if acc.needs(acc.topo.CPUsPerSocket()) {
glog.V(4).Infof("[cpumanager] takeByTopology: claiming socket [%d]", s)
acc.take(acc.details.CPUsInSocket(s))
if acc.isSatisfied() {
return acc.result, nil
}
}
}
// 2. Acquire whole cores, if available and the container requires at least
// a core's-worth of CPUs.
for _, c := range acc.freeCores() {
if acc.needs(acc.topo.CPUsPerCore()) {
glog.V(4).Infof("[cpumanager] takeByTopology: claiming core [%d]", c)
acc.take(acc.details.CPUsInCore(c))
if acc.isSatisfied() {
return acc.result, nil
}
}
}
// 3. Acquire single threads, preferring to fill partially-allocated cores
// on the same sockets as the whole cores we have already taken in this
// allocation.
for _, c := range acc.freeCPUs() {
glog.V(4).Infof("[cpumanager] takeByTopology: claiming CPU [%d]", c)
if acc.needs(1) {
acc.take(cpuset.NewCPUSet(c))
}
if acc.isSatisfied() {
return acc.result, nil
}
}
return cpuset.NewCPUSet(), fmt.Errorf("failed to allocate cpus")
}
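// Usage sketch (hedged, assuming the CPUDetails layout from the topology
// package): a single-socket, hyperthreaded machine with 4 cores and 8 CPUs
// where CPUs i and i+4 are siblings. Requesting 2 CPUs yields one whole core,
// i.e. {0,4}, matching this package's tests:
//
//	topo := &topology.CPUTopology{
//		NumCPUs:    8,
//		NumSockets: 1,
//		NumCores:   4,
//		CPUDetails: topology.CPUDetails{
//			0: {CoreID: 0, SocketID: 0}, 4: {CoreID: 0, SocketID: 0},
//			1: {CoreID: 1, SocketID: 0}, 5: {CoreID: 1, SocketID: 0},
//			2: {CoreID: 2, SocketID: 0}, 6: {CoreID: 2, SocketID: 0},
//			3: {CoreID: 3, SocketID: 0}, 7: {CoreID: 3, SocketID: 0},
//		},
//	}
//	cpus, _ := takeByTopology(topo, cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7), 2)
//	// cpus.String() == "0,4"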

@ -0,0 +1,385 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"reflect"
"testing"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
func TestCPUAccumulatorFreeSockets(t *testing.T) {
testCases := []struct {
description string
topo *topology.CPUTopology
availableCPUs cpuset.CPUSet
expect []int
}{
{
"single socket HT, 1 socket free",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
[]int{0},
},
{
"single socket HT, 0 sockets free",
topoSingleSocketHT,
cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
[]int{},
},
{
"dual socket HT, 2 sockets free",
topoDualSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
[]int{0, 1},
},
{
"dual socket HT, 1 socket free",
topoDualSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11),
[]int{1},
},
{
"dual socket HT, 0 sockets free",
topoDualSocketHT,
cpuset.NewCPUSet(0, 2, 3, 4, 5, 6, 7, 8, 9, 11),
[]int{},
},
}
for _, tc := range testCases {
acc := newCPUAccumulator(tc.topo, tc.availableCPUs, 0)
result := acc.freeSockets()
if !reflect.DeepEqual(result, tc.expect) {
t.Errorf("[%s] expected %v to equal %v", tc.description, result, tc.expect)
}
}
}
func TestCPUAccumulatorFreeCores(t *testing.T) {
testCases := []struct {
description string
topo *topology.CPUTopology
availableCPUs cpuset.CPUSet
expect []int
}{
{
"single socket HT, 4 cores free",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
[]int{0, 1, 2, 3},
},
{
"single socket HT, 3 cores free",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 4, 5, 6),
[]int{0, 1, 2},
},
{
"single socket HT, 3 cores free (1 partially consumed)",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6),
[]int{0, 1, 2},
},
{
"single socket HT, 0 cores free",
topoSingleSocketHT,
cpuset.NewCPUSet(),
[]int{},
},
{
"single socket HT, 0 cores free (4 partially consumed)",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3),
[]int{},
},
{
"dual socket HT, 6 cores free",
topoDualSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
[]int{0, 2, 4, 1, 3, 5},
},
{
"dual socket HT, 5 cores free (1 consumed from socket 0)",
topoDualSocketHT,
cpuset.NewCPUSet(2, 1, 3, 4, 5, 7, 8, 9, 10, 11),
[]int{2, 4, 1, 3, 5},
},
{
"dual socket HT, 4 cores free (1 consumed from each socket)",
topoDualSocketHT,
cpuset.NewCPUSet(2, 3, 4, 5, 8, 9, 10, 11),
[]int{2, 4, 3, 5},
},
}
for _, tc := range testCases {
acc := newCPUAccumulator(tc.topo, tc.availableCPUs, 0)
result := acc.freeCores()
if !reflect.DeepEqual(result, tc.expect) {
t.Errorf("[%s] expected %v to equal %v", tc.description, result, tc.expect)
}
}
}
func TestCPUAccumulatorFreeCPUs(t *testing.T) {
testCases := []struct {
description string
topo *topology.CPUTopology
availableCPUs cpuset.CPUSet
expect []int
}{
{
"single socket HT, 8 cpus free",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
[]int{0, 4, 1, 5, 2, 6, 3, 7},
},
{
"single socket HT, 5 cpus free",
topoSingleSocketHT,
cpuset.NewCPUSet(3, 4, 5, 6, 7),
[]int{4, 5, 6, 3, 7},
},
{
"dual socket HT, 12 cpus free",
topoDualSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
[]int{0, 6, 2, 8, 4, 10, 1, 7, 3, 9, 5, 11},
},
{
"dual socket HT, 11 cpus free",
topoDualSocketHT,
cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
[]int{6, 2, 8, 4, 10, 1, 7, 3, 9, 5, 11},
},
{
"dual socket HT, 10 cpus free",
topoDualSocketHT,
cpuset.NewCPUSet(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
[]int{2, 8, 4, 10, 1, 7, 3, 9, 5, 11},
},
}
for _, tc := range testCases {
acc := newCPUAccumulator(tc.topo, tc.availableCPUs, 0)
result := acc.freeCPUs()
if !reflect.DeepEqual(result, tc.expect) {
t.Errorf("[%s] expected %v to equal %v", tc.description, result, tc.expect)
}
}
}
func TestCPUAccumulatorTake(t *testing.T) {
testCases := []struct {
description string
topo *topology.CPUTopology
availableCPUs cpuset.CPUSet
takeCPUs []cpuset.CPUSet
numCPUs int
expectSatisfied bool
expectFailed bool
}{
{
"take 0 cpus from a single socket HT, require 1",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
[]cpuset.CPUSet{cpuset.NewCPUSet()},
1,
false,
false,
},
{
"take 0 cpus from a single socket HT, require 1, none available",
topoSingleSocketHT,
cpuset.NewCPUSet(),
[]cpuset.CPUSet{cpuset.NewCPUSet()},
1,
false,
true,
},
{
"take 1 cpu from a single socket HT, require 1",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
[]cpuset.CPUSet{cpuset.NewCPUSet(0)},
1,
true,
false,
},
{
"take 1 cpu from a single socket HT, require 2",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
[]cpuset.CPUSet{cpuset.NewCPUSet(0)},
2,
false,
false,
},
{
"take 2 cpu from a single socket HT, require 4, expect failed",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2),
[]cpuset.CPUSet{cpuset.NewCPUSet(0), cpuset.NewCPUSet(1)},
4,
false,
true,
},
{
"take all cpus one at a time from a single socket HT, require 8",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
[]cpuset.CPUSet{
cpuset.NewCPUSet(0),
cpuset.NewCPUSet(1),
cpuset.NewCPUSet(2),
cpuset.NewCPUSet(3),
cpuset.NewCPUSet(4),
cpuset.NewCPUSet(5),
cpuset.NewCPUSet(6),
cpuset.NewCPUSet(7),
},
8,
true,
false,
},
}
for _, tc := range testCases {
acc := newCPUAccumulator(tc.topo, tc.availableCPUs, tc.numCPUs)
totalTaken := 0
for _, cpus := range tc.takeCPUs {
acc.take(cpus)
totalTaken += cpus.Size()
}
if tc.expectSatisfied != acc.isSatisfied() {
t.Errorf("[%s] expected acc.isSatisfied() to be %t", tc.description, tc.expectSatisfied)
}
if tc.expectFailed != acc.isFailed() {
t.Errorf("[%s] expected acc.isFailed() to be %t", tc.description, tc.expectFailed)
}
for _, cpus := range tc.takeCPUs {
availableCPUs := acc.details.CPUs()
if cpus.Intersection(availableCPUs).Size() > 0 {
t.Errorf("[%s] expected intersection of taken cpus [%s] and acc.details.CPUs() [%s] to be empty", tc.description, cpus, availableCPUs)
}
if !cpus.IsSubsetOf(acc.result) {
t.Errorf("[%s] expected [%s] to be a subset of acc.result [%s]", tc.description, cpus, acc.result)
}
}
expNumCPUsNeeded := tc.numCPUs - totalTaken
if acc.numCPUsNeeded != expNumCPUsNeeded {
t.Errorf("[%s] expected acc.numCPUsNeeded to be %d (got %d)", tc.description, expNumCPUsNeeded, acc.numCPUsNeeded)
}
}
}
func TestTakeByTopology(t *testing.T) {
testCases := []struct {
description string
topo *topology.CPUTopology
availableCPUs cpuset.CPUSet
numCPUs int
expErr string
expResult cpuset.CPUSet
}{
{
"take more cpus than are available from single socket with HT",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 2, 4, 6),
5,
"not enough cpus available to satisfy request",
cpuset.NewCPUSet(),
},
{
"take zero cpus from single socket with HT",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
0,
"",
cpuset.NewCPUSet(),
},
{
"take one cpu from single socket with HT",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
1,
"",
cpuset.NewCPUSet(0),
},
{
"take one cpu from single socket with HT, some cpus are taken",
topoSingleSocketHT,
cpuset.NewCPUSet(1, 3, 5, 6, 7),
1,
"",
cpuset.NewCPUSet(6),
},
{
"take two cpus from single socket with HT",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
2,
"",
cpuset.NewCPUSet(0, 4),
},
{
"take all cpus from single socket with HT",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
8,
"",
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
},
{
"take two cpus from single socket with HT, only one core totally free",
topoSingleSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 6),
2,
"",
cpuset.NewCPUSet(2, 6),
},
{
"take three cpus from dual socket with HT - core from Socket 0",
topoDualSocketHT,
cpuset.NewCPUSet(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
1,
"",
cpuset.NewCPUSet(2),
},
{
"take a socket of cpus from dual socket with HT",
topoDualSocketHT,
cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
6,
"",
cpuset.NewCPUSet(0, 2, 4, 6, 8, 10),
},
}
for _, tc := range testCases {
result, err := takeByTopology(tc.topo, tc.availableCPUs, tc.numCPUs)
if tc.expErr != "" && err.Error() != tc.expErr {
t.Errorf("expected error to be [%v] but it was [%v] in test \"%s\"", tc.expErr, err, tc.description)
}
if !result.Equals(tc.expResult) {
t.Errorf("expected result [%s] to equal [%s] in test \"%s\"", result, tc.expResult, tc.description)
}
}
}

@ -0,0 +1,290 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"math"
"sync"
"time"
"github.com/golang/glog"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
"path"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
type runtimeService interface {
UpdateContainerResources(id string, resources *runtimeapi.LinuxContainerResources) error
}
type policyName string
// CPUManagerStateFileName is the file name where the CPU manager stores its state.
const CPUManagerStateFileName = "cpu_manager_state"
// Manager interface provides methods for Kubelet to manage pod cpus.
type Manager interface {
// Start is called during Kubelet initialization.
Start(activePods ActivePodsFunc, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService)
// AddContainer is called between container create and container start
// so that initial CPU affinity settings can be written through to the
// container runtime before the first process begins to execute.
AddContainer(p *v1.Pod, c *v1.Container, containerID string) error
// RemoveContainer is called after Kubelet decides to kill or delete a
// container. After this call, the CPU manager stops trying to reconcile
// that container and any CPUs dedicated to the container are freed.
RemoveContainer(containerID string) error
// State returns a read-only interface to the internal CPU manager state.
State() state.Reader
}
type manager struct {
sync.Mutex
policy Policy
// reconcilePeriod is the duration between calls to reconcileState.
reconcilePeriod time.Duration
// state allows pluggable CPU assignment policies while sharing a common
// representation of state for the system to inspect and reconcile.
state state.State
// containerRuntime is the container runtime service interface needed
// to make UpdateContainerResources() calls against the containers.
containerRuntime runtimeService
// activePods is a method for listing active pods on the node
// so all the containers can be updated in the reconciliation loop.
activePods ActivePodsFunc
// podStatusProvider provides a method for obtaining pod statuses
// and the containerID of their containers
podStatusProvider status.PodStatusProvider
machineInfo *cadvisorapi.MachineInfo
nodeAllocatableReservation v1.ResourceList
}
var _ Manager = &manager{}
// NewManager creates a new CPU manager based on the provided policy.
func NewManager(
cpuPolicyName string,
reconcilePeriod time.Duration,
machineInfo *cadvisorapi.MachineInfo,
nodeAllocatableReservation v1.ResourceList,
stateFileDirectory string,
) (Manager, error) {
var policy Policy
switch policyName(cpuPolicyName) {
case PolicyNone:
policy = NewNonePolicy()
case PolicyStatic:
topo, err := topology.Discover(machineInfo)
if err != nil {
return nil, err
}
glog.Infof("[cpumanager] detected CPU topology: %v", topo)
reservedCPUs, ok := nodeAllocatableReservation[v1.ResourceCPU]
if !ok {
// The static policy cannot initialize without this information. Panic!
panic("[cpumanager] unable to determine reserved CPU resources for static policy")
}
if reservedCPUs.IsZero() {
// Panic!
//
// The static policy requires this to be nonzero. Zero CPU reservation
// would allow the shared pool to be completely exhausted. At that point
// either we would violate our guarantee of exclusivity or need to evict
// any pod that has at least one container that requires zero CPUs.
// See the comments in policy_static.go for more details.
panic("[cpumanager] the static policy requires systemreserved.cpu + kubereserved.cpu to be greater than zero")
}
// Take the ceiling of the reservation, since fractional CPUs cannot be
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
policy = NewStaticPolicy(topo, numReservedCPUs)
default:
glog.Errorf("[cpumanager] Unknown policy \"%s\", falling back to default policy \"%s\"", cpuPolicyName, PolicyNone)
policy = NewNonePolicy()
}
stateImpl := state.NewFileState(
path.Join(stateFileDirectory, CPUManagerStateFileName),
policy.Name())
manager := &manager{
policy: policy,
reconcilePeriod: reconcilePeriod,
state: stateImpl,
machineInfo: machineInfo,
nodeAllocatableReservation: nodeAllocatableReservation,
}
return manager, nil
}
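// Construction sketch (hypothetical arguments): a 500m reservation is rounded
// up by the ceiling logic above, so the static policy below reserves one whole
// CPU:
//
//	m, err := NewManager(
//		"static",
//		10*time.Second,
//		machineInfo, // *cadvisorapi.MachineInfo, e.g. from the cadvisor client
//		v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m")},
//		"/var/lib/kubelet",
//	)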
func (m *manager) Start(activePods ActivePodsFunc, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService) {
glog.Infof("[cpumanger] starting with %s policy", m.policy.Name())
glog.Infof("[cpumanger] reconciling every %v", m.reconcilePeriod)
m.activePods = activePods
m.podStatusProvider = podStatusProvider
m.containerRuntime = containerRuntime
m.policy.Start(m.state)
if m.policy.Name() == string(PolicyNone) {
return
}
go wait.Until(func() { m.reconcileState() }, m.reconcilePeriod, wait.NeverStop)
}
func (m *manager) AddContainer(p *v1.Pod, c *v1.Container, containerID string) error {
m.Lock()
err := m.policy.AddContainer(m.state, p, c, containerID)
if err != nil {
glog.Errorf("[cpumanager] AddContainer error: %v", err)
m.Unlock()
return err
}
cpus := m.state.GetCPUSetOrDefault(containerID)
m.Unlock()
if !cpus.IsEmpty() {
err = m.updateContainerCPUSet(containerID, cpus)
if err != nil {
glog.Errorf("[cpumanager] AddContainer error: %v", err)
return err
}
} else {
glog.V(5).Infof("[cpumanager] update container resources is skipped due to cpu set is empty")
}
return nil
}
func (m *manager) RemoveContainer(containerID string) error {
m.Lock()
defer m.Unlock()
err := m.policy.RemoveContainer(m.state, containerID)
if err != nil {
glog.Errorf("[cpumanager] RemoveContainer error: %v", err)
return err
}
return nil
}
func (m *manager) State() state.Reader {
return m.state
}
type reconciledContainer struct {
podName string
containerName string
containerID string
}
func (m *manager) reconcileState() (success []reconciledContainer, failure []reconciledContainer) {
success = []reconciledContainer{}
failure = []reconciledContainer{}
for _, pod := range m.activePods() {
allContainers := pod.Spec.InitContainers
allContainers = append(allContainers, pod.Spec.Containers...)
for _, container := range allContainers {
status, ok := m.podStatusProvider.GetPodStatus(pod.UID)
if !ok {
glog.Warningf("[cpumanager] reconcileState: skipping pod; status not found (pod: %s, container: %s)", pod.Name, container.Name)
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
break
}
containerID, err := findContainerIDByName(&status, container.Name)
if err != nil {
glog.Warningf("[cpumanager] reconcileState: skipping container; ID not found in status (pod: %s, container: %s, error: %v)", pod.Name, container.Name, err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
continue
}
cset := m.state.GetCPUSetOrDefault(containerID)
if cset.IsEmpty() {
// NOTE: This should not happen outside of tests.
glog.Infof("[cpumanager] reconcileState: skipping container; assigned cpuset is empty (pod: %s, container: %s)", pod.Name, container.Name)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
continue
}
glog.V(4).Infof("[cpumanager] reconcileState: updating container (pod: %s, container: %s, container id: %s, cpuset: \"%v\")", pod.Name, container.Name, containerID, cset)
err = m.updateContainerCPUSet(containerID, cset)
if err != nil {
glog.Errorf("[cpumanager] reconcileState: failed to update container (pod: %s, container: %s, container id: %s, cpuset: \"%v\", error: %v)", pod.Name, container.Name, containerID, cset, err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
continue
}
success = append(success, reconciledContainer{pod.Name, container.Name, containerID})
}
}
return success, failure
}
func findContainerIDByName(status *v1.PodStatus, name string) (string, error) {
for _, container := range status.ContainerStatuses {
if container.Name == name && container.ContainerID != "" {
cid := &kubecontainer.ContainerID{}
err := cid.ParseString(container.ContainerID)
if err != nil {
return "", err
}
return cid.ID, nil
}
}
return "", fmt.Errorf("unable to find ID for container with name %v in pod status (it may not be running)", name)
}
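// For illustration: a ContainerStatus with ContainerID "docker://abc123"
// parses into kubecontainer.ContainerID{Type: "docker", ID: "abc123"}, so
// this helper returns "abc123".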
func (m *manager) updateContainerCPUSet(containerID string, cpus cpuset.CPUSet) error {
// TODO: Consider adding a `ResourceConfigForContainer` helper in
// helpers_linux.go similar to what exists for pods.
// It would be better to pass the full container resources here instead of
// this patch-like partial resources.
return m.containerRuntime.UpdateContainerResources(
containerID,
&runtimeapi.LinuxContainerResources{
CpusetCpus: cpus.String(),
})
}
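// The resulting CRI call is equivalent to the following sketch (hypothetical
// container ID; cpuset.NewCPUSet(0, 4).String() renders as "0,4"):
//
//	_ = m.containerRuntime.UpdateContainerResources(
//		"abc123",
//		&runtimeapi.LinuxContainerResources{CpusetCpus: "0,4"},
//	)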

@ -0,0 +1,591 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"reflect"
"strings"
"testing"
"time"
cadvisorapi "github.com/google/cadvisor/info/v1"
"io/ioutil"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"os"
)
type mockState struct {
assignments state.ContainerCPUAssignments
defaultCPUSet cpuset.CPUSet
}
func (s *mockState) GetCPUSet(containerID string) (cpuset.CPUSet, bool) {
res, ok := s.assignments[containerID]
return res.Clone(), ok
}
func (s *mockState) GetDefaultCPUSet() cpuset.CPUSet {
return s.defaultCPUSet.Clone()
}
func (s *mockState) GetCPUSetOrDefault(containerID string) cpuset.CPUSet {
if res, ok := s.GetCPUSet(containerID); ok {
return res
}
return s.GetDefaultCPUSet()
}
func (s *mockState) SetCPUSet(containerID string, cset cpuset.CPUSet) {
s.assignments[containerID] = cset
}
func (s *mockState) SetDefaultCPUSet(cset cpuset.CPUSet) {
s.defaultCPUSet = cset
}
func (s *mockState) Delete(containerID string) {
delete(s.assignments, containerID)
}
func (s *mockState) ClearState() {
s.defaultCPUSet = cpuset.CPUSet{}
s.assignments = make(state.ContainerCPUAssignments)
}
func (s *mockState) SetCPUAssignments(a state.ContainerCPUAssignments) {
s.assignments = a.Clone()
}
func (s *mockState) GetCPUAssignments() state.ContainerCPUAssignments {
return s.assignments.Clone()
}
type mockPolicy struct {
err error
}
func (p *mockPolicy) Name() string {
return "mock"
}
func (p *mockPolicy) Start(s state.State) {
}
func (p *mockPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
return p.err
}
func (p *mockPolicy) RemoveContainer(s state.State, containerID string) error {
return p.err
}
type mockRuntimeService struct {
err error
}
func (rt mockRuntimeService) UpdateContainerResources(id string, resources *runtimeapi.LinuxContainerResources) error {
return rt.err
}
type mockPodStatusProvider struct {
podStatus v1.PodStatus
found bool
}
func (psp mockPodStatusProvider) GetPodStatus(uid types.UID) (v1.PodStatus, bool) {
return psp.podStatus, psp.found
}
type mockPodKiller struct {
killedPods []*v1.Pod
}
func (f *mockPodKiller) killPodNow(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error {
f.killedPods = append(f.killedPods, pod)
return nil
}
type mockPodProvider struct {
pods []*v1.Pod
}
func (f *mockPodProvider) getPods() []*v1.Pod {
return f.pods
}
type mockRecorder struct{}
func (r *mockRecorder) Eventf(object runtime.Object, eventtype, reason, messageFmt string, args ...interface{}) {
}
func makePod(cpuRequest, cpuLimit string) *v1.Pod {
return &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceName(v1.ResourceCPU): resource.MustParse(cpuRequest),
v1.ResourceName(v1.ResourceMemory): resource.MustParse("1G"),
},
Limits: v1.ResourceList{
v1.ResourceName(v1.ResourceCPU): resource.MustParse(cpuLimit),
v1.ResourceName(v1.ResourceMemory): resource.MustParse("1G"),
},
},
},
},
},
}
}
// CpuAllocatable must be <= CpuCapacity
func prepareCPUNodeStatus(CPUCapacity, CPUAllocatable string) v1.NodeStatus {
nodestatus := v1.NodeStatus{
Capacity: make(v1.ResourceList, 1),
Allocatable: make(v1.ResourceList, 1),
}
cpucap, _ := resource.ParseQuantity(CPUCapacity)
cpuall, _ := resource.ParseQuantity(CPUAllocatable)
nodestatus.Capacity[v1.ResourceCPU] = cpucap
nodestatus.Allocatable[v1.ResourceCPU] = cpuall
return nodestatus
}
func TestCPUManagerAdd(t *testing.T) {
testCases := []struct {
description string
regErr error
updateErr error
expErr error
}{
{
description: "cpu manager add - no error",
regErr: nil,
updateErr: nil,
expErr: nil,
},
{
description: "cpu manager add - policy add container error",
regErr: fmt.Errorf("fake reg error"),
updateErr: nil,
expErr: fmt.Errorf("fake reg error"),
},
{
description: "cpu manager add - container update error",
regErr: nil,
updateErr: fmt.Errorf("fake update error"),
expErr: nil,
},
}
for _, testCase := range testCases {
mgr := &manager{
policy: &mockPolicy{
err: testCase.regErr,
},
state: &mockState{
assignments: state.ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
containerRuntime: mockRuntimeService{
err: testCase.updateErr,
},
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
}
pod := makePod("1000", "1000")
container := &pod.Spec.Containers[0]
err := mgr.AddContainer(pod, container, "fakeID")
if !reflect.DeepEqual(err, testCase.expErr) {
t.Errorf("CPU Manager AddContainer() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expErr, err)
}
}
}
func TestCPUManagerGenerate(t *testing.T) {
testCases := []struct {
description string
cpuPolicyName string
nodeAllocatableReservation v1.ResourceList
isTopologyBroken bool
panicMsg string
expectedPolicy string
expectedError error
skipIfPermissionsError bool
}{
{
description: "set none policy",
cpuPolicyName: "none",
nodeAllocatableReservation: nil,
expectedPolicy: "none",
},
{
description: "invalid policy name",
cpuPolicyName: "invalid",
nodeAllocatableReservation: nil,
expectedPolicy: "none",
},
{
description: "static policy",
cpuPolicyName: "static",
nodeAllocatableReservation: v1.ResourceList{v1.ResourceCPU: *resource.NewQuantity(3, resource.DecimalSI)},
expectedPolicy: "static",
skipIfPermissionsError: true,
},
{
description: "static policy - broken topology",
cpuPolicyName: "static",
nodeAllocatableReservation: v1.ResourceList{},
isTopologyBroken: true,
expectedError: fmt.Errorf("could not detect number of cpus"),
skipIfPermissionsError: true,
},
{
description: "static policy - broken reservation",
cpuPolicyName: "static",
nodeAllocatableReservation: v1.ResourceList{},
panicMsg: "unable to determine reserved CPU resources for static policy",
skipIfPermissionsError: true,
},
{
description: "static policy - no CPU resources",
cpuPolicyName: "static",
nodeAllocatableReservation: v1.ResourceList{v1.ResourceCPU: *resource.NewQuantity(0, resource.DecimalSI)},
panicMsg: "the static policy requires systemreserved.cpu + kubereserved.cpu to be greater than zero",
skipIfPermissionsError: true,
},
}
mockedMachineInfo := cadvisorapi.MachineInfo{
NumCores: 4,
Topology: []cadvisorapi.Node{
{
Cores: []cadvisorapi.Core{
{
Id: 0,
Threads: []int{0},
},
{
Id: 1,
Threads: []int{1},
},
{
Id: 2,
Threads: []int{2},
},
{
Id: 3,
Threads: []int{3},
},
},
},
},
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
machineInfo := &mockedMachineInfo
if testCase.isTopologyBroken {
machineInfo = &cadvisorapi.MachineInfo{}
}
sDir, err := ioutil.TempDir("/tmp/", "cpu_manager_test")
if err != nil {
t.Errorf("cannot create state file: %s", err.Error())
}
defer os.RemoveAll(sDir)
defer func() {
if err := recover(); err != nil {
if testCase.panicMsg != "" {
if !strings.Contains(err.(string), testCase.panicMsg) {
t.Errorf("Unexpected panic message. Have: %q wants %q", err, testCase.panicMsg)
}
} else {
t.Errorf("Unexpected panic: %q", err)
}
} else if testCase.panicMsg != "" {
t.Error("Expected panic hasn't been raised")
}
}()
mgr, err := NewManager(testCase.cpuPolicyName, 5*time.Second, machineInfo, testCase.nodeAllocatableReservation, sDir)
if testCase.expectedError != nil {
if !strings.Contains(err.Error(), testCase.expectedError.Error()) {
t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), testCase.expectedError.Error())
}
} else {
rawMgr := mgr.(*manager)
if rawMgr.policy.Name() != testCase.expectedPolicy {
t.Errorf("Unexpected policy name. Have: %q wants %q", rawMgr.policy.Name(), testCase.expectedPolicy)
}
}
})
}
}
func TestCPUManagerRemove(t *testing.T) {
mgr := &manager{
policy: &mockPolicy{
err: nil,
},
state: &mockState{
assignments: state.ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
containerRuntime: mockRuntimeService{},
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
}
err := mgr.RemoveContainer("fakeID")
if err != nil {
t.Errorf("CPU Manager RemoveContainer() error. expected error to be nil but got: %v", err)
}
mgr = &manager{
policy: &mockPolicy{
err: fmt.Errorf("fake error"),
},
state: state.NewMemoryState(),
containerRuntime: mockRuntimeService{},
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
}
err = mgr.RemoveContainer("fakeID")
if !reflect.DeepEqual(err, fmt.Errorf("fake error")) {
t.Errorf("CPU Manager RemoveContainer() error. expected error: fake error but got: %v", err)
}
}
func TestReconcileState(t *testing.T) {
testCases := []struct {
description string
activePods []*v1.Pod
pspPS v1.PodStatus
pspFound bool
stAssignments state.ContainerCPUAssignments
stDefaultCPUSet cpuset.CPUSet
updateErr error
expectFailedContainerName string
}{
{
description: "cpu manager reconcile - no error",
activePods: []*v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
Name: "fakePodName",
UID: "fakeUID",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "fakeName",
},
},
},
},
},
pspPS: v1.PodStatus{
ContainerStatuses: []v1.ContainerStatus{
{
Name: "fakeName",
ContainerID: "docker://fakeID",
},
},
},
pspFound: true,
stAssignments: state.ContainerCPUAssignments{
"fakeID": cpuset.NewCPUSet(1, 2),
},
stDefaultCPUSet: cpuset.NewCPUSet(3, 4, 5, 6, 7),
updateErr: nil,
expectFailedContainerName: "",
},
{
description: "cpu manager reconcile - pod status not found",
activePods: []*v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
Name: "fakePodName",
UID: "fakeUID",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "fakeName",
},
},
},
},
},
pspPS: v1.PodStatus{},
pspFound: false,
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(),
updateErr: nil,
expectFailedContainerName: "fakeName",
},
{
description: "cpu manager reconcile - container id not found",
activePods: []*v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
Name: "fakePodName",
UID: "fakeUID",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "fakeName",
},
},
},
},
},
pspPS: v1.PodStatus{
ContainerStatuses: []v1.ContainerStatus{
{
Name: "fakeName1",
ContainerID: "docker://fakeID",
},
},
},
pspFound: true,
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(),
updateErr: nil,
expectFailedContainerName: "fakeName",
},
{
description: "cpu manager reconcile - cpuset is empty",
activePods: []*v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
Name: "fakePodName",
UID: "fakeUID",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "fakeName",
},
},
},
},
},
pspPS: v1.PodStatus{
ContainerStatuses: []v1.ContainerStatus{
{
Name: "fakeName",
ContainerID: "docker://fakeID",
},
},
},
pspFound: true,
stAssignments: state.ContainerCPUAssignments{
"fakeID": cpuset.NewCPUSet(),
},
stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
updateErr: nil,
expectFailedContainerName: "fakeName",
},
{
description: "cpu manager reconcile - container update error",
activePods: []*v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
Name: "fakePodName",
UID: "fakeUID",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "fakeName",
},
},
},
},
},
pspPS: v1.PodStatus{
ContainerStatuses: []v1.ContainerStatus{
{
Name: "fakeName",
ContainerID: "docker://fakeID",
},
},
},
pspFound: true,
stAssignments: state.ContainerCPUAssignments{
"fakeID": cpuset.NewCPUSet(1, 2),
},
stDefaultCPUSet: cpuset.NewCPUSet(3, 4, 5, 6, 7),
updateErr: fmt.Errorf("fake container update error"),
expectFailedContainerName: "fakeName",
},
}
for _, testCase := range testCases {
mgr := &manager{
policy: &mockPolicy{
err: nil,
},
state: &mockState{
assignments: testCase.stAssignments,
defaultCPUSet: testCase.stDefaultCPUSet,
},
containerRuntime: mockRuntimeService{
err: testCase.updateErr,
},
activePods: func() []*v1.Pod {
return testCase.activePods
},
podStatusProvider: mockPodStatusProvider{
podStatus: testCase.pspPS,
found: testCase.pspFound,
},
}
_, failure := mgr.reconcileState()
if testCase.expectFailedContainerName != "" {
// Search failed reconciled containers for the supplied name.
foundFailedContainer := false
for _, reconciled := range failure {
if reconciled.containerName == testCase.expectFailedContainerName {
foundFailedContainer = true
break
}
}
if !foundFailedContainer {
t.Errorf("Expected reconciliation failure for container: %s", testCase.expectFailedContainerName)
}
}
}
}


@ -0,0 +1,58 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/status"
)
type fakeManager struct {
state state.State
}
func (m *fakeManager) Start(activePods ActivePodsFunc, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService) {
glog.Info("[fake cpumanager] Start()")
}
func (m *fakeManager) Policy() Policy {
glog.Info("[fake cpumanager] Policy()")
return NewNonePolicy()
}
func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
glog.Infof("[fake cpumanager] AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
return nil
}
func (m *fakeManager) RemoveContainer(containerID string) error {
glog.Infof("[fake cpumanager] RemoveContainer (container id: %s)", containerID)
return nil
}
func (m *fakeManager) State() state.Reader {
return m.state
}
// NewFakeManager creates empty/fake cpu manager
func NewFakeManager() Manager {
return &fakeManager{
state: state.NewMemoryState(),
}
}
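// A minimal usage sketch (hypothetical; `pod` is any *v1.Pod): wherever a
// Manager is required but pinning behavior is irrelevant, the fake can
// stand in:
//
//	mgr := NewFakeManager()
//	_ = mgr.AddContainer(pod, &pod.Spec.Containers[0], "fakeID")
//	_ = mgr.RemoveContainer("fakeID")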


@ -0,0 +1,30 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
)
// Policy implements logic for pod container to CPU assignment.
type Policy interface {
Name() string
Start(s state.State)
AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error
RemoveContainer(s state.State, containerID string) error
}
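// Any additional policy only needs to satisfy this interface. A hypothetical
// no-op variant, sketched for illustration only:
//
//	type noopPolicy struct{}
//
//	func (p *noopPolicy) Name() string { return "noop" }
//	func (p *noopPolicy) Start(s state.State) {}
//	func (p *noopPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
//		return nil
//	}
//	func (p *noopPolicy) RemoveContainer(s state.State, containerID string) error {
//		return nil
//	}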


@ -0,0 +1,51 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
)
type nonePolicy struct{}
var _ Policy = &nonePolicy{}
// PolicyNone name of none policy
const PolicyNone policyName = "none"
// NewNonePolicy returns a cpuset manager policy that does nothing
func NewNonePolicy() Policy {
return &nonePolicy{}
}
func (p *nonePolicy) Name() string {
return string(PolicyNone)
}
func (p *nonePolicy) Start(s state.State) {
glog.Info("[cpumanager] none policy: Start")
}
func (p *nonePolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
return nil
}
func (p *nonePolicy) RemoveContainer(s state.State, containerID string) error {
return nil
}


@ -0,0 +1,65 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"testing"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
func TestNonePolicyName(t *testing.T) {
policy := &nonePolicy{}
policyName := policy.Name()
if policyName != "none" {
t.Errorf("NonePolicy Name() error. expected: none, returned: %v",
policyName)
}
}
func TestNonePolicyAdd(t *testing.T) {
policy := &nonePolicy{}
st := &mockState{
assignments: state.ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
}
testPod := makePod("1000m", "1000m")
container := &testPod.Spec.Containers[0]
err := policy.AddContainer(st, testPod, container, "fakeID")
if err != nil {
t.Errorf("NonePolicy AddContainer() error. expected no error but got: %v", err)
}
}
func TestNonePolicyRemove(t *testing.T) {
policy := &nonePolicy{}
st := &mockState{
assignments: state.ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
}
err := policy.RemoveContainer(st, "fakeID")
if err != nil {
t.Errorf("NonePolicy RemoveContainer() error. expected no error but got %v", err)
}
}


@ -0,0 +1,208 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"github.com/golang/glog"
"k8s.io/api/core/v1"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// PolicyStatic is the name of the static policy
const PolicyStatic policyName = "static"
var _ Policy = &staticPolicy{}
// staticPolicy is a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
//
// This policy allocates CPUs exclusively for a container if all the following
// conditions are met:
//
// - The pod QoS class is Guaranteed.
// - The CPU request is a positive integer.
//
// The static policy maintains the following sets of logical CPUs:
//
// - SHARED: Burstable, BestEffort, and non-integral Guaranteed containers
// run here. Initially this contains all CPU IDs on the system. As
// exclusive allocations are created and destroyed, this CPU set shrinks
// and grows, accordingly. This is stored in the state as the default
// CPU set.
//
// - RESERVED: A subset of the shared pool which is not exclusively
// allocatable. The membership of this pool is static for the lifetime of
// the Kubelet. The size of the reserved pool is
// ceil(systemreserved.cpu + kubereserved.cpu).
// Reserved CPUs are taken topologically starting with lowest-indexed
// physical core, as reported by cAdvisor.
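//   For example (hypothetical reservation): with systemreserved.cpu=200m and
//   kubereserved.cpu=300m, the reserved pool size is ceil(0.2 + 0.3) = 1 CPU.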
//
// - ASSIGNABLE: Equal to SHARED - RESERVED. Exclusive CPUs are allocated
// from this pool.
//
// - EXCLUSIVE ALLOCATIONS: CPU sets assigned exclusively to one container.
// These are stored as explicit assignments in the state.
//
// When an exclusive allocation is made, the static policy also updates the
// default cpuset in the state abstraction. The CPU manager's periodic
// reconcile loop takes care of rewriting the cpuset in cgroupfs for any
// containers that may be running in the shared pool. For this reason,
// applications running within exclusively-allocated containers must tolerate
// potentially sharing their allocated CPUs for up to the CPU manager
// reconcile period.
type staticPolicy struct {
// cpu socket topology
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reserved cpuset.CPUSet
}
// Ensure staticPolicy implements Policy interface
var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int) Policy {
allCPUs := topology.CPUDetails.CPUs()
// takeByTopology allocates CPUs associated with low-numbered cores from
// allCPUs.
//
// For example: Given a system with 8 CPUs available and HT enabled,
// if numReservedCPUs=2, then reserved={0,4}
reserved, _ := takeByTopology(topology, allCPUs, numReservedCPUs)
if reserved.Size() != numReservedCPUs {
panic(fmt.Sprintf("[cpumanager] unable to reserve the required amount of CPUs (size of %s did not equal %d)", reserved, numReservedCPUs))
}
glog.Infof("[cpumanager] reserved %d CPUs (\"%s\") not available for exclusive assignment", reserved.Size(), reserved)
return &staticPolicy{
topology: topology,
reserved: reserved,
}
}
func (p *staticPolicy) Name() string {
return string(PolicyStatic)
}
func (p *staticPolicy) Start(s state.State) {
if err := p.validateState(s); err != nil {
glog.Errorf("[cpumanager] static policy invalid state: %s\n", err.Error())
panic("[cpumanager] - please drain node and remove policy state file")
}
}
func (p *staticPolicy) validateState(s state.State) error {
tmpAssignments := s.GetCPUAssignments()
tmpDefaultCPUset := s.GetDefaultCPUSet()
// Default cpuset cannot be empty when assignments exist
if tmpDefaultCPUset.IsEmpty() {
if len(tmpAssignments) != 0 {
return fmt.Errorf("default cpuset cannot be empty")
}
// state is empty, initialize it
allCPUs := p.topology.CPUDetails.CPUs()
s.SetDefaultCPUSet(allCPUs)
return nil
}
// State has already been initialized from file (is not empty)
// 1. Check that all reserved CPUs are still present in the default cpuset.
//    They may be missing because:
//    - kube/system reservations have changed (increased), which may leave some containers unable to start
//    - the user tampered with the state file
if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
p.reserved.String(), tmpDefaultCPUset.String())
}
// 2. Check if state for static policy is consistent
for cID, cset := range tmpAssignments {
// No CPU from the default cpuset should also appear in s.assignments
if !tmpDefaultCPUset.Intersection(cset).IsEmpty() {
return fmt.Errorf("container id: %s cpuset: \"%s\" overlaps with default cpuset \"%s\"",
cID, cset.String(), tmpDefaultCPUset.String())
}
}
return nil
}
// assignableCPUs returns the set of unassigned CPUs minus the reserved set.
func (p *staticPolicy) assignableCPUs(s state.State) cpuset.CPUSet {
return s.GetDefaultCPUSet().Difference(p.reserved)
}
func (p *staticPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
glog.Infof("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
if numCPUs := guaranteedCPUs(pod, container); numCPUs != 0 {
// container belongs in an exclusively allocated pool
cpuset, err := p.allocateCPUs(s, numCPUs)
if err != nil {
glog.Errorf("[cpumanager] unable to allocate %d CPUs (container id: %s, error: %v)", numCPUs, containerID, err)
return err
}
s.SetCPUSet(containerID, cpuset)
}
// container belongs in the shared pool (nothing to do; use default cpuset)
return nil
}
func (p *staticPolicy) RemoveContainer(s state.State, containerID string) error {
glog.Infof("[cpumanager] static policy: RemoveContainer (container id: %s)", containerID)
if toRelease, ok := s.GetCPUSet(containerID); ok {
s.Delete(containerID)
// Mutate the shared pool, adding released cpus.
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
}
return nil
}
func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int) (cpuset.CPUSet, error) {
glog.Infof("[cpumanager] allocateCpus: (numCPUs: %d)", numCPUs)
result, err := takeByTopology(p.topology, p.assignableCPUs(s), numCPUs)
if err != nil {
return cpuset.NewCPUSet(), err
}
// Remove allocated CPUs from the shared CPUSet.
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result))
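// Note: together with SetCPUSet in AddContainer, this keeps the invariant
// that the default cpuset and the exclusive assignments never overlap
// (validateState re-checks this on restart).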
glog.Infof("[cpumanager] allocateCPUs: returning \"%v\"", result)
return result, nil
}
func guaranteedCPUs(pod *v1.Pod, container *v1.Container) int {
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
return 0
}
cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
// Safe downcast to do for all systems with < 2.1 billion CPUs.
// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
// https://golang.org/ref/spec#Numeric_types
return int(cpuQuantity.Value())
}
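// Worked examples of the integral-CPU check above (hypothetical requests):
//
//	"2000m": Value()=2, MilliValue()=2000, 2*1000 == 2000 -> 2 exclusive CPUs
//	"1500m": Value()=2 (rounded up), MilliValue()=1500, 2*1000 != 1500 -> 0,
//	         i.e. the container stays in the shared pool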


@ -0,0 +1,494 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"reflect"
"testing"
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
type staticPolicyTest struct {
description string
topo *topology.CPUTopology
numReservedCPUs int
containerID string
stAssignments state.ContainerCPUAssignments
stDefaultCPUSet cpuset.CPUSet
pod *v1.Pod
expErr error
expCPUAlloc bool
expCSet cpuset.CPUSet
expPanic bool
}
func TestStaticPolicyName(t *testing.T) {
policy := NewStaticPolicy(topoSingleSocketHT, 1)
policyName := policy.Name()
if policyName != "static" {
t.Errorf("StaticPolicy Name() error. expected: static, returned: %v",
policyName)
}
}
func TestStaticPolicyStart(t *testing.T) {
testCases := []staticPolicyTest{
{
description: "non-corrupted state",
topo: topoDualSocketHT,
stAssignments: state.ContainerCPUAssignments{
"0": cpuset.NewCPUSet(0),
},
stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
expCSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
},
{
description: "empty cpuset",
topo: topoDualSocketHT,
numReservedCPUs: 1,
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(),
expCSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
},
{
description: "reserved cores 0 & 6 are not present in available cpuset",
topo: topoDualSocketHT,
numReservedCPUs: 2,
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1),
expPanic: true,
},
{
description: "assigned core 2 is still present in available cpuset",
topo: topoDualSocketHT,
stAssignments: state.ContainerCPUAssignments{
"0": cpuset.NewCPUSet(0, 1, 2),
},
stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
expPanic: true,
},
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
defer func() {
if err := recover(); err != nil {
if !testCase.expPanic {
t.Errorf("unexpected panic occured: %q", err)
}
} else if testCase.expPanic {
t.Error("expected panic doesn't occured")
}
}()
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs).(*staticPolicy)
st := &mockState{
assignments: testCase.stAssignments,
defaultCPUSet: testCase.stDefaultCPUSet,
}
policy.Start(st)
if !testCase.stDefaultCPUSet.IsEmpty() {
for cpuid := 1; cpuid < policy.topology.NumCPUs; cpuid++ {
if !st.defaultCPUSet.Contains(cpuid) {
t.Errorf("StaticPolicy Start() error. expected cpuid %d to be present in defaultCPUSet", cpuid)
}
}
}
if !st.GetDefaultCPUSet().Equals(testCase.expCSet) {
t.Errorf("State CPUSet is different than expected. Have %q wants: %q", st.GetDefaultCPUSet(),
testCase.expCSet)
}
})
}
}
func TestStaticPolicyAdd(t *testing.T) {
largeTopoBuilder := cpuset.NewBuilder()
largeTopoSock0Builder := cpuset.NewBuilder()
largeTopoSock1Builder := cpuset.NewBuilder()
largeTopo := *topoQuadSocketFourWayHT
for cpuid, val := range largeTopo.CPUDetails {
largeTopoBuilder.Add(cpuid)
if val.SocketID == 0 {
largeTopoSock0Builder.Add(cpuid)
} else if val.SocketID == 1 {
largeTopoSock1Builder.Add(cpuid)
}
}
largeTopoCPUSet := largeTopoBuilder.Result()
largeTopoSock0CPUSet := largeTopoSock0Builder.Result()
largeTopoSock1CPUSet := largeTopoSock1Builder.Result()
testCases := []staticPolicyTest{
{
description: "GuPodSingleCore, SingleSocketHT, ExpectError",
topo: topoSingleSocketHT,
numReservedCPUs: 1,
containerID: "fakeID2",
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
pod: makePod("8000m", "8000m"),
expErr: fmt.Errorf("not enough cpus available to satisfy request"),
expCPUAlloc: false,
expCSet: cpuset.NewCPUSet(),
},
{
description: "GuPodSingleCore, SingleSocketHT, ExpectAllocOneCPU",
topo: topoSingleSocketHT,
numReservedCPUs: 1,
containerID: "fakeID2",
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
pod: makePod("1000m", "1000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(4), // expect sibling of partial core
},
{
description: "GuPodMultipleCores, SingleSocketHT, ExpectAllocOneCore",
topo: topoSingleSocketHT,
numReservedCPUs: 1,
containerID: "fakeID3",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(2, 3, 6, 7),
},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 4, 5),
pod: makePod("2000m", "2000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(1, 5),
},
{
description: "GuPodMultipleCores, DualSocketHT, ExpectAllocOneSocket",
topo: topoDualSocketHT,
numReservedCPUs: 1,
containerID: "fakeID3",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(2),
},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11),
pod: makePod("6000m", "6000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(1, 3, 5, 7, 9, 11),
},
{
description: "GuPodMultipleCores, DualSocketHT, ExpectAllocThreeCores",
topo: topoDualSocketHT,
numReservedCPUs: 1,
containerID: "fakeID3",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(1, 5),
},
stDefaultCPUSet: cpuset.NewCPUSet(0, 2, 3, 4, 6, 7, 8, 9, 10, 11),
pod: makePod("6000m", "6000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(2, 3, 4, 8, 9, 10),
},
{
description: "GuPodMultipleCores, DualSocketNoHT, ExpectAllocOneSocket",
topo: topoDualSocketNoHT,
numReservedCPUs: 1,
containerID: "fakeID1",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(),
},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 3, 4, 5, 6, 7),
pod: makePod("4000m", "4000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(4, 5, 6, 7),
},
{
description: "GuPodMultipleCores, DualSocketNoHT, ExpectAllocFourCores",
topo: topoDualSocketNoHT,
numReservedCPUs: 1,
containerID: "fakeID1",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(4, 5),
},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 3, 6, 7),
pod: makePod("4000m", "4000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(1, 3, 6, 7),
},
{
description: "GuPodMultipleCores, DualSocketHT, ExpectAllocOneSocketOneCore",
topo: topoDualSocketHT,
numReservedCPUs: 1,
containerID: "fakeID3",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(2),
},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11),
pod: makePod("8000m", "8000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(1, 3, 4, 5, 7, 9, 10, 11),
},
{
description: "NonGuPod, SingleSocketHT, NoAlloc",
topo: topoSingleSocketHT,
numReservedCPUs: 1,
containerID: "fakeID1",
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
pod: makePod("1000m", "2000m"),
expErr: nil,
expCPUAlloc: false,
expCSet: cpuset.NewCPUSet(),
},
{
description: "GuPodNonIntegerCore, SingleSocketHT, NoAlloc",
topo: topoSingleSocketHT,
numReservedCPUs: 1,
containerID: "fakeID4",
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
pod: makePod("977m", "977m"),
expErr: nil,
expCPUAlloc: false,
expCSet: cpuset.NewCPUSet(),
},
{
description: "GuPodMultipleCores, SingleSocketHT, NoAllocExpectError",
topo: topoSingleSocketHT,
numReservedCPUs: 1,
containerID: "fakeID5",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(1, 2, 3, 4, 5, 6),
},
stDefaultCPUSet: cpuset.NewCPUSet(0, 7),
pod: makePod("2000m", "2000m"),
expErr: fmt.Errorf("not enough cpus available to satisfy request"),
expCPUAlloc: false,
expCSet: cpuset.NewCPUSet(),
},
{
description: "GuPodMultipleCores, DualSocketHT, NoAllocExpectError",
topo: topoDualSocketHT,
numReservedCPUs: 1,
containerID: "fakeID5",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(1, 2, 3),
},
stDefaultCPUSet: cpuset.NewCPUSet(0, 4, 5, 6, 7, 8, 9, 10, 11),
pod: makePod("10000m", "10000m"),
expErr: fmt.Errorf("not enough cpus available to satisfy request"),
expCPUAlloc: false,
expCSet: cpuset.NewCPUSet(),
},
{
// All the CPUs from Socket 0 are available. Some CPUs from each
// Socket have already been assigned.
// Expect all CPUs from Socket 0.
description: "GuPodMultipleCores, topoQuadSocketFourWayHT, ExpectAllocSock0",
topo: topoQuadSocketFourWayHT,
containerID: "fakeID5",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": cpuset.NewCPUSet(3, 11, 4, 5, 6, 7),
},
stDefaultCPUSet: largeTopoCPUSet.Difference(cpuset.NewCPUSet(3, 11, 4, 5, 6, 7)),
pod: makePod("72000m", "72000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: largeTopoSock0CPUSet,
},
{
// Only 2 full cores from three Sockets and some partial cores are available.
// Expect CPUs from the 2 full cores available from the three Sockets.
description: "GuPodMultipleCores, topoQuadSocketFourWayHT, ExpectAllocAllFullCoresFromThreeSockets",
topo: topoQuadSocketFourWayHT,
containerID: "fakeID5",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": largeTopoCPUSet.Difference(cpuset.NewCPUSet(1, 25, 13, 38, 2, 9, 11, 35, 23, 48, 12, 51,
53, 173, 113, 233, 54, 61)),
},
stDefaultCPUSet: cpuset.NewCPUSet(1, 25, 13, 38, 2, 9, 11, 35, 23, 48, 12, 51, 53, 173, 113, 233, 54, 61),
pod: makePod("12000m", "12000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(1, 25, 13, 38, 11, 35, 23, 48, 53, 173, 113, 233),
},
{
// All CPUs from Socket 1, 1 full core and some partial cores are available.
// Expect all CPUs from Socket 1 and the hyper-threads from the full core.
description: "GuPodMultipleCores, topoQuadSocketFourWayHT, ExpectAllocAllSock1+FullCore",
topo: topoQuadSocketFourWayHT,
containerID: "fakeID5",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": largeTopoCPUSet.Difference(largeTopoSock1CPUSet.Union(cpuset.NewCPUSet(10, 34, 22, 47, 53,
173, 61, 181, 108, 228, 115, 235))),
},
stDefaultCPUSet: largeTopoSock1CPUSet.Union(cpuset.NewCPUSet(10, 34, 22, 47, 53, 173, 61, 181, 108, 228,
115, 235)),
pod: makePod("76000m", "76000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: largeTopoSock1CPUSet.Union(cpuset.NewCPUSet(10, 34, 22, 47)),
},
{
// Only partial cores are available in the entire system.
// Expect allocation of all the CPUs from the partial cores.
description: "GuPodMultipleCores, topoQuadSocketFourWayHT, ExpectAllocCPUs",
topo: topoQuadSocketFourWayHT,
containerID: "fakeID5",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": largeTopoCPUSet.Difference(cpuset.NewCPUSet(10, 11, 53, 37, 55, 67, 52)),
},
stDefaultCPUSet: cpuset.NewCPUSet(10, 11, 53, 67, 52),
pod: makePod("5000m", "5000m"),
expErr: nil,
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(10, 11, 53, 67, 52),
},
{
// Only 7 CPUs are available.
// The pod requests 76 CPUs.
// An error is expected since fewer CPUs are available than requested.
description: "GuPodMultipleCores, topoQuadSocketFourWayHT, NoAlloc",
topo: topoQuadSocketFourWayHT,
containerID: "fakeID5",
stAssignments: state.ContainerCPUAssignments{
"fakeID100": largeTopoCPUSet.Difference(cpuset.NewCPUSet(10, 11, 53, 37, 55, 67, 52)),
},
stDefaultCPUSet: cpuset.NewCPUSet(10, 11, 53, 37, 55, 67, 52),
pod: makePod("76000m", "76000m"),
expErr: fmt.Errorf("not enough cpus available to satisfy request"),
expCPUAlloc: false,
expCSet: cpuset.NewCPUSet(),
},
}
for _, testCase := range testCases {
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs)
st := &mockState{
assignments: testCase.stAssignments,
defaultCPUSet: testCase.stDefaultCPUSet,
}
container := &testCase.pod.Spec.Containers[0]
err := policy.AddContainer(st, testCase.pod, container, testCase.containerID)
if !reflect.DeepEqual(err, testCase.expErr) {
t.Errorf("StaticPolicy AddContainer() error (%v). expected add error: %v but got: %v",
testCase.description, testCase.expErr, err)
}
if testCase.expCPUAlloc {
cset, found := st.assignments[testCase.containerID]
if !found {
t.Errorf("StaticPolicy AddContainer() error (%v). expected container id %v to be present in assignments %v",
testCase.description, testCase.containerID, st.assignments)
}
if !reflect.DeepEqual(cset, testCase.expCSet) {
t.Errorf("StaticPolicy AddContainer() error (%v). expected cpuset %v but got %v",
testCase.description, testCase.expCSet, cset)
}
if !cset.Intersection(st.defaultCPUSet).IsEmpty() {
t.Errorf("StaticPolicy AddContainer() error (%v). expected cpuset %v to be disoint from the shared cpuset %v",
testCase.description, cset, st.defaultCPUSet)
}
}
if !testCase.expCPUAlloc {
_, found := st.assignments[testCase.containerID]
if found {
t.Errorf("StaticPolicy AddContainer() error (%v). Did not expect container id %v to be present in assignments %v",
testCase.description, testCase.containerID, st.assignments)
}
}
}
}
func TestStaticPolicyRemove(t *testing.T) {
testCases := []staticPolicyTest{
{
description: "SingleSocketHT, DeAllocOneContainer",
topo: topoSingleSocketHT,
containerID: "fakeID1",
stAssignments: state.ContainerCPUAssignments{
"fakeID1": cpuset.NewCPUSet(1, 2, 3),
},
stDefaultCPUSet: cpuset.NewCPUSet(4, 5, 6, 7),
expCSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
},
{
description: "SingleSocketHT, DeAllocOneContainer, BeginEmpty",
topo: topoSingleSocketHT,
containerID: "fakeID1",
stAssignments: state.ContainerCPUAssignments{
"fakeID1": cpuset.NewCPUSet(1, 2, 3),
"fakeID2": cpuset.NewCPUSet(4, 5, 6, 7),
},
stDefaultCPUSet: cpuset.NewCPUSet(),
expCSet: cpuset.NewCPUSet(1, 2, 3),
},
{
description: "SingleSocketHT, DeAllocTwoContainer",
topo: topoSingleSocketHT,
containerID: "fakeID1",
stAssignments: state.ContainerCPUAssignments{
"fakeID1": cpuset.NewCPUSet(1, 3, 5),
"fakeID2": cpuset.NewCPUSet(2, 4),
},
stDefaultCPUSet: cpuset.NewCPUSet(6, 7),
expCSet: cpuset.NewCPUSet(1, 3, 5, 6, 7),
},
{
description: "SingleSocketHT, NoDeAlloc",
topo: topoSingleSocketHT,
containerID: "fakeID2",
stAssignments: state.ContainerCPUAssignments{
"fakeID1": cpuset.NewCPUSet(1, 3, 5),
},
stDefaultCPUSet: cpuset.NewCPUSet(2, 4, 6, 7),
expCSet: cpuset.NewCPUSet(2, 4, 6, 7),
},
}
for _, testCase := range testCases {
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs)
st := &mockState{
assignments: testCase.stAssignments,
defaultCPUSet: testCase.stDefaultCPUSet,
}
policy.RemoveContainer(st, testCase.containerID)
if !reflect.DeepEqual(st.defaultCPUSet, testCase.expCSet) {
t.Errorf("StaticPolicy RemoveContainer() error (%v). expected default cpuset %v but got %v",
testCase.description, testCase.expCSet, st.defaultCPUSet)
}
if _, found := st.assignments[testCase.containerID]; found {
t.Errorf("StaticPolicy RemoveContainer() error (%v). expected containerID %v not be in assignments %v",
testCase.description, testCase.containerID, st.assignments)
}
}
}


@ -0,0 +1,390 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
)
var (
topoSingleSocketHT = &topology.CPUTopology{
NumCPUs: 8,
NumSockets: 1,
NumCores: 4,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 0},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 0},
4: {CoreID: 0, SocketID: 0},
5: {CoreID: 1, SocketID: 0},
6: {CoreID: 2, SocketID: 0},
7: {CoreID: 3, SocketID: 0},
},
}
topoDualSocketHT = &topology.CPUTopology{
NumCPUs: 12,
NumSockets: 2,
NumCores: 6,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 1},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 1},
4: {CoreID: 4, SocketID: 0},
5: {CoreID: 5, SocketID: 1},
6: {CoreID: 0, SocketID: 0},
7: {CoreID: 1, SocketID: 1},
8: {CoreID: 2, SocketID: 0},
9: {CoreID: 3, SocketID: 1},
10: {CoreID: 4, SocketID: 0},
11: {CoreID: 5, SocketID: 1},
},
}
topoDualSocketNoHT = &topology.CPUTopology{
NumCPUs: 8,
NumSockets: 2,
NumCores: 8,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 0},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 0},
4: {CoreID: 4, SocketID: 1},
5: {CoreID: 5, SocketID: 1},
6: {CoreID: 6, SocketID: 1},
7: {CoreID: 7, SocketID: 1},
},
}
/*
Topology from https://www.open-mpi.org/projects/hwloc/lstopo/images/KNL.SNC4.H50.v1.11.png.
Socket0:
0-2,9-10,13-14,21-22,25-26,33-34,38-39,46-47,50,57-58,71-72,79-80,87-88,95-96,103-104,109-110,117-118,
131-132,139-140,147-148,155-156,163-164,169-170,177-178,191-192,199-200,207-208,215-216,223-224,229-230,
237-238,251-252,259-260,267-268,275-276,283-284
Socket1:
3-4,11-12,15-16,23-24,27-28,35-36,40-41,48-49,51-52,59-60,65-66,73-74,81-82,89-90,97-98,111-112,119-120,125-126,
133-134,141-142,149-150,157-158,171-172,179-180,185-186,193-194,201-202,209-210,217-218,231-232,239-240,245-246,
253-254,261-262,269-270,277-278
Socket2:
5-6,17-18,29-30,42-43,53-54,61-62,67-68,75-76,83-84,91-92,99-100,105-106,113-114,121-122,127-128,135-136,
143-144,151-152,159-160,165-166,173-174,181-182,187-188,195-196,203-204,211-212,219-220,225-226,233-234,241-242,
247-248,255-256,263-264,271-272,279-280,285-286
Socket3:
7-8,19-20,31-32,37,44-45,55-56,63-64,69-70,77-78,85-86,93-94,101-102,107-108,115-116,123-124,129-130,137-138,
145-146,153-154,161-162,167-168,175-176,183-184,189-190,197-198,205-206,213-214,221-222,227-228,235-236,243-244,
249-250,257-258,265-266,273-274,281-282,287
*/
topoQuadSocketFourWayHT = &topology.CPUTopology{
NumCPUs: 288,
NumSockets: 4,
NumCores: 72,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0},
169: {CoreID: 0, SocketID: 0},
109: {CoreID: 0, SocketID: 0},
229: {CoreID: 0, SocketID: 0},
50: {CoreID: 1, SocketID: 0},
170: {CoreID: 1, SocketID: 0},
110: {CoreID: 1, SocketID: 0},
230: {CoreID: 1, SocketID: 0},
1: {CoreID: 64, SocketID: 0},
25: {CoreID: 64, SocketID: 0},
13: {CoreID: 64, SocketID: 0},
38: {CoreID: 64, SocketID: 0},
2: {CoreID: 65, SocketID: 0},
26: {CoreID: 65, SocketID: 0},
14: {CoreID: 65, SocketID: 0},
39: {CoreID: 65, SocketID: 0},
9: {CoreID: 72, SocketID: 0},
33: {CoreID: 72, SocketID: 0},
21: {CoreID: 72, SocketID: 0},
46: {CoreID: 72, SocketID: 0},
10: {CoreID: 73, SocketID: 0},
34: {CoreID: 73, SocketID: 0},
22: {CoreID: 73, SocketID: 0},
47: {CoreID: 73, SocketID: 0},
57: {CoreID: 8, SocketID: 0},
177: {CoreID: 8, SocketID: 0},
117: {CoreID: 8, SocketID: 0},
237: {CoreID: 8, SocketID: 0},
58: {CoreID: 9, SocketID: 0},
178: {CoreID: 9, SocketID: 0},
118: {CoreID: 9, SocketID: 0},
238: {CoreID: 9, SocketID: 0},
71: {CoreID: 24, SocketID: 0},
191: {CoreID: 24, SocketID: 0},
131: {CoreID: 24, SocketID: 0},
251: {CoreID: 24, SocketID: 0},
72: {CoreID: 25, SocketID: 0},
192: {CoreID: 25, SocketID: 0},
132: {CoreID: 25, SocketID: 0},
252: {CoreID: 25, SocketID: 0},
79: {CoreID: 32, SocketID: 0},
199: {CoreID: 32, SocketID: 0},
139: {CoreID: 32, SocketID: 0},
259: {CoreID: 32, SocketID: 0},
80: {CoreID: 33, SocketID: 0},
200: {CoreID: 33, SocketID: 0},
140: {CoreID: 33, SocketID: 0},
260: {CoreID: 33, SocketID: 0},
87: {CoreID: 40, SocketID: 0},
207: {CoreID: 40, SocketID: 0},
147: {CoreID: 40, SocketID: 0},
267: {CoreID: 40, SocketID: 0},
88: {CoreID: 41, SocketID: 0},
208: {CoreID: 41, SocketID: 0},
148: {CoreID: 41, SocketID: 0},
268: {CoreID: 41, SocketID: 0},
95: {CoreID: 48, SocketID: 0},
215: {CoreID: 48, SocketID: 0},
155: {CoreID: 48, SocketID: 0},
275: {CoreID: 48, SocketID: 0},
96: {CoreID: 49, SocketID: 0},
216: {CoreID: 49, SocketID: 0},
156: {CoreID: 49, SocketID: 0},
276: {CoreID: 49, SocketID: 0},
103: {CoreID: 56, SocketID: 0},
223: {CoreID: 56, SocketID: 0},
163: {CoreID: 56, SocketID: 0},
283: {CoreID: 56, SocketID: 0},
104: {CoreID: 57, SocketID: 0},
224: {CoreID: 57, SocketID: 0},
164: {CoreID: 57, SocketID: 0},
284: {CoreID: 57, SocketID: 0},
3: {CoreID: 66, SocketID: 1},
27: {CoreID: 66, SocketID: 1},
15: {CoreID: 66, SocketID: 1},
40: {CoreID: 66, SocketID: 1},
4: {CoreID: 67, SocketID: 1},
28: {CoreID: 67, SocketID: 1},
16: {CoreID: 67, SocketID: 1},
41: {CoreID: 67, SocketID: 1},
11: {CoreID: 74, SocketID: 1},
35: {CoreID: 74, SocketID: 1},
23: {CoreID: 74, SocketID: 1},
48: {CoreID: 74, SocketID: 1},
12: {CoreID: 75, SocketID: 1},
36: {CoreID: 75, SocketID: 1},
24: {CoreID: 75, SocketID: 1},
49: {CoreID: 75, SocketID: 1},
51: {CoreID: 2, SocketID: 1},
171: {CoreID: 2, SocketID: 1},
111: {CoreID: 2, SocketID: 1},
231: {CoreID: 2, SocketID: 1},
52: {CoreID: 3, SocketID: 1},
172: {CoreID: 3, SocketID: 1},
112: {CoreID: 3, SocketID: 1},
232: {CoreID: 3, SocketID: 1},
59: {CoreID: 10, SocketID: 1},
179: {CoreID: 10, SocketID: 1},
119: {CoreID: 10, SocketID: 1},
239: {CoreID: 10, SocketID: 1},
60: {CoreID: 11, SocketID: 1},
180: {CoreID: 11, SocketID: 1},
120: {CoreID: 11, SocketID: 1},
240: {CoreID: 11, SocketID: 1},
65: {CoreID: 18, SocketID: 1},
185: {CoreID: 18, SocketID: 1},
125: {CoreID: 18, SocketID: 1},
245: {CoreID: 18, SocketID: 1},
66: {CoreID: 19, SocketID: 1},
186: {CoreID: 19, SocketID: 1},
126: {CoreID: 19, SocketID: 1},
246: {CoreID: 19, SocketID: 1},
73: {CoreID: 26, SocketID: 1},
193: {CoreID: 26, SocketID: 1},
133: {CoreID: 26, SocketID: 1},
253: {CoreID: 26, SocketID: 1},
74: {CoreID: 27, SocketID: 1},
194: {CoreID: 27, SocketID: 1},
134: {CoreID: 27, SocketID: 1},
254: {CoreID: 27, SocketID: 1},
81: {CoreID: 34, SocketID: 1},
201: {CoreID: 34, SocketID: 1},
141: {CoreID: 34, SocketID: 1},
261: {CoreID: 34, SocketID: 1},
82: {CoreID: 35, SocketID: 1},
202: {CoreID: 35, SocketID: 1},
142: {CoreID: 35, SocketID: 1},
262: {CoreID: 35, SocketID: 1},
89: {CoreID: 42, SocketID: 1},
209: {CoreID: 42, SocketID: 1},
149: {CoreID: 42, SocketID: 1},
269: {CoreID: 42, SocketID: 1},
90: {CoreID: 43, SocketID: 1},
210: {CoreID: 43, SocketID: 1},
150: {CoreID: 43, SocketID: 1},
270: {CoreID: 43, SocketID: 1},
97: {CoreID: 50, SocketID: 1},
217: {CoreID: 50, SocketID: 1},
157: {CoreID: 50, SocketID: 1},
277: {CoreID: 50, SocketID: 1},
98: {CoreID: 51, SocketID: 1},
218: {CoreID: 51, SocketID: 1},
158: {CoreID: 51, SocketID: 1},
278: {CoreID: 51, SocketID: 1},
5: {CoreID: 68, SocketID: 2},
29: {CoreID: 68, SocketID: 2},
17: {CoreID: 68, SocketID: 2},
42: {CoreID: 68, SocketID: 2},
6: {CoreID: 69, SocketID: 2},
30: {CoreID: 69, SocketID: 2},
18: {CoreID: 69, SocketID: 2},
43: {CoreID: 69, SocketID: 2},
53: {CoreID: 4, SocketID: 2},
173: {CoreID: 4, SocketID: 2},
113: {CoreID: 4, SocketID: 2},
233: {CoreID: 4, SocketID: 2},
54: {CoreID: 5, SocketID: 2},
174: {CoreID: 5, SocketID: 2},
114: {CoreID: 5, SocketID: 2},
234: {CoreID: 5, SocketID: 2},
61: {CoreID: 12, SocketID: 2},
181: {CoreID: 12, SocketID: 2},
121: {CoreID: 12, SocketID: 2},
241: {CoreID: 12, SocketID: 2},
62: {CoreID: 13, SocketID: 2},
182: {CoreID: 13, SocketID: 2},
122: {CoreID: 13, SocketID: 2},
242: {CoreID: 13, SocketID: 2},
67: {CoreID: 20, SocketID: 2},
187: {CoreID: 20, SocketID: 2},
127: {CoreID: 20, SocketID: 2},
247: {CoreID: 20, SocketID: 2},
68: {CoreID: 21, SocketID: 2},
188: {CoreID: 21, SocketID: 2},
128: {CoreID: 21, SocketID: 2},
248: {CoreID: 21, SocketID: 2},
75: {CoreID: 28, SocketID: 2},
195: {CoreID: 28, SocketID: 2},
135: {CoreID: 28, SocketID: 2},
255: {CoreID: 28, SocketID: 2},
76: {CoreID: 29, SocketID: 2},
196: {CoreID: 29, SocketID: 2},
136: {CoreID: 29, SocketID: 2},
256: {CoreID: 29, SocketID: 2},
83: {CoreID: 36, SocketID: 2},
203: {CoreID: 36, SocketID: 2},
143: {CoreID: 36, SocketID: 2},
263: {CoreID: 36, SocketID: 2},
84: {CoreID: 37, SocketID: 2},
204: {CoreID: 37, SocketID: 2},
144: {CoreID: 37, SocketID: 2},
264: {CoreID: 37, SocketID: 2},
91: {CoreID: 44, SocketID: 2},
211: {CoreID: 44, SocketID: 2},
151: {CoreID: 44, SocketID: 2},
271: {CoreID: 44, SocketID: 2},
92: {CoreID: 45, SocketID: 2},
212: {CoreID: 45, SocketID: 2},
152: {CoreID: 45, SocketID: 2},
272: {CoreID: 45, SocketID: 2},
99: {CoreID: 52, SocketID: 2},
219: {CoreID: 52, SocketID: 2},
159: {CoreID: 52, SocketID: 2},
279: {CoreID: 52, SocketID: 2},
100: {CoreID: 53, SocketID: 2},
220: {CoreID: 53, SocketID: 2},
160: {CoreID: 53, SocketID: 2},
280: {CoreID: 53, SocketID: 2},
105: {CoreID: 60, SocketID: 2},
225: {CoreID: 60, SocketID: 2},
165: {CoreID: 60, SocketID: 2},
285: {CoreID: 60, SocketID: 2},
106: {CoreID: 61, SocketID: 2},
226: {CoreID: 61, SocketID: 2},
166: {CoreID: 61, SocketID: 2},
286: {CoreID: 61, SocketID: 2},
7: {CoreID: 70, SocketID: 3},
31: {CoreID: 70, SocketID: 3},
19: {CoreID: 70, SocketID: 3},
44: {CoreID: 70, SocketID: 3},
8: {CoreID: 71, SocketID: 3},
32: {CoreID: 71, SocketID: 3},
20: {CoreID: 71, SocketID: 3},
45: {CoreID: 71, SocketID: 3},
37: {CoreID: 63, SocketID: 3},
168: {CoreID: 63, SocketID: 3},
108: {CoreID: 63, SocketID: 3},
228: {CoreID: 63, SocketID: 3},
107: {CoreID: 62, SocketID: 3},
227: {CoreID: 62, SocketID: 3},
167: {CoreID: 62, SocketID: 3},
287: {CoreID: 62, SocketID: 3},
55: {CoreID: 6, SocketID: 3},
175: {CoreID: 6, SocketID: 3},
115: {CoreID: 6, SocketID: 3},
235: {CoreID: 6, SocketID: 3},
56: {CoreID: 7, SocketID: 3},
176: {CoreID: 7, SocketID: 3},
116: {CoreID: 7, SocketID: 3},
236: {CoreID: 7, SocketID: 3},
63: {CoreID: 14, SocketID: 3},
183: {CoreID: 14, SocketID: 3},
123: {CoreID: 14, SocketID: 3},
243: {CoreID: 14, SocketID: 3},
64: {CoreID: 15, SocketID: 3},
184: {CoreID: 15, SocketID: 3},
124: {CoreID: 15, SocketID: 3},
244: {CoreID: 15, SocketID: 3},
69: {CoreID: 22, SocketID: 3},
189: {CoreID: 22, SocketID: 3},
129: {CoreID: 22, SocketID: 3},
249: {CoreID: 22, SocketID: 3},
70: {CoreID: 23, SocketID: 3},
190: {CoreID: 23, SocketID: 3},
130: {CoreID: 23, SocketID: 3},
250: {CoreID: 23, SocketID: 3},
77: {CoreID: 30, SocketID: 3},
197: {CoreID: 30, SocketID: 3},
137: {CoreID: 30, SocketID: 3},
257: {CoreID: 30, SocketID: 3},
78: {CoreID: 31, SocketID: 3},
198: {CoreID: 31, SocketID: 3},
138: {CoreID: 31, SocketID: 3},
258: {CoreID: 31, SocketID: 3},
85: {CoreID: 38, SocketID: 3},
205: {CoreID: 38, SocketID: 3},
145: {CoreID: 38, SocketID: 3},
265: {CoreID: 38, SocketID: 3},
86: {CoreID: 39, SocketID: 3},
206: {CoreID: 39, SocketID: 3},
146: {CoreID: 39, SocketID: 3},
266: {CoreID: 39, SocketID: 3},
93: {CoreID: 46, SocketID: 3},
213: {CoreID: 46, SocketID: 3},
153: {CoreID: 46, SocketID: 3},
273: {CoreID: 46, SocketID: 3},
94: {CoreID: 47, SocketID: 3},
214: {CoreID: 47, SocketID: 3},
154: {CoreID: 47, SocketID: 3},
274: {CoreID: 47, SocketID: 3},
101: {CoreID: 54, SocketID: 3},
221: {CoreID: 54, SocketID: 3},
161: {CoreID: 54, SocketID: 3},
281: {CoreID: 54, SocketID: 3},
102: {CoreID: 55, SocketID: 3},
222: {CoreID: 55, SocketID: 3},
162: {CoreID: 55, SocketID: 3},
282: {CoreID: 55, SocketID: 3},
},
}
)
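// A sketch of how tests recover per-socket CPU lists from these fixtures
// (TestStaticPolicyAdd does essentially this with cpuset.Builder):
//
//	b := cpuset.NewBuilder()
//	for id, info := range topoQuadSocketFourWayHT.CPUDetails {
//		if info.SocketID == 0 {
//			b.Add(id)
//		}
//	}
//	sock0 := b.Result() // all 72 hardware threads on socket 0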


@ -0,0 +1,38 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = [
"state.go",
"state_file.go",
"state_mem.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state",
visibility = ["//visibility:public"],
deps = [
"//pkg/kubelet/cm/cpuset:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
],
)
go_test(
name = "go_default_test",
srcs = ["state_file_test.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state",
library = ":go_default_library",
deps = ["//pkg/kubelet/cm/cpuset:go_default_library"],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)


@ -0,0 +1,55 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// ContainerCPUAssignments is the type used in the cpu manager state to map container IDs to assigned CPU sets
type ContainerCPUAssignments map[string]cpuset.CPUSet
// Clone returns a copy of ContainerCPUAssignments
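// The copy is per-entry; cpuset.CPUSet values are treated as immutable in
// this package, so copying the map entries is sufficient.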
func (as ContainerCPUAssignments) Clone() ContainerCPUAssignments {
ret := make(ContainerCPUAssignments)
for key, val := range as {
ret[key] = val
}
return ret
}
// Reader interface used to read current cpu/pod assignment state
type Reader interface {
GetCPUSet(containerID string) (cpuset.CPUSet, bool)
GetDefaultCPUSet() cpuset.CPUSet
GetCPUSetOrDefault(containerID string) cpuset.CPUSet
GetCPUAssignments() ContainerCPUAssignments
}
type writer interface {
SetCPUSet(containerID string, cpuset cpuset.CPUSet)
SetDefaultCPUSet(cpuset cpuset.CPUSet)
SetCPUAssignments(ContainerCPUAssignments)
Delete(containerID string)
ClearState()
}
// State interface provides methods for tracking and setting cpu/pod assignment
type State interface {
Reader
writer
}


@ -0,0 +1,204 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"encoding/json"
"fmt"
"github.com/golang/glog"
"io/ioutil"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"os"
"sync"
)
type stateFileData struct {
PolicyName string `json:"policyName"`
DefaultCPUSet string `json:"defaultCpuSet"`
Entries map[string]string `json:"entries,omitempty"`
}
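// An example of the serialized form (hypothetical container ID and sets):
//
//	{"policyName":"static","defaultCpuSet":"0,6-7","entries":{"<container-id>":"1-5"}}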
var _ State = &stateFile{}
type stateFile struct {
sync.RWMutex
stateFilePath string
policyName string
cache State
}
// NewFileState creates a new State for keeping track of cpu/pod assignments with a file backend
func NewFileState(filePath string, policyName string) State {
stateFile := &stateFile{
stateFilePath: filePath,
cache: NewMemoryState(),
policyName: policyName,
}
if err := stateFile.tryRestoreState(); err != nil {
// could not restore state, init new state file
glog.Infof("[cpumanager] state file: initializing empty state file - reason: \"%s\"", err)
stateFile.cache.ClearState()
stateFile.storeState()
}
return stateFile
}
// tryRestoreState tries to read the state file; upon any error, the error
// message is logged and the state is left clean and uninitialized
func (sf *stateFile) tryRestoreState() error {
sf.Lock()
defer sf.Unlock()
var err error
// temporary values, committed to the cache only when all parsing is ok
tmpAssignments := make(ContainerCPUAssignments)
tmpDefaultCPUSet := cpuset.NewCPUSet()
tmpContainerCPUSet := cpuset.NewCPUSet()
var content []byte
if content, err = ioutil.ReadFile(sf.stateFilePath); os.IsNotExist(err) {
// Create file
if _, err = os.Create(sf.stateFilePath); err != nil {
glog.Errorf("[cpumanager] state file: unable to create state file \"%s\":%s", sf.stateFilePath, err.Error())
panic("[cpumanager] state file not created")
}
glog.Infof("[cpumanager] state file: created empty state file \"%s\"", sf.stateFilePath)
} else {
// File exists - try to read
var readState stateFileData
if err = json.Unmarshal(content, &readState); err != nil {
glog.Warningf("[cpumanager] state file: could not unmarshal, corrupted state file - \"%s\"", sf.stateFilePath)
return err
}
if sf.policyName != readState.PolicyName {
return fmt.Errorf("policy configured \"%s\" != policy from state file \"%s\"", sf.policyName, readState.PolicyName)
}
if tmpDefaultCPUSet, err = cpuset.Parse(readState.DefaultCPUSet); err != nil {
glog.Warningf("[cpumanager] state file: could not parse state file - [defaultCpuSet:\"%s\"]", readState.DefaultCPUSet)
return err
}
for containerID, cpuString := range readState.Entries {
if tmpContainerCPUSet, err = cpuset.Parse(cpuString); err != nil {
glog.Warningf("[cpumanager] state file: could not parse state file - container id: %s, cpuset: \"%s\"", containerID, cpuString)
return err
}
tmpAssignments[containerID] = tmpContainerCPUSet
}
sf.cache.SetDefaultCPUSet(tmpDefaultCPUSet)
sf.cache.SetCPUAssignments(tmpAssignments)
glog.V(2).Infof("[cpumanager] state file: restored state from state file \"%s\"", sf.stateFilePath)
glog.V(2).Infof("[cpumanager] state file: defaultCPUSet: %s", tmpDefaultCPUSet.String())
}
return nil
}
// storeState saves state to the file; the caller is responsible for locking
func (sf *stateFile) storeState() {
var content []byte
var err error
data := stateFileData{
PolicyName: sf.policyName,
DefaultCPUSet: sf.cache.GetDefaultCPUSet().String(),
Entries: map[string]string{},
}
for containerID, cset := range sf.cache.GetCPUAssignments() {
data.Entries[containerID] = cset.String()
}
if content, err = json.Marshal(data); err != nil {
panic("[cpumanager] state file: could not serialize state to json")
}
if err = ioutil.WriteFile(sf.stateFilePath, content, 0644); err != nil {
panic("[cpumanager] state file not written")
}
}
func (sf *stateFile) GetCPUSet(containerID string) (cpuset.CPUSet, bool) {
sf.RLock()
defer sf.RUnlock()
res, ok := sf.cache.GetCPUSet(containerID)
return res, ok
}
func (sf *stateFile) GetDefaultCPUSet() cpuset.CPUSet {
sf.RLock()
defer sf.RUnlock()
return sf.cache.GetDefaultCPUSet()
}
func (sf *stateFile) GetCPUSetOrDefault(containerID string) cpuset.CPUSet {
sf.RLock()
defer sf.RUnlock()
return sf.cache.GetCPUSetOrDefault(containerID)
}
func (sf *stateFile) GetCPUAssignments() ContainerCPUAssignments {
sf.RLock()
defer sf.RUnlock()
return sf.cache.GetCPUAssignments()
}
func (sf *stateFile) SetCPUSet(containerID string, cset cpuset.CPUSet) {
sf.Lock()
defer sf.Unlock()
sf.cache.SetCPUSet(containerID, cset)
sf.storeState()
}
func (sf *stateFile) SetDefaultCPUSet(cset cpuset.CPUSet) {
sf.Lock()
defer sf.Unlock()
sf.cache.SetDefaultCPUSet(cset)
sf.storeState()
}
func (sf *stateFile) SetCPUAssignments(a ContainerCPUAssignments) {
sf.Lock()
defer sf.Unlock()
sf.cache.SetCPUAssignments(a)
sf.storeState()
}
func (sf *stateFile) Delete(containerID string) {
sf.Lock()
defer sf.Unlock()
sf.cache.Delete(containerID)
sf.storeState()
}
func (sf *stateFile) ClearState() {
sf.Lock()
defer sf.Unlock()
sf.cache.ClearState()
sf.storeState()
}
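// Typical construction (path and directory hypothetical), mirroring what the
// cpumanager does at kubelet startup:
//
//	st := NewFileState(path.Join(stateDir, "cpu_manager_state"), "static")
//	st.SetDefaultCPUSet(cpuset.NewCPUSet(0, 1, 2, 3))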


@ -0,0 +1,481 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"bytes"
"flag"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"reflect"
"strings"
"testing"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
func writeToStateFile(statefile string, content string) {
ioutil.WriteFile(statefile, []byte(content), 0644)
}
func stateEqual(t *testing.T, sf State, sm State) {
cpusetSf := sf.GetDefaultCPUSet()
cpusetSm := sm.GetDefaultCPUSet()
if !cpusetSf.Equals(cpusetSm) {
t.Errorf("State CPUSet mismatch. Have %v, want %v", cpusetSf, cpusetSm)
}
cpuassignmentSf := sf.GetCPUAssignments()
cpuassignmentSm := sm.GetCPUAssignments()
if !reflect.DeepEqual(cpuassignmentSf, cpuassignmentSm) {
t.Errorf("State CPU assigments mismatch. Have %s, want %s", cpuassignmentSf, cpuassignmentSm)
}
}
func stderrCapture(t *testing.T, f func() State) (bytes.Buffer, State) {
stderr := os.Stderr
readBuffer, writeBuffer, err := os.Pipe()
if err != nil {
t.Errorf("cannot create pipe: %v", err.Error())
}
os.Stderr = writeBuffer
var outputBuffer bytes.Buffer
state := f()
writeBuffer.Close()
io.Copy(&outputBuffer, readBuffer)
os.Stderr = stderr
return outputBuffer, state
}
func TestFileStateTryRestore(t *testing.T) {
flag.Set("alsologtostderr", "true")
flag.Parse()
testCases := []struct {
description string
stateFileContent string
policyName string
expErr string
expectedState *stateMemory
}{
{
"Invalid JSON - empty file",
"\n",
"none",
"state file: could not unmarshal, corrupted state file",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"Invalid JSON - invalid content",
"{",
"none",
"state file: could not unmarshal, corrupted state file",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"Try restore defaultCPUSet only",
`{"policyName": "none", "defaultCpuSet": "4-6"}`,
"none",
"",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(4, 5, 6),
},
},
{
"Try restore defaultCPUSet only - invalid name",
`{"policyName": "none", "defaultCpuSet" "4-6"}`,
"none",
"",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"Try restore assignments only",
`{
"policyName": "none",
"entries": {
"container1": "4-6",
"container2": "1-3"
}
}`,
"none",
"",
&stateMemory{
assignments: ContainerCPUAssignments{
"container1": cpuset.NewCPUSet(4, 5, 6),
"container2": cpuset.NewCPUSet(1, 2, 3),
},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"Try restore invalid policy name",
`{
"policyName": "A",
"defaultCpuSet": "0-7",
"entries": {}
}`,
"B",
"policy configured \"B\" != policy from state file \"A\"",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"Try restore invalid assignments",
`{"entries": }`,
"none",
"state file: could not unmarshal, corrupted state file",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"Try restore valid file",
`{
"policyName": "none",
"defaultCpuSet": "23-24",
"entries": {
"container1": "4-6",
"container2": "1-3"
}
}`,
"none",
"",
&stateMemory{
assignments: ContainerCPUAssignments{
"container1": cpuset.NewCPUSet(4, 5, 6),
"container2": cpuset.NewCPUSet(1, 2, 3),
},
defaultCPUSet: cpuset.NewCPUSet(23, 24),
},
},
{
"Try restore un-parsable defaultCPUSet ",
`{
"policyName": "none",
"defaultCpuSet": "2-sd"
}`,
"none",
"state file: could not parse state file",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"Try restore un-parsable assignments",
`{
"policyName": "none",
"defaultCpuSet": "23-24",
"entries": {
"container1": "p-6",
"container2": "1-3"
}
}`,
"none",
"state file: could not parse state file",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"TryRestoreState creates empty state file",
"",
"none",
"",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
}
for idx, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
sfilePath, err := ioutil.TempFile("/tmp", fmt.Sprintf("cpumanager_state_file_test_%d", idx))
if err != nil {
t.Errorf("cannot create temporary file: %q", err.Error())
}
// Don't create the state file; let TryRestoreState figure out that it should create one
if tc.stateFileContent != "" {
writeToStateFile(sfilePath.Name(), tc.stateFileContent)
}
// Always remove the file - regardless of who created it
defer os.Remove(sfilePath.Name())
logData, fileState := stderrCapture(t, func() State {
return NewFileState(sfilePath.Name(), tc.policyName)
})
if tc.expErr != "" {
if logData.String() != "" {
if !strings.Contains(logData.String(), tc.expErr) {
t.Errorf("TryRestoreState() error = %v, wantErr %v", logData.String(), tc.expErr)
return
}
} else {
t.Errorf("TryRestoreState() error = nil, wantErr %v", tc.expErr)
return
}
}
stateEqual(t, fileState, tc.expectedState)
})
}
}
func TestFileStateTryRestorePanic(t *testing.T) {
testCase := struct {
description string
wantPanic bool
panicMessage string
}{
"Panic creating file",
true,
"[cpumanager] state file not created",
}
t.Run(testCase.description, func(t *testing.T) {
sfilePath := path.Join("/invalid_path/to_some_dir", "cpumanager_state_file_test")
defer func() {
if err := recover(); err != nil {
if testCase.wantPanic {
if testCase.panicMessage == err {
t.Logf("TryRestoreState() got expected panic = %v", err)
return
}
t.Errorf("TryRestoreState() unexpected panic = %v, wantErr %v", err, testCase.panicMessage)
}
}
}()
NewFileState(sfilePath, "static")
})
}
func TestUpdateStateFile(t *testing.T) {
flag.Set("alsologtostderr", "true")
flag.Parse()
testCases := []struct {
description string
expErr string
expectedState *stateMemory
}{
{
"Save empty state",
"",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
{
"Save defaultCPUSet only",
"",
&stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(1, 6),
},
},
{
"Save assignments only",
"",
&stateMemory{
assignments: ContainerCPUAssignments{
"container1": cpuset.NewCPUSet(4, 5, 6),
"container2": cpuset.NewCPUSet(1, 2, 3),
},
defaultCPUSet: cpuset.NewCPUSet(),
},
},
}
for idx, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
sfilePath, err := ioutil.TempFile("/tmp", fmt.Sprintf("cpumanager_state_file_test_%d", idx))
if err != nil {
t.Fatalf("cannot create temporary file: %q", err.Error())
}
defer os.Remove(sfilePath.Name())
fileState := stateFile{
stateFilePath: sfilePath.Name(),
policyName: "static",
cache: NewMemoryState(),
}
fileState.SetDefaultCPUSet(tc.expectedState.defaultCPUSet)
fileState.SetCPUAssignments(tc.expectedState.assignments)
logData, _ := stderrCapture(t, func() State {
fileState.storeState()
return &stateFile{}
})
errMsg := logData.String()
if tc.expErr != "" {
if errMsg != "" {
if errMsg != tc.expErr {
t.Errorf("UpdateStateFile() error = %v, wantErr %v", errMsg, tc.expErr)
return
}
} else {
t.Errorf("UpdateStateFile() error = nil, wantErr %v", tc.expErr)
return
}
} else {
if errMsg != "" {
t.Errorf("UpdateStateFile() error = %v, wantErr nil", errMsg)
return
}
}
newFileState := NewFileState(sfilePath.Name(), "static")
stateEqual(t, newFileState, tc.expectedState)
})
}
}
func TestHelpersStateFile(t *testing.T) {
testCases := []struct {
description string
defaultCPUset cpuset.CPUSet
containers map[string]cpuset.CPUSet
}{
{
description: "one container",
defaultCPUset: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8),
containers: map[string]cpuset.CPUSet{
"c1": cpuset.NewCPUSet(0, 1),
},
},
{
description: "two containers",
defaultCPUset: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8),
containers: map[string]cpuset.CPUSet{
"c1": cpuset.NewCPUSet(0, 1),
"c2": cpuset.NewCPUSet(2, 3, 4, 5),
},
},
{
description: "container with more cpus than is possible",
defaultCPUset: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8),
containers: map[string]cpuset.CPUSet{
"c1": cpuset.NewCPUSet(0, 10),
},
},
{
description: "container without assigned cpus",
defaultCPUset: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8),
containers: map[string]cpuset.CPUSet{
"c1": cpuset.NewCPUSet(),
},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
sfFile, err := ioutil.TempFile("/tmp", "testHelpersStateFile")
if err != nil {
t.Fatalf("cannot create temporary test file: %q", err.Error())
}
defer os.Remove(sfFile.Name())
state := NewFileState(sfFile.Name(), "static")
state.SetDefaultCPUSet(tc.defaultCPUset)
for containerName, containerCPUs := range tc.containers {
state.SetCPUSet(containerName, containerCPUs)
if cpus, _ := state.GetCPUSet(containerName); !cpus.Equals(containerCPUs) {
t.Errorf("state is inconsistant. Wants = %q Have = %q", containerCPUs, cpus)
}
state.Delete(containerName)
if cpus := state.GetCPUSetOrDefault(containerName); !cpus.Equals(tc.defaultCPUset) {
t.Error("deleted container still existing in state")
}
}
})
}
}
func TestClearStateStateFile(t *testing.T) {
testCases := []struct {
description string
defaultCPUset cpuset.CPUSet
containers map[string]cpuset.CPUSet
}{
{
description: "valid file",
defaultCPUset: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8),
containers: map[string]cpuset.CPUSet{
"c1": cpuset.NewCPUSet(0, 1),
"c2": cpuset.NewCPUSet(2, 3),
"c3": cpuset.NewCPUSet(4, 5),
},
},
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
sfFile, err := ioutil.TempFile("/tmp", "testHelpersStateFile")
if err != nil {
t.Fatalf("cannot create temporary test file: %q", err.Error())
}
defer os.Remove(sfFile.Name())
state := NewFileState(sfFile.Name(), "static")
state.SetDefaultCPUSet(testCase.defaultCPUset)
for containerName, containerCPUs := range testCase.containers {
state.SetCPUSet(containerName, containerCPUs)
}
state.ClearState()
if !cpuset.NewCPUSet().Equals(state.GetDefaultCPUSet()) {
t.Error("cleared state shoudn't has got information about available cpuset")
}
for containerName := range testCase.containers {
if !cpuset.NewCPUSet().Equals(state.GetCPUSetOrDefault(containerName)) {
t.Error("cleared state shoudn't has got information about containers")
}
}
})
}
}

View File

@ -0,0 +1,113 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"sync"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
type stateMemory struct {
sync.RWMutex
assignments ContainerCPUAssignments
defaultCPUSet cpuset.CPUSet
}
var _ State = &stateMemory{}
// NewMemoryState creates a new State for keeping track of CPU/pod assignments
func NewMemoryState() State {
glog.Infof("[cpumanager] initializing new in-memory state store")
return &stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.NewCPUSet(),
}
}
func (s *stateMemory) GetCPUSet(containerID string) (cpuset.CPUSet, bool) {
s.RLock()
defer s.RUnlock()
res, ok := s.assignments[containerID]
return res.Clone(), ok
}
func (s *stateMemory) GetDefaultCPUSet() cpuset.CPUSet {
s.RLock()
defer s.RUnlock()
return s.defaultCPUSet.Clone()
}
func (s *stateMemory) GetCPUSetOrDefault(containerID string) cpuset.CPUSet {
s.RLock()
defer s.RUnlock()
if res, ok := s.GetCPUSet(containerID); ok {
return res
}
return s.GetDefaultCPUSet()
}
func (s *stateMemory) GetCPUAssignments() ContainerCPUAssignments {
s.RLock()
defer s.RUnlock()
return s.assignments.Clone()
}
func (s *stateMemory) SetCPUSet(containerID string, cset cpuset.CPUSet) {
s.Lock()
defer s.Unlock()
s.assignments[containerID] = cset
glog.Infof("[cpumanager] updated desired cpuset (container id: %s, cpuset: \"%s\")", containerID, cset)
}
func (s *stateMemory) SetDefaultCPUSet(cset cpuset.CPUSet) {
s.Lock()
defer s.Unlock()
s.defaultCPUSet = cset
glog.Infof("[cpumanager] updated default cpuset: \"%s\"", cset)
}
func (s *stateMemory) SetCPUAssignments(a ContainerCPUAssignments) {
s.Lock()
defer s.Unlock()
s.assignments = a.Clone()
glog.Infof("[cpumanager] updated cpuset assignments: \"%v\"", a)
}
func (s *stateMemory) Delete(containerID string) {
s.Lock()
defer s.Unlock()
delete(s.assignments, containerID)
glog.V(2).Infof("[cpumanager] deleted cpuset assignment (container id: %s)", containerID)
}
func (s *stateMemory) ClearState() {
s.Lock()
defer s.Unlock()
s.defaultCPUSet = cpuset.CPUSet{}
s.assignments = make(ContainerCPUAssignments)
glog.V(2).Infof("[cpumanager] cleared state")
}
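For comparison, a short sketch (added here, not in the commit) of the in-memory store above, highlighting the GetCPUSetOrDefault fallback; nothing is persisted, and the import paths are assumed as in the earlier sketch.
package main
import (
    "fmt"

    "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
    "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
func main() {
    s := state.NewMemoryState()
    s.SetDefaultCPUSet(cpuset.NewCPUSet(0, 1, 2, 3))
    s.SetCPUSet("c1", cpuset.NewCPUSet(2, 3))
    fmt.Println(s.GetCPUSetOrDefault("c1")) // "2-3": explicit assignment wins
    fmt.Println(s.GetCPUSetOrDefault("c2")) // "0-3": falls back to the default set
}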

View File

@ -0,0 +1,38 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = [
"doc.go",
"topology.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology",
visibility = ["//visibility:public"],
deps = [
"//pkg/kubelet/cm/cpuset:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)
go_test(
name = "go_default_test",
srcs = ["topology_test.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology",
library = ":go_default_library",
deps = ["//vendor/github.com/google/cadvisor/info/v1:go_default_library"],
)

View File

@ -0,0 +1,18 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package topology contains helpers for the CPU manager.
package topology // import "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"

View File

@ -0,0 +1,197 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topology
import (
"fmt"
"sort"
"github.com/golang/glog"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// CPUDetails is a map from CPU ID to Core ID and Socket ID.
type CPUDetails map[int]CPUInfo
// CPUTopology contains details of node CPU, where:
// CPU - logical CPU, cadvisor - thread
// Core - physical CPU, cadvisor - Core
// Socket - socket, cadvisor - Node
type CPUTopology struct {
NumCPUs int
NumCores int
NumSockets int
CPUDetails CPUDetails
}
// CPUsPerCore returns the number of logical CPUs that are associated with
// each core.
func (topo *CPUTopology) CPUsPerCore() int {
if topo.NumCores == 0 {
return 0
}
return topo.NumCPUs / topo.NumCores
}
// CPUsPerSocket returns the number of logical CPUs that are associated with
// each socket.
func (topo *CPUTopology) CPUsPerSocket() int {
if topo.NumSockets == 0 {
return 0
}
return topo.NumCPUs / topo.NumSockets
}
// CPUInfo contains the socket and core IDs associated with a CPU.
type CPUInfo struct {
SocketID int
CoreID int
}
// KeepOnly returns a new CPUDetails object with only the supplied cpus.
func (d CPUDetails) KeepOnly(cpus cpuset.CPUSet) CPUDetails {
result := CPUDetails{}
for cpu, info := range d {
if cpus.Contains(cpu) {
result[cpu] = info
}
}
return result
}
// Sockets returns all of the socket IDs associated with the CPUs in this
// CPUDetails.
func (d CPUDetails) Sockets() cpuset.CPUSet {
b := cpuset.NewBuilder()
for _, info := range d {
b.Add(info.SocketID)
}
return b.Result()
}
// CPUsInSocket returns all of the logical CPU IDs associated with the
// given socket ID in this CPUDetails.
func (d CPUDetails) CPUsInSocket(id int) cpuset.CPUSet {
b := cpuset.NewBuilder()
for cpu, info := range d {
if info.SocketID == id {
b.Add(cpu)
}
}
return b.Result()
}
// Cores returns all of the core IDs associated with the CPUs in this
// CPUDetails.
func (d CPUDetails) Cores() cpuset.CPUSet {
b := cpuset.NewBuilder()
for _, info := range d {
b.Add(info.CoreID)
}
return b.Result()
}
// CoresInSocket returns all of the core IDs associated with the given
// socket ID in this CPUDetails.
func (d CPUDetails) CoresInSocket(id int) cpuset.CPUSet {
b := cpuset.NewBuilder()
for _, info := range d {
if info.SocketID == id {
b.Add(info.CoreID)
}
}
return b.Result()
}
// CPUs returns all of the logical CPU IDs in this CPUDetails.
func (d CPUDetails) CPUs() cpuset.CPUSet {
b := cpuset.NewBuilder()
for cpuID := range d {
b.Add(cpuID)
}
return b.Result()
}
// CPUsInCore returns all of the logical CPU IDs associated with the
// given core ID in this CPUDetails.
func (d CPUDetails) CPUsInCore(id int) cpuset.CPUSet {
b := cpuset.NewBuilder()
for cpu, info := range d {
if info.CoreID == id {
b.Add(cpu)
}
}
return b.Result()
}
// Discover returns CPUTopology based on cadvisor node info
func Discover(machineInfo *cadvisorapi.MachineInfo) (*CPUTopology, error) {
if machineInfo.NumCores == 0 {
return nil, fmt.Errorf("could not detect number of cpus")
}
CPUDetails := CPUDetails{}
numCPUs := machineInfo.NumCores
numPhysicalCores := 0
var coreID int
var err error
for _, socket := range machineInfo.Topology {
numPhysicalCores += len(socket.Cores)
for _, core := range socket.Cores {
if coreID, err = getUniqueCoreID(core.Threads); err != nil {
glog.Errorf("could not get unique coreID for socket: %d core %d threads: %v",
socket.Id, core.Id, core.Threads)
return nil, err
}
for _, cpu := range core.Threads {
CPUDetails[cpu] = CPUInfo{
CoreID: coreID,
SocketID: socket.Id,
}
}
}
}
return &CPUTopology{
NumCPUs: numCPUs,
NumSockets: len(machineInfo.Topology),
NumCores: numPhysicalCores,
CPUDetails: CPUDetails,
}, nil
}
// getUniqueCoreID computes coreID as the lowest cpuID
// for a given Threads []int slice. This ensures that coreIDs are
// platform-unique (as opposed to what cAdvisor reports, which is socket-unique)
func getUniqueCoreID(threads []int) (coreID int, err error) {
err = nil
if len(threads) == 0 {
return 0, fmt.Errorf("no cpus provided")
}
if len(threads) != cpuset.NewCPUSet(threads...).Size() {
return 0, fmt.Errorf("cpus provided are not unique")
}
tmpThreads := make([]int, len(threads))
copy(tmpThreads, threads)
sort.Ints(tmpThreads)
return tmpThreads[0], err
}
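A small illustrative sketch (mine, not vendored) that feeds Discover a hand-built cadvisor MachineInfo with one socket and two hyperthreaded cores; compare the larger cases in the test file below.
package main
import (
    "fmt"

    cadvisorapi "github.com/google/cadvisor/info/v1"
    "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
)
func main() {
    machineInfo := &cadvisorapi.MachineInfo{
        NumCores: 4,
        Topology: []cadvisorapi.Node{
            {Id: 0, Cores: []cadvisorapi.Core{
                {Id: 0, Threads: []int{0, 2}},
                {Id: 1, Threads: []int{1, 3}},
            }},
        },
    }
    topo, err := topology.Discover(machineInfo)
    if err != nil {
        panic(err)
    }
    fmt.Println(topo.NumCPUs, topo.NumCores, topo.NumSockets) // 4 2 1
    fmt.Println(topo.CPUDetails.CPUsInCore(0))                // "0,2"
    fmt.Println(topo.CPUDetails.CPUsInSocket(0))              // "0-3"
}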

View File

@ -0,0 +1,201 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topology
import (
"reflect"
"testing"
cadvisorapi "github.com/google/cadvisor/info/v1"
)
func Test_Discover(t *testing.T) {
tests := []struct {
name string
args *cadvisorapi.MachineInfo
want *CPUTopology
wantErr bool
}{
{
name: "FailNumCores",
args: &cadvisorapi.MachineInfo{
NumCores: 0,
},
want: &CPUTopology{},
wantErr: true,
},
{
name: "OneSocketHT",
args: &cadvisorapi.MachineInfo{
NumCores: 8,
Topology: []cadvisorapi.Node{
{Id: 0,
Cores: []cadvisorapi.Core{
{Id: 0, Threads: []int{0, 4}},
{Id: 1, Threads: []int{1, 5}},
{Id: 2, Threads: []int{2, 6}},
{Id: 3, Threads: []int{3, 7}},
},
},
},
},
want: &CPUTopology{
NumCPUs: 8,
NumSockets: 1,
NumCores: 4,
CPUDetails: map[int]CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 0},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 0},
4: {CoreID: 0, SocketID: 0},
5: {CoreID: 1, SocketID: 0},
6: {CoreID: 2, SocketID: 0},
7: {CoreID: 3, SocketID: 0},
},
},
wantErr: false,
},
{
name: "DualSocketNoHT",
args: &cadvisorapi.MachineInfo{
NumCores: 4,
Topology: []cadvisorapi.Node{
{Id: 0,
Cores: []cadvisorapi.Core{
{Id: 0, Threads: []int{0}},
{Id: 2, Threads: []int{2}},
},
},
{Id: 1,
Cores: []cadvisorapi.Core{
{Id: 1, Threads: []int{1}},
{Id: 3, Threads: []int{3}},
},
},
},
},
want: &CPUTopology{
NumCPUs: 4,
NumSockets: 2,
NumCores: 4,
CPUDetails: map[int]CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 1},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 1},
},
},
wantErr: false,
},
{
name: "DualSocketHT - non unique Core'ID's",
args: &cadvisorapi.MachineInfo{
NumCores: 12,
Topology: []cadvisorapi.Node{
{Id: 0,
Cores: []cadvisorapi.Core{
{Id: 0, Threads: []int{0, 6}},
{Id: 1, Threads: []int{1, 7}},
{Id: 2, Threads: []int{2, 8}},
},
},
{Id: 1,
Cores: []cadvisorapi.Core{
{Id: 0, Threads: []int{3, 9}},
{Id: 1, Threads: []int{4, 10}},
{Id: 2, Threads: []int{5, 11}},
},
},
},
},
want: &CPUTopology{
NumCPUs: 12,
NumSockets: 2,
NumCores: 6,
CPUDetails: map[int]CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 0},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 1},
4: {CoreID: 4, SocketID: 1},
5: {CoreID: 5, SocketID: 1},
6: {CoreID: 0, SocketID: 0},
7: {CoreID: 1, SocketID: 0},
8: {CoreID: 2, SocketID: 0},
9: {CoreID: 3, SocketID: 1},
10: {CoreID: 4, SocketID: 1},
11: {CoreID: 5, SocketID: 1},
},
},
wantErr: false,
},
{
name: "OneSocketHT fail",
args: &cadvisorapi.MachineInfo{
NumCores: 8,
Topology: []cadvisorapi.Node{
{Id: 0,
Cores: []cadvisorapi.Core{
{Id: 0, Threads: []int{0, 4}},
{Id: 1, Threads: []int{1, 5}},
{Id: 2, Threads: []int{2, 2}}, // Wrong case - should fail here
{Id: 3, Threads: []int{3, 7}},
},
},
},
},
want: &CPUTopology{},
wantErr: true,
},
{
name: "OneSocketHT fail",
args: &cadvisorapi.MachineInfo{
NumCores: 8,
Topology: []cadvisorapi.Node{
{Id: 0,
Cores: []cadvisorapi.Core{
{Id: 0, Threads: []int{0, 4}},
{Id: 1, Threads: []int{1, 5}},
{Id: 2, Threads: []int{2, 6}},
{Id: 3, Threads: []int{}}, // Wrong case - should fail here
},
},
},
},
want: &CPUTopology{},
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := Discover(tt.args)
if err != nil {
if tt.wantErr {
t.Logf("Discover() expected error = %v", err)
} else {
t.Errorf("Discover() error = %v, wantErr %v", err, tt.wantErr)
}
return
}
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("Discover() = %v, want %v", got, tt.want)
}
})
}
}

30
vendor/k8s.io/kubernetes/pkg/kubelet/cm/cpuset/BUILD generated vendored Normal file
View File

@ -0,0 +1,30 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = ["cpuset.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/cpuset",
visibility = ["//visibility:public"],
deps = ["//vendor/github.com/golang/glog:go_default_library"],
)
go_test(
name = "go_default_test",
srcs = ["cpuset_test.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/cpuset",
library = ":go_default_library",
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -0,0 +1,5 @@
approvers:
- derekwaynecarr
- vishh
- ConnorDoyle
- sjenning

View File

@ -0,0 +1,280 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpuset
import (
"bytes"
"fmt"
"github.com/golang/glog"
"reflect"
"sort"
"strconv"
"strings"
)
// Builder is a mutable builder for CPUSet. Functions that mutate instances
// of this type are not thread-safe.
type Builder struct {
result CPUSet
done bool
}
// NewBuilder returns a mutable CPUSet builder.
func NewBuilder() Builder {
return Builder{
result: CPUSet{
elems: map[int]struct{}{},
},
}
}
// Add adds the supplied elements to the result. Calling Add after calling
// Result has no effect.
func (b Builder) Add(elems ...int) {
if b.done {
return
}
for _, elem := range elems {
b.result.elems[elem] = struct{}{}
}
}
// Result returns the result CPUSet containing all elements that were
// previously added to this builder. Subsequent calls to Add have no effect.
func (b Builder) Result() CPUSet {
b.done = true
return b.result
}
// CPUSet is a thread-safe, immutable set-like data structure for CPU IDs.
type CPUSet struct {
elems map[int]struct{}
}
// NewCPUSet returns a new CPUSet containing the supplied elements.
func NewCPUSet(cpus ...int) CPUSet {
b := NewBuilder()
for _, c := range cpus {
b.Add(c)
}
return b.Result()
}
// Size returns the number of elements in this set.
func (s CPUSet) Size() int {
return len(s.elems)
}
// IsEmpty returns true if there are zero elements in this set.
func (s CPUSet) IsEmpty() bool {
return s.Size() == 0
}
// Contains returns true if the supplied element is present in this set.
func (s CPUSet) Contains(cpu int) bool {
_, found := s.elems[cpu]
return found
}
// Equals returns true if the supplied set contains exactly the same elements
// as this set (s IsSubsetOf s2 and s2 IsSubsetOf s).
func (s CPUSet) Equals(s2 CPUSet) bool {
return reflect.DeepEqual(s.elems, s2.elems)
}
// Filter returns a new CPU set that contains all of the elements from this
// set that match the supplied predicate, without mutating the source set.
func (s CPUSet) Filter(predicate func(int) bool) CPUSet {
b := NewBuilder()
for cpu := range s.elems {
if predicate(cpu) {
b.Add(cpu)
}
}
return b.Result()
}
// FilterNot returns a new CPU set that contains all of the elements from this
// set that do not match the supplied predicate, without mutating the source
// set.
func (s CPUSet) FilterNot(predicate func(int) bool) CPUSet {
b := NewBuilder()
for cpu := range s.elems {
if !predicate(cpu) {
b.Add(cpu)
}
}
return b.Result()
}
// IsSubsetOf returns true if the supplied set contains all the elements in this set.
func (s CPUSet) IsSubsetOf(s2 CPUSet) bool {
result := true
for cpu := range s.elems {
if !s2.Contains(cpu) {
result = false
break
}
}
return result
}
// Union returns a new CPU set that contains all of the elements from this
// set and all of the elements from the supplied set, without mutating
// either source set.
func (s CPUSet) Union(s2 CPUSet) CPUSet {
b := NewBuilder()
for cpu := range s.elems {
b.Add(cpu)
}
for cpu := range s2.elems {
b.Add(cpu)
}
return b.Result()
}
// Intersection returns a new CPU set that contains all of the elements
// that are present in both this set and the supplied set, without mutating
// either source set.
func (s CPUSet) Intersection(s2 CPUSet) CPUSet {
return s.Filter(func(cpu int) bool { return s2.Contains(cpu) })
}
// Difference returns a new CPU set that contains all of the elements that
// are present in this set and not the supplied set, without mutating either
// source set.
func (s CPUSet) Difference(s2 CPUSet) CPUSet {
return s.FilterNot(func(cpu int) bool { return s2.Contains(cpu) })
}
// ToSlice returns a slice of integers that contains all elements from
// this set.
func (s CPUSet) ToSlice() []int {
result := []int{}
for cpu := range s.elems {
result = append(result, cpu)
}
sort.Ints(result)
return result
}
// String returns a new string representation of the elements in this CPU set
// in canonical linux CPU list format.
//
// See: http://man7.org/linux/man-pages/man7/cpuset.7.html#FORMATS
func (s CPUSet) String() string {
if s.IsEmpty() {
return ""
}
elems := s.ToSlice()
type rng struct {
start int
end int
}
ranges := []rng{{elems[0], elems[0]}}
for i := 1; i < len(elems); i++ {
lastRange := &ranges[len(ranges)-1]
// if this element is adjacent to the high end of the last range
if elems[i] == lastRange.end+1 {
// then extend the last range to include this element
lastRange.end = elems[i]
continue
}
// otherwise, start a new range beginning with this element
ranges = append(ranges, rng{elems[i], elems[i]})
}
// construct string from ranges
var result bytes.Buffer
for _, r := range ranges {
if r.start == r.end {
result.WriteString(strconv.Itoa(r.start))
} else {
result.WriteString(fmt.Sprintf("%d-%d", r.start, r.end))
}
result.WriteString(",")
}
return strings.TrimRight(result.String(), ",")
}
// MustParse constructs a new CPUSet from a Linux CPU list formatted string.
// Unlike Parse, it does not return an error; it fails fatally (via glog.Fatalf)
// if the input cannot be used to construct a CPU set.
func MustParse(s string) CPUSet {
res, err := Parse(s)
if err != nil {
glog.Fatalf("unable to parse [%s] as CPUSet: %v", s, err)
}
return res
}
// Parse constructs a new CPUSet from a Linux CPU list formatted string.
//
// See: http://man7.org/linux/man-pages/man7/cpuset.7.html#FORMATS
func Parse(s string) (CPUSet, error) {
b := NewBuilder()
// Handle empty string.
if s == "" {
return b.Result(), nil
}
// Split CPU list string:
// "0-5,34,46-48 => ["0-5", "34", "46-48"]
ranges := strings.Split(s, ",")
for _, r := range ranges {
boundaries := strings.Split(r, "-")
if len(boundaries) == 1 {
// Handle ranges that consist of only one element like "34".
elem, err := strconv.Atoi(boundaries[0])
if err != nil {
return NewCPUSet(), err
}
b.Add(elem)
} else if len(boundaries) == 2 {
// Handle multi-element ranges like "0-5".
start, err := strconv.Atoi(boundaries[0])
if err != nil {
return NewCPUSet(), err
}
end, err := strconv.Atoi(boundaries[1])
if err != nil {
return NewCPUSet(), err
}
// Add all elements to the result.
// e.g. "0-5", "46-48" => [0, 1, 2, 3, 4, 5, 46, 47, 48].
for e := start; e <= end; e++ {
b.Add(e)
}
}
}
return b.Result(), nil
}
// Clone returns a copy of this CPU set.
func (s CPUSet) Clone() CPUSet {
b := NewBuilder()
for elem := range s.elems {
b.Add(elem)
}
return b.Result()
}
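A quick sketch (added for illustration, not part of the vendored package) exercising the CPUSet API above: builder construction, MustParse, set algebra, and the canonical Linux CPU list formatting.
package main
import (
    "fmt"

    "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
func main() {
    // Build a set incrementally.
    b := cpuset.NewBuilder()
    b.Add(0, 1, 2, 3)
    reserved := b.Result()
    // Parse the canonical Linux CPU list format.
    all := cpuset.MustParse("0-7")
    shared := all.Difference(reserved)
    fmt.Println(shared.String())                   // "4-7"
    fmt.Println(reserved.IsSubsetOf(all))          // true
    fmt.Println(all.Intersection(reserved))        // "0-3"
    fmt.Println(cpuset.NewCPUSet(5).Union(shared)) // "4-7" (5 already present)
}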

View File

@ -0,0 +1,324 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpuset
import (
"reflect"
"testing"
)
func TestCPUSetBuilder(t *testing.T) {
b := NewBuilder()
elems := []int{1, 2, 3, 4, 5}
for _, elem := range elems {
b.Add(elem)
}
result := b.Result()
for _, elem := range elems {
if !result.Contains(elem) {
t.Fatalf("expected cpuset to contain element %d: [%v]", elem, result)
}
}
if len(elems) != result.Size() {
t.Fatalf("expected cpuset %s to have the same size as %v", result, elems)
}
}
func TestCPUSetSize(t *testing.T) {
testCases := []struct {
cpuset CPUSet
expected int
}{
{NewCPUSet(), 0},
{NewCPUSet(5), 1},
{NewCPUSet(1, 2, 3, 4, 5), 5},
}
for _, c := range testCases {
actual := c.cpuset.Size()
if actual != c.expected {
t.Fatalf("expected: %d, actual: %d, cpuset: [%v]", c.expected, actual, c.cpuset)
}
}
}
func TestCPUSetIsEmpty(t *testing.T) {
testCases := []struct {
cpuset CPUSet
expected bool
}{
{NewCPUSet(), true},
{NewCPUSet(5), false},
{NewCPUSet(1, 2, 3, 4, 5), false},
}
for _, c := range testCases {
actual := c.cpuset.IsEmpty()
if actual != c.expected {
t.Fatalf("expected: %t, IsEmpty() returned: %t, cpuset: [%v]", c.expected, actual, c.cpuset)
}
}
}
func TestCPUSetContains(t *testing.T) {
testCases := []struct {
cpuset CPUSet
mustContain []int
mustNotContain []int
}{
{NewCPUSet(), []int{}, []int{1, 2, 3, 4, 5}},
{NewCPUSet(5), []int{5}, []int{1, 2, 3, 4}},
{NewCPUSet(1, 2, 4, 5), []int{1, 2, 4, 5}, []int{0, 3, 6}},
}
for _, c := range testCases {
for _, elem := range c.mustContain {
if !c.cpuset.Contains(elem) {
t.Fatalf("expected cpuset to contain element %d: [%v]", elem, c.cpuset)
}
}
for _, elem := range c.mustNotContain {
if c.cpuset.Contains(elem) {
t.Fatalf("expected cpuset not to contain element %d: [%v]", elem, c.cpuset)
}
}
}
}
func TestCPUSetEqual(t *testing.T) {
shouldEqual := []struct {
s1 CPUSet
s2 CPUSet
}{
{NewCPUSet(), NewCPUSet()},
{NewCPUSet(5), NewCPUSet(5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
}
shouldNotEqual := []struct {
s1 CPUSet
s2 CPUSet
}{
{NewCPUSet(), NewCPUSet(5)},
{NewCPUSet(5), NewCPUSet()},
{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5)},
}
for _, c := range shouldEqual {
if !c.s1.Equals(c.s2) {
t.Fatalf("expected cpusets to be equal: s1: [%v], s2: [%v]", c.s1, c.s2)
}
}
for _, c := range shouldNotEqual {
if c.s1.Equals(c.s2) {
t.Fatalf("expected cpusets to not be equal: s1: [%v], s2: [%v]", c.s1, c.s2)
}
}
}
func TestCPUSetIsSubsetOf(t *testing.T) {
shouldBeSubset := []struct {
s1 CPUSet
s2 CPUSet
}{
// A set is a subset of itself
{NewCPUSet(), NewCPUSet()},
{NewCPUSet(5), NewCPUSet(5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
// Empty set is a subset of every set
{NewCPUSet(), NewCPUSet(5)},
{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2, 3), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(4, 5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(2, 3), NewCPUSet(1, 2, 3, 4, 5)},
}
shouldNotBeSubset := []struct {
s1 CPUSet
s2 CPUSet
}{}
for _, c := range shouldBeSubset {
if !c.s1.IsSubsetOf(c.s2) {
t.Fatalf("expected s1 to be a subset of s2: s1: [%v], s2: [%v]", c.s1, c.s2)
}
}
for _, c := range shouldNotBeSubset {
if c.s1.IsSubsetOf(c.s2) {
t.Fatalf("expected s1 to not be a subset of s2: s1: [%v], s2: [%v]", c.s1, c.s2)
}
}
}
func TestCPUSetUnion(t *testing.T) {
testCases := []struct {
s1 CPUSet
s2 CPUSet
expected CPUSet
}{
{NewCPUSet(), NewCPUSet(), NewCPUSet()},
{NewCPUSet(), NewCPUSet(5), NewCPUSet(5)},
{NewCPUSet(5), NewCPUSet(), NewCPUSet(5)},
{NewCPUSet(5), NewCPUSet(5), NewCPUSet(5)},
{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2), NewCPUSet(3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2, 3), NewCPUSet(3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
}
for _, c := range testCases {
result := c.s1.Union(c.s2)
if !result.Equals(c.expected) {
t.Fatalf("expected the union of s1 and s2 to be [%v] (got [%v]), s1: [%v], s2: [%v]", c.expected, result, c.s1, c.s2)
}
}
}
func TestCPUSetIntersection(t *testing.T) {
testCases := []struct {
s1 CPUSet
s2 CPUSet
expected CPUSet
}{
{NewCPUSet(), NewCPUSet(), NewCPUSet()},
{NewCPUSet(), NewCPUSet(5), NewCPUSet()},
{NewCPUSet(5), NewCPUSet(), NewCPUSet()},
{NewCPUSet(5), NewCPUSet(5), NewCPUSet(5)},
{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(), NewCPUSet()},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5), NewCPUSet(5)},
{NewCPUSet(1, 2), NewCPUSet(3, 4, 5), NewCPUSet()},
{NewCPUSet(1, 2, 3), NewCPUSet(3, 4, 5), NewCPUSet(3)},
}
for _, c := range testCases {
result := c.s1.Intersection(c.s2)
if !result.Equals(c.expected) {
t.Fatalf("expected the intersection of s1 and s2 to be [%v] (got [%v]), s1: [%v], s2: [%v]", c.expected, result, c.s1, c.s2)
}
}
}
func TestCPUSetDifference(t *testing.T) {
testCases := []struct {
s1 CPUSet
s2 CPUSet
expected CPUSet
}{
{NewCPUSet(), NewCPUSet(), NewCPUSet()},
{NewCPUSet(), NewCPUSet(5), NewCPUSet()},
{NewCPUSet(5), NewCPUSet(), NewCPUSet(5)},
{NewCPUSet(5), NewCPUSet(5), NewCPUSet()},
{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5)},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5), NewCPUSet(1, 2, 3, 4)},
{NewCPUSet(1, 2), NewCPUSet(3, 4, 5), NewCPUSet(1, 2)},
{NewCPUSet(1, 2, 3), NewCPUSet(3, 4, 5), NewCPUSet(1, 2)},
}
for _, c := range testCases {
result := c.s1.Difference(c.s2)
if !result.Equals(c.expected) {
t.Fatalf("expected the difference of s1 and s2 to be [%v] (got [%v]), s1: [%v], s2: [%v]", c.expected, result, c.s1, c.s2)
}
}
}
func TestCPUSetToSlice(t *testing.T) {
testCases := []struct {
set CPUSet
expected []int
}{
{NewCPUSet(), []int{}},
{NewCPUSet(5), []int{5}},
{NewCPUSet(1, 2, 3, 4, 5), []int{1, 2, 3, 4, 5}},
}
for _, c := range testCases {
result := c.set.ToSlice()
if !reflect.DeepEqual(result, c.expected) {
t.Fatalf("expected set as slice to be [%v] (got [%v]), s: [%v]", c.expected, result, c.set)
}
}
}
func TestCPUSetString(t *testing.T) {
testCases := []struct {
set CPUSet
expected string
}{
{NewCPUSet(), ""},
{NewCPUSet(5), "5"},
{NewCPUSet(1, 2, 3, 4, 5), "1-5"},
{NewCPUSet(1, 2, 3, 5, 6, 8), "1-3,5-6,8"},
}
for _, c := range testCases {
result := c.set.String()
if result != c.expected {
t.Fatalf("expected set as string to be %s (got \"%s\"), s: [%v]", c.expected, result, c.set)
}
}
}
func TestParse(t *testing.T) {
testCases := []struct {
cpusetString string
expected CPUSet
}{
{"", NewCPUSet()},
{"5", NewCPUSet(5)},
{"1,2,3,4,5", NewCPUSet(1, 2, 3, 4, 5)},
{"1-5", NewCPUSet(1, 2, 3, 4, 5)},
{"1-2,3-5", NewCPUSet(1, 2, 3, 4, 5)},
}
for _, c := range testCases {
result, err := Parse(c.cpusetString)
if err != nil {
t.Fatalf("expected error not to have occurred: %v", err)
}
if !result.Equals(c.expected) {
t.Fatalf("expected string \"%s\" to parse as [%v] (got [%v])", c.cpusetString, c.expected, result)
}
}
}

View File

@ -0,0 +1,70 @@
package(default_visibility = ["//visibility:public"])
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
"go_test",
)
go_library(
name = "go_default_library",
srcs = [
"device_plugin_stub.go",
"endpoint.go",
"manager.go",
"manager_stub.go",
"pod_devices.go",
"types.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/deviceplugin",
deps = [
"//pkg/apis/core/v1/helper:go_default_library",
"//pkg/kubelet/apis/deviceplugin/v1alpha:go_default_library",
"//pkg/kubelet/config:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//plugin/pkg/scheduler/schedulercache:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/golang.org/x/net/context:go_default_library",
"//vendor/google.golang.org/grpc:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
)
go_test(
name = "go_default_test",
srcs = [
"endpoint_test.go",
"manager_test.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/deviceplugin",
library = ":go_default_library",
deps = [
"//pkg/kubelet/apis/deviceplugin/v1alpha:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//plugin/pkg/scheduler/schedulercache:go_default_library",
"//vendor/github.com/stretchr/testify/assert:go_default_library",
"//vendor/github.com/stretchr/testify/require:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
],
)

View File

@ -0,0 +1,6 @@
approvers:
- jiayingz
- vishh
reviewers:
- mindprince
- RenaudWasTaken

View File

@ -0,0 +1,158 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"log"
"net"
"os"
"path"
"time"
"golang.org/x/net/context"
"google.golang.org/grpc"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
)
// Stub implementation for DevicePlugin.
type Stub struct {
devs []*pluginapi.Device
socket string
stop chan interface{}
update chan []*pluginapi.Device
server *grpc.Server
}
// NewDevicePluginStub returns an initialized DevicePlugin Stub.
func NewDevicePluginStub(devs []*pluginapi.Device, socket string) *Stub {
return &Stub{
devs: devs,
socket: socket,
stop: make(chan interface{}),
update: make(chan []*pluginapi.Device),
}
}
// Start starts the gRPC server of the device plugin
func (m *Stub) Start() error {
err := m.cleanup()
if err != nil {
return err
}
sock, err := net.Listen("unix", m.socket)
if err != nil {
return err
}
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterDevicePluginServer(m.server, m)
go m.server.Serve(sock)
// Wait until the gRPC server is ready.
for i := 0; i < 10; i++ {
services := m.server.GetServiceInfo()
if len(services) > 1 {
break
}
time.Sleep(1 * time.Second)
}
log.Println("Starting to serve on", m.socket)
return nil
}
// Stop stops the gRPC server
func (m *Stub) Stop() error {
m.server.Stop()
close(m.stop)
return m.cleanup()
}
// Register registers the device plugin for the given resourceName with Kubelet.
func (m *Stub) Register(kubeletEndpoint, resourceName string) error {
conn, err := grpc.Dial(kubeletEndpoint, grpc.WithInsecure(),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
}))
if err != nil {
return err
}
defer conn.Close()
client := pluginapi.NewRegistrationClient(conn)
reqt := &pluginapi.RegisterRequest{
Version: pluginapi.Version,
Endpoint: path.Base(m.socket),
ResourceName: resourceName,
}
_, err = client.Register(context.Background(), reqt)
if err != nil {
return err
}
return nil
}
// ListAndWatch lists devices and updates that list according to the Update call
func (m *Stub) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
log.Println("ListAndWatch")
var devs []*pluginapi.Device
for _, d := range m.devs {
devs = append(devs, &pluginapi.Device{
ID: d.ID,
Health: pluginapi.Healthy,
})
}
s.Send(&pluginapi.ListAndWatchResponse{Devices: devs})
for {
select {
case <-m.stop:
return nil
case updated := <-m.update:
s.Send(&pluginapi.ListAndWatchResponse{Devices: updated})
}
}
}
// Update allows the device plugin to send new devices through ListAndWatch
func (m *Stub) Update(devs []*pluginapi.Device) {
m.update <- devs
}
// Allocate does a mock allocation
func (m *Stub) Allocate(ctx context.Context, r *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
log.Printf("Allocate, %+v", r)
var response pluginapi.AllocateResponse
return &response, nil
}
func (m *Stub) cleanup() error {
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}

View File

@ -0,0 +1,198 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"fmt"
"net"
"sync"
"time"
"github.com/golang/glog"
"golang.org/x/net/context"
"google.golang.org/grpc"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
)
// endpoint maps to a single registered device plugin. It is responsible
// for managing gRPC communications with the device plugin and caching
// device states reported by the device plugin.
type endpoint interface {
run()
stop()
allocate(devs []string) (*pluginapi.AllocateResponse, error)
getDevices() []pluginapi.Device
callback(resourceName string, added, updated, deleted []pluginapi.Device)
}
type endpointImpl struct {
client pluginapi.DevicePluginClient
clientConn *grpc.ClientConn
socketPath string
resourceName string
devices map[string]pluginapi.Device
mutex sync.Mutex
cb monitorCallback
}
// newEndpointImpl creates a new endpoint for the given resourceName.
func newEndpointImpl(socketPath, resourceName string, devices map[string]pluginapi.Device, callback monitorCallback) (*endpointImpl, error) {
client, c, err := dial(socketPath)
if err != nil {
glog.Errorf("Can't create new endpoint with path %s err %v", socketPath, err)
return nil, err
}
return &endpointImpl{
client: client,
clientConn: c,
socketPath: socketPath,
resourceName: resourceName,
devices: devices,
cb: callback,
}, nil
}
func (e *endpointImpl) callback(resourceName string, added, updated, deleted []pluginapi.Device) {
e.cb(resourceName, added, updated, deleted)
}
func (e *endpointImpl) getDevices() []pluginapi.Device {
e.mutex.Lock()
defer e.mutex.Unlock()
var devs []pluginapi.Device
for _, d := range e.devices {
devs = append(devs, d)
}
return devs
}
// run initializes the ListAndWatch gRPC call for the device plugin and
// blocks on receiving ListAndWatch gRPC stream updates. Each ListAndWatch
// stream update contains a new list of device states. run compares the new
// device states with its cached states to get the list of new, updated, and deleted devices.
// It then issues a callback to pass this information to the device manager, which
// will adjust the available resource information accordingly.
func (e *endpointImpl) run() {
stream, err := e.client.ListAndWatch(context.Background(), &pluginapi.Empty{})
if err != nil {
glog.Errorf(errListAndWatch, e.resourceName, err)
return
}
devices := make(map[string]pluginapi.Device)
e.mutex.Lock()
for _, d := range e.devices {
devices[d.ID] = d
}
e.mutex.Unlock()
for {
response, err := stream.Recv()
if err != nil {
glog.Errorf(errListAndWatch, e.resourceName, err)
return
}
devs := response.Devices
glog.V(2).Infof("State pushed for device plugin %s", e.resourceName)
newDevs := make(map[string]*pluginapi.Device)
var added, updated []pluginapi.Device
for _, d := range devs {
dOld, ok := devices[d.ID]
newDevs[d.ID] = d
if !ok {
glog.V(2).Infof("New device for Endpoint %s: %v", e.resourceName, d)
devices[d.ID] = *d
added = append(added, *d)
continue
}
if d.Health == dOld.Health {
continue
}
if d.Health == pluginapi.Unhealthy {
glog.Errorf("Device %s is now Unhealthy", d.ID)
} else if d.Health == pluginapi.Healthy {
glog.V(2).Infof("Device %s is now Healthy", d.ID)
}
devices[d.ID] = *d
updated = append(updated, *d)
}
var deleted []pluginapi.Device
for id, d := range devices {
if _, ok := newDevs[id]; ok {
continue
}
glog.Errorf("Device %s was deleted", d.ID)
deleted = append(deleted, d)
delete(devices, id)
}
e.mutex.Lock()
e.devices = devices
e.mutex.Unlock()
e.callback(e.resourceName, added, updated, deleted)
}
}
// allocate issues Allocate gRPC call to the device plugin.
func (e *endpointImpl) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
return e.client.Allocate(context.Background(), &pluginapi.AllocateRequest{
DevicesIDs: devs,
})
}
func (e *endpointImpl) stop() {
e.clientConn.Close()
}
// dial establishes the gRPC communication with the registered device plugin.
func dial(unixSocketPath string) (pluginapi.DevicePluginClient, *grpc.ClientConn, error) {
c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
}),
)
if err != nil {
return nil, nil, fmt.Errorf(errFailedToDialDevicePlugin+" %v", err)
}
return pluginapi.NewDevicePluginClient(c), c, nil
}
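To show how an endpoint is wired to a callback, here is a hypothetical package-internal helper (it has to live in package deviceplugin because newEndpointImpl and run are unexported); the logging callback is purely illustrative.
package deviceplugin
import (
    "github.com/golang/glog"

    pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
)
// runLoggingEndpoint is an illustrative helper, not part of the vendored code.
// It connects to a plugin socket and logs every device diff reported by run.
func runLoggingEndpoint(socketPath, resourceName string) error {
    logChanges := func(name string, added, updated, deleted []pluginapi.Device) {
        glog.Infof("%s: %d added, %d updated, %d deleted", name, len(added), len(updated), len(deleted))
    }
    e, err := newEndpointImpl(socketPath, resourceName, map[string]pluginapi.Device{}, logChanges)
    if err != nil {
        return err
    }
    defer e.stop()
    // run blocks, receiving ListAndWatch updates and invoking the callback
    // with the diff against the endpoint's cached device states.
    e.run()
    return nil
}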

View File

@ -0,0 +1,114 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"path"
"testing"
"time"
"github.com/stretchr/testify/require"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
)
var (
esocketName = "mock.sock"
)
func TestNewEndpoint(t *testing.T) {
socket := path.Join("/tmp", esocketName)
devs := []*pluginapi.Device{
{ID: "ADeviceId", Health: pluginapi.Healthy},
}
p, e := esetup(t, devs, socket, "mock", func(n string, a, u, r []pluginapi.Device) {})
defer ecleanup(t, p, e)
}
func TestRun(t *testing.T) {
socket := path.Join("/tmp", esocketName)
devs := []*pluginapi.Device{
{ID: "ADeviceId", Health: pluginapi.Healthy},
{ID: "AnotherDeviceId", Health: pluginapi.Healthy},
}
updated := []*pluginapi.Device{
{ID: "ADeviceId", Health: pluginapi.Unhealthy},
{ID: "AThirdDeviceId", Health: pluginapi.Healthy},
}
p, e := esetup(t, devs, socket, "mock", func(n string, a, u, r []pluginapi.Device) {
require.Len(t, a, 1)
require.Len(t, u, 1)
require.Len(t, r, 1)
require.Equal(t, a[0].ID, updated[1].ID)
require.Equal(t, u[0].ID, updated[0].ID)
require.Equal(t, u[0].Health, updated[0].Health)
require.Equal(t, r[0].ID, devs[1].ID)
})
defer ecleanup(t, p, e)
go e.run()
p.Update(updated)
time.Sleep(time.Second)
e.mutex.Lock()
defer e.mutex.Unlock()
require.Len(t, e.devices, 2)
for _, dref := range updated {
d, ok := e.devices[dref.ID]
require.True(t, ok)
require.Equal(t, d.ID, dref.ID)
require.Equal(t, d.Health, dref.Health)
}
}
func TestGetDevices(t *testing.T) {
e := endpointImpl{
devices: map[string]pluginapi.Device{
"ADeviceId": {ID: "ADeviceId", Health: pluginapi.Healthy},
},
}
devs := e.getDevices()
require.Len(t, devs, 1)
}
func esetup(t *testing.T, devs []*pluginapi.Device, socket, resourceName string, callback monitorCallback) (*Stub, *endpointImpl) {
p := NewDevicePluginStub(devs, socket)
err := p.Start()
require.NoError(t, err)
e, err := newEndpointImpl(socket, "mock", make(map[string]pluginapi.Device), func(n string, a, u, r []pluginapi.Device) {})
require.NoError(t, err)
return p, e
}
func ecleanup(t *testing.T, p *Stub, e *endpointImpl) {
p.Stop()
e.stop()
}

View File

@ -0,0 +1,646 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"encoding/json"
"fmt"
"io/ioutil"
"net"
"os"
"path/filepath"
"sync"
"time"
"github.com/golang/glog"
"golang.org/x/net/context"
"google.golang.org/grpc"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
// monitorCallback is the function called when a device's health state changes,
// or new devices are reported, or old devices are deleted.
// Updated contains the most recent state of the Device.
type monitorCallback func(resourceName string, added, updated, deleted []pluginapi.Device)
// ManagerImpl is the structure in charge of managing Device Plugins.
type ManagerImpl struct {
socketname string
socketdir string
endpoints map[string]endpoint // Key is ResourceName
mutex sync.Mutex
server *grpc.Server
// activePods is a method for listing active pods on the node
// so the amount of pluginResources requested by existing pods
// could be counted when updating allocated devices
activePods ActivePodsFunc
// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
// We use it to determine when we can purge inactive pods from checkpointed state.
sourcesReady config.SourcesReady
// callback is used for updating devices' states in a single call.
// e.g. a new device is advertised, two old devices are deleted and a running device fails.
callback monitorCallback
// allDevices contains all registered resourceNames and their exported device IDs.
allDevices map[string]sets.String
// allocatedDevices contains allocated deviceIds, keyed by resourceName.
allocatedDevices map[string]sets.String
// podDevices contains pod to allocated device mapping.
podDevices podDevices
}
type sourcesReadyStub struct{}
func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// NewManagerImpl creates a new manager.
func NewManagerImpl() (*ManagerImpl, error) {
return newManagerImpl(pluginapi.KubeletSocket)
}
func newManagerImpl(socketPath string) (*ManagerImpl, error) {
glog.V(2).Infof("Creating Device Plugin manager at %s", socketPath)
if socketPath == "" || !filepath.IsAbs(socketPath) {
return nil, fmt.Errorf(errBadSocket+" %v", socketPath)
}
dir, file := filepath.Split(socketPath)
manager := &ManagerImpl{
endpoints: make(map[string]endpoint),
socketname: file,
socketdir: dir,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
manager.callback = manager.genericDeviceUpdateCallback
// The following structs are populated with real implementations in manager.Start().
// Before that, initialize them to perform no-op operations.
manager.activePods = func() []*v1.Pod { return []*v1.Pod{} }
manager.sourcesReady = &sourcesReadyStub{}
return manager, nil
}
func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, added, updated, deleted []pluginapi.Device) {
kept := append(updated, added...)
m.mutex.Lock()
if _, ok := m.allDevices[resourceName]; !ok {
m.allDevices[resourceName] = sets.NewString()
}
// For now, Manager only keeps track of healthy devices.
// TODO: add support to track unhealthy devices.
for _, dev := range kept {
if dev.Health == pluginapi.Healthy {
m.allDevices[resourceName].Insert(dev.ID)
} else {
m.allDevices[resourceName].Delete(dev.ID)
}
}
for _, dev := range deleted {
m.allDevices[resourceName].Delete(dev.ID)
}
m.mutex.Unlock()
m.writeCheckpoint()
}
func (m *ManagerImpl) removeContents(dir string) error {
d, err := os.Open(dir)
if err != nil {
return err
}
defer d.Close()
names, err := d.Readdirnames(-1)
if err != nil {
return err
}
for _, name := range names {
filePath := filepath.Join(dir, name)
if filePath == m.checkpointFile() {
continue
}
stat, err := os.Stat(filePath)
if err != nil {
glog.Errorf("Failed to stat file %v: %v", filePath, err)
continue
}
if stat.IsDir() {
continue
}
err = os.RemoveAll(filePath)
if err != nil {
return err
}
}
return nil
}
const (
// kubeletDevicePluginCheckpoint is the file name of device plugin checkpoint
kubeletDevicePluginCheckpoint = "kubelet_internal_checkpoint"
)
// checkpointFile returns device plugin checkpoint file path.
func (m *ManagerImpl) checkpointFile() string {
return filepath.Join(m.socketdir, kubeletDevicePluginCheckpoint)
}
// Start starts the Device Plugin Manager and initializes
// podDevices and allocatedDevices information from checkpointed state, then
// starts the device plugin registration service.
func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
glog.V(2).Infof("Starting Device Plugin manager")
m.activePods = activePods
m.sourcesReady = sourcesReady
// Loads in allocatedDevices information from disk.
err := m.readCheckpoint()
if err != nil {
glog.Warningf("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date. Err: %v", err)
}
socketPath := filepath.Join(m.socketdir, m.socketname)
os.MkdirAll(m.socketdir, 0755)
// Removes all stale sockets in m.socketdir. Device plugins can monitor
// this and use it as a signal to re-register with the new Kubelet.
if err := m.removeContents(m.socketdir); err != nil {
glog.Errorf("Fail to clean up stale contents under %s: %+v", m.socketdir, err)
}
s, err := net.Listen("unix", socketPath)
if err != nil {
glog.Errorf(errListenSocket+" %+v", err)
return err
}
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterRegistrationServer(m.server, m)
go m.server.Serve(s)
glog.V(2).Infof("Serving device plugin registration server on %q", socketPath)
return nil
}
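// Illustrative wiring sketch (not part of the upstream code): a caller such as
// the kubelet container manager would typically construct and start the
// manager roughly as follows, with activePodsFunc and sourcesReady assumed to
// be supplied by that caller:
//
//	mgr, err := NewManagerImpl()
//	if err != nil {
//		return err
//	}
//	if err := mgr.Start(activePodsFunc, sourcesReady); err != nil {
//		return err
//	}
//
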
// Devices returns the map of devices known to the Device
// Plugin manager, keyed by resource name.
func (m *ManagerImpl) Devices() map[string][]pluginapi.Device {
m.mutex.Lock()
defer m.mutex.Unlock()
devs := make(map[string][]pluginapi.Device)
for k, e := range m.endpoints {
glog.V(3).Infof("Endpoint: %+v: %p", k, e)
devs[k] = e.getDevices()
}
return devs
}
// Allocate is the call that you can use to allocate a set of devices
// from the registered device plugins.
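// Devices granted to init containers are recorded in devicesToReuse and handed
// back to the regular containers of the same pod, so an init container request
// does not permanently consume extra devices.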
func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
pod := attrs.Pod
devicesToReuse := make(map[string]sets.String)
// TODO: Reuse devices between init containers and regular containers.
for _, container := range pod.Spec.InitContainers {
if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
return err
}
m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
}
for _, container := range pod.Spec.Containers {
if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
return err
}
m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
}
m.mutex.Lock()
defer m.mutex.Unlock()
// quick return if no pluginResources requested
if _, podRequireDevicePluginResource := m.podDevices[string(pod.UID)]; !podRequireDevicePluginResource {
return nil
}
m.sanitizeNodeAllocatable(node)
return nil
}
// Register registers a device plugin.
func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
glog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
if r.Version != pluginapi.Version {
errorString := fmt.Sprintf(errUnsuportedVersion, r.Version, pluginapi.Version)
glog.Infof("Bad registration request from device plugin with resource name %q: %v", r.ResourceName, errorString)
return &pluginapi.Empty{}, fmt.Errorf(errorString)
}
if !v1helper.IsExtendedResourceName(v1.ResourceName(r.ResourceName)) {
errorString := fmt.Sprintf(errInvalidResourceName, r.ResourceName)
glog.Infof("Bad registration request from device plugin: %v", errorString)
return &pluginapi.Empty{}, fmt.Errorf(errorString)
}
// TODO: for now, always accept the newest device plugin. Later we may consider
// adding policies here, e.g., verifying whether an old device plugin with the
// same resource name is still alive to determine whether we want to accept
// the new registration.
go m.addEndpoint(r)
return &pluginapi.Empty{}, nil
}
// Stop stops all registered device plugin endpoints and the gRPC registration server.
func (m *ManagerImpl) Stop() error {
m.mutex.Lock()
defer m.mutex.Unlock()
for _, e := range m.endpoints {
e.stop()
}
m.server.Stop()
return nil
}
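// addEndpoint dials the device plugin listening at r.Endpoint, seeds the new
// endpoint with the devices of any previously registered endpoint for the same
// resource name, installs it in m.endpoints and runs it in a goroutine; when
// the endpoint's run loop ends, the endpoint is removed from m.endpoints.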
func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
existingDevs := make(map[string]pluginapi.Device)
m.mutex.Lock()
old, ok := m.endpoints[r.ResourceName]
if ok && old != nil {
// Pass devices of previous endpoint into re-registered one,
// to avoid potential orphaned devices upon re-registration
devices := make(map[string]pluginapi.Device)
for _, device := range old.getDevices() {
devices[device.ID] = device
}
existingDevs = devices
}
m.mutex.Unlock()
socketPath := filepath.Join(m.socketdir, r.Endpoint)
e, err := newEndpointImpl(socketPath, r.ResourceName, existingDevs, m.callback)
if err != nil {
glog.Errorf("Failed to dial device plugin with request %v: %v", r, err)
return
}
m.mutex.Lock()
// Check for potential re-registration during the initialization of new endpoint,
// and skip updating if re-registration happens.
// TODO: simplify this part once we have a better way to handle registered devices
ext := m.endpoints[r.ResourceName]
if ext != old {
glog.Warningf("Some other endpoint %v is added while endpoint %v is initialized", ext, e)
m.mutex.Unlock()
e.stop()
return
}
// Associates the newly created endpoint with the corresponding resource name.
// Stops existing endpoint if there is any.
m.endpoints[r.ResourceName] = e
glog.V(2).Infof("Registered endpoint %v", e)
m.mutex.Unlock()
if old != nil {
old.stop()
}
go func() {
e.run()
e.stop()
m.mutex.Lock()
if old, ok := m.endpoints[r.ResourceName]; ok && old == e {
glog.V(2).Infof("Delete resource for endpoint %v", e)
delete(m.endpoints, r.ResourceName)
}
glog.V(2).Infof("Unregistered endpoint %v", e)
m.mutex.Unlock()
}()
}
// GetCapacity is expected to be called when Kubelet updates its node status.
// The first returned variable contains the registered device plugin resource capacity.
// The second returned variable contains previously registered resources that are no longer active.
// Kubelet uses this information to update resource capacity/allocatable in its node status.
// After the call, device plugin can remove the inactive resources from its internal list as the
// change is already reflected in Kubelet node status.
// Note that in the special case after Kubelet restarts, device plugin resource capacities can
// temporarily drop to zero till corresponding device plugins re-register. This is OK because
// cm.UpdatePluginResource(), which runs during predicate Admit, guarantees we adjust nodeinfo
// capacity for already allocated pods so that they can continue to run. However, new pods
// requiring device plugin resources will not be scheduled till device plugin re-registers.
func (m *ManagerImpl) GetCapacity() (v1.ResourceList, []string) {
needsUpdateCheckpoint := false
var capacity = v1.ResourceList{}
var deletedResources []string
m.mutex.Lock()
for resourceName, devices := range m.allDevices {
if _, ok := m.endpoints[resourceName]; !ok {
delete(m.allDevices, resourceName)
deletedResources = append(deletedResources, resourceName)
needsUpdateCheckpoint = true
} else {
capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
}
}
m.mutex.Unlock()
if needsUpdateCheckpoint {
m.writeCheckpoint()
}
return capacity, deletedResources
}
// checkpointData struct is used to store pod to device allocation information
// and registered device information in a checkpoint file.
// TODO: add version control when we need to change checkpoint format.
type checkpointData struct {
PodDeviceEntries []podDevicesCheckpointEntry
RegisteredDevices map[string][]string
}
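// An illustrative checkpoint (hypothetical values) as serialized by
// writeCheckpoint below; AllocResp holds the marshalled AllocateResponse and
// is base64-encoded by encoding/json:
//
//	{
//	  "PodDeviceEntries": [
//	    {
//	      "PodUID": "pod-uid-1",
//	      "ContainerName": "con1",
//	      "ResourceName": "vendor.example.com/gpu",
//	      "DeviceIDs": ["dev1", "dev2"],
//	      "AllocResp": "..."
//	    }
//	  ],
//	  "RegisteredDevices": {
//	    "vendor.example.com/gpu": ["dev1", "dev2", "dev3"]
//	  }
//	}
//
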
// Checkpoints device to container allocation information to disk.
func (m *ManagerImpl) writeCheckpoint() error {
m.mutex.Lock()
data := checkpointData{
PodDeviceEntries: m.podDevices.toCheckpointData(),
RegisteredDevices: make(map[string][]string),
}
for resource, devices := range m.allDevices {
data.RegisteredDevices[resource] = devices.UnsortedList()
}
m.mutex.Unlock()
dataJSON, err := json.Marshal(data)
if err != nil {
return err
}
filepath := m.checkpointFile()
return ioutil.WriteFile(filepath, dataJSON, 0644)
}
// Reads device to container allocation information from disk, and populates
// m.allocatedDevices accordingly.
func (m *ManagerImpl) readCheckpoint() error {
filepath := m.checkpointFile()
content, err := ioutil.ReadFile(filepath)
if err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to read checkpoint file %q: %v", filepath, err)
}
glog.V(2).Infof("Read checkpoint file %s\n", filepath)
var data checkpointData
if err := json.Unmarshal(content, &data); err != nil {
return fmt.Errorf("failed to unmarshal checkpoint data: %v", err)
}
m.mutex.Lock()
defer m.mutex.Unlock()
m.podDevices.fromCheckpointData(data.PodDeviceEntries)
m.allocatedDevices = m.podDevices.devices()
for resource, devices := range data.RegisteredDevices {
m.allDevices[resource] = sets.NewString()
for _, dev := range devices {
m.allDevices[resource].Insert(dev)
}
}
return nil
}
// updateAllocatedDevices gets a list of active pods and then frees any Devices that are bound to
// terminated pods.
func (m *ManagerImpl) updateAllocatedDevices(activePods []*v1.Pod) {
if !m.sourcesReady.AllReady() {
return
}
m.mutex.Lock()
defer m.mutex.Unlock()
activePodUids := sets.NewString()
for _, pod := range activePods {
activePodUids.Insert(string(pod.UID))
}
allocatedPodUids := m.podDevices.pods()
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
if len(podsToBeRemoved) <= 0 {
return
}
glog.V(3).Infof("pods to be removed: %v", podsToBeRemoved.List())
m.podDevices.delete(podsToBeRemoved.List())
// Regenerate allocatedDevices after we update pod allocation information.
m.allocatedDevices = m.podDevices.devices()
}
// devicesToAllocate returns the list of device IDs that need to be allocated via an Allocate rpc call.
// It returns nil when no Allocate rpc call needs to be issued.
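// Illustrative walk-through (hypothetical numbers): if a container needs 3
// devices, 1 can be reused from an init container and 2 healthy devices are
// still unallocated, the returned set contains the reused device plus 2
// devices drawn from the free pool; if fewer than 2 were free, an error is
// returned instead.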
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
m.mutex.Lock()
defer m.mutex.Unlock()
needed := required
// Gets list of devices that have already been allocated.
// This can happen if a container restarts for example.
devices := m.podDevices.containerDevices(podUID, contName, resource)
if devices != nil {
glog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, contName, podUID, devices.List())
needed = needed - devices.Len()
// A pod's resource is not expected to change once admitted by the API server,
// so just fail loudly here. We can revisit this part if this no longer holds.
if needed != 0 {
return nil, fmt.Errorf("pod %v container %v changed request for resource %v from %v to %v", podUID, contName, resource, devices.Len(), required)
}
}
if needed == 0 {
// No change, no work.
return nil, nil
}
glog.V(3).Infof("Needs to allocate %v %v for pod %q container %q", needed, resource, podUID, contName)
// Needs to allocate additional devices.
if _, ok := m.allDevices[resource]; !ok {
return nil, fmt.Errorf("can't allocate unregistered device %v", resource)
}
devices = sets.NewString()
// Allocates from reusableDevices list first.
for device := range reusableDevices {
devices.Insert(device)
needed--
if needed == 0 {
return devices, nil
}
}
// Needs to allocate additional devices.
if m.allocatedDevices[resource] == nil {
m.allocatedDevices[resource] = sets.NewString()
}
// Gets Devices in use.
devicesInUse := m.allocatedDevices[resource]
// Gets a list of available devices.
available := m.allDevices[resource].Difference(devicesInUse)
if int(available.Len()) < needed {
return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
}
allocated := available.UnsortedList()[:needed]
// Updates m.allocatedDevices with allocated devices to prevent them
// from being allocated to other pods/containers, given that we are
// not holding lock during the rpc call.
for _, device := range allocated {
m.allocatedDevices[resource].Insert(device)
devices.Insert(device)
}
return devices, nil
}
// allocateContainerResources attempts to allocate all of required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,
// and updates the cached containerDevices on success.
func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container, devicesToReuse map[string]sets.String) error {
podUID := string(pod.UID)
contName := container.Name
allocatedDevicesUpdated := false
for k, v := range container.Resources.Limits {
resource := string(k)
needed := int(v.Value())
glog.V(3).Infof("needs %d %s", needed, resource)
_, registeredResource := m.allDevices[resource]
_, allocatedResource := m.allocatedDevices[resource]
// Continues if this is neither an active device plugin resource nor
// a resource we have previously allocated.
if !registeredResource && !allocatedResource {
continue
}
// Updates allocatedDevices to garbage collect any stranded resources
// before doing the device plugin allocation.
if !allocatedDevicesUpdated {
m.updateAllocatedDevices(m.activePods())
allocatedDevicesUpdated = true
}
allocDevices, err := m.devicesToAllocate(podUID, contName, resource, needed, devicesToReuse[resource])
if err != nil {
return err
}
if allocDevices == nil || len(allocDevices) <= 0 {
continue
}
startRPCTime := time.Now()
// devicePluginManager.Allocate involves RPC calls to device plugin, which
// could be heavy-weight. Therefore we want to perform this operation outside
// mutex lock. Note if Allocate call fails, we may leave container resources
// partially allocated for the failed container. We rely on updateAllocatedDevices()
// to garbage collect these resources later. Another side effect is that if
// we have X resource A and Y resource B in total, and two containers, container1
// and container2 both require X resource A and Y resource B. Both allocation
// requests may fail if we serve them in mixed order.
// TODO: may revisit this part later if we see inefficient resource allocation
// in real use as the result of this. Should also consider parallelizing device
// plugin Allocate grpc calls if it becomes common that a container requires
// resources from multiple device plugins.
m.mutex.Lock()
e, ok := m.endpoints[resource]
m.mutex.Unlock()
if !ok {
m.mutex.Lock()
m.allocatedDevices = m.podDevices.devices()
m.mutex.Unlock()
return fmt.Errorf("Unknown Device Plugin %s", resource)
}
devs := allocDevices.UnsortedList()
glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
resp, err := e.allocate(devs)
metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
if err != nil {
// In case of allocation failure, we want to restore m.allocatedDevices
// to the actual allocated state from m.podDevices.
m.mutex.Lock()
m.allocatedDevices = m.podDevices.devices()
m.mutex.Unlock()
return err
}
// Update internal cached podDevices state.
m.mutex.Lock()
m.podDevices.insert(podUID, contName, resource, allocDevices, resp)
m.mutex.Unlock()
}
// Checkpoints device to container allocation information.
return m.writeCheckpoint()
}
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. Nil is returned in case no cached state is found.
func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions {
m.mutex.Lock()
defer m.mutex.Unlock()
return m.podDevices.deviceRunContainerOptions(string(pod.UID), container.Name)
}
// sanitizeNodeAllocatable scans through allocatedDevices in the device manager
// and, if necessary, updates allocatableResource in nodeInfo so that it is at least
// equal to the allocated capacity. This allows pods that have already been scheduled on
// the node to pass GeneralPredicates admission checking even upon device plugin failure.
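// Illustrative example (hypothetical values): if two devices of
// "vendor.example.com/gpu" are recorded as allocated but the cached node only
// advertises one in its allocatable ScalarResources, the entry is raised to
// two so already-admitted pods keep fitting; entries that are already large
// enough are left untouched.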
func (m *ManagerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) {
var newAllocatableResource *schedulercache.Resource
allocatableResource := node.AllocatableResource()
if allocatableResource.ScalarResources == nil {
allocatableResource.ScalarResources = make(map[v1.ResourceName]int64)
}
for resource, devices := range m.allocatedDevices {
needed := devices.Len()
quant, ok := allocatableResource.ScalarResources[v1.ResourceName(resource)]
if ok && int(quant) >= needed {
continue
}
// Needs to update nodeInfo.AllocatableResource to make sure
// NodeInfo.allocatableResource at least equal to the capacity already allocated.
if newAllocatableResource == nil {
newAllocatableResource = allocatableResource.Clone()
}
newAllocatableResource.ScalarResources[v1.ResourceName(resource)] = int64(needed)
}
if newAllocatableResource != nil {
node.SetAllocatableResource(newAllocatableResource)
}
}


@ -0,0 +1,63 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"k8s.io/api/core/v1"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// ManagerStub provides a simple stub implementation for the Device Manager.
type ManagerStub struct{}
// NewManagerStub creates a ManagerStub.
func NewManagerStub() (*ManagerStub, error) {
return &ManagerStub{}, nil
}
// Start simply returns nil.
func (h *ManagerStub) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
return nil
}
// Stop simply returns nil.
func (h *ManagerStub) Stop() error {
return nil
}
// Devices returns an empty map.
func (h *ManagerStub) Devices() map[string][]pluginapi.Device {
return make(map[string][]pluginapi.Device)
}
// Allocate simply returns nil.
func (h *ManagerStub) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return nil
}
// GetDeviceRunContainerOptions simply returns nil.
func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions {
return nil
}
// GetCapacity simply returns nil capacity and empty removed resource list.
func (h *ManagerStub) GetCapacity() (v1.ResourceList, []string) {
return nil, []string{}
}


@ -0,0 +1,658 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"flag"
"fmt"
"io/ioutil"
"os"
"reflect"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
const (
socketName = "/tmp/device_plugin/server.sock"
pluginSocketName = "/tmp/device_plugin/device-plugin.sock"
testResourceName = "fake-domain/resource"
)
func TestNewManagerImpl(t *testing.T) {
_, err := newManagerImpl(socketName)
require.NoError(t, err)
}
func TestNewManagerImplStart(t *testing.T) {
m, p := setup(t, []*pluginapi.Device{}, func(n string, a, u, r []pluginapi.Device) {})
cleanup(t, m, p)
}
// Tests that the device plugin manager correctly handles registration and re-registration by
// making sure that after registration, devices are correctly updated; if a re-registration
// happens, devices are NOT deleted and no orphaned devices are left.
func TestDevicePluginReRegistration(t *testing.T) {
devs := []*pluginapi.Device{
{ID: "Dev1", Health: pluginapi.Healthy},
{ID: "Dev2", Health: pluginapi.Healthy},
}
devsForRegistration := []*pluginapi.Device{
{ID: "Dev3", Health: pluginapi.Healthy},
}
callbackCount := 0
callbackChan := make(chan int)
var stopping int32
stopping = 0
callback := func(n string, a, u, r []pluginapi.Device) {
// Should be called three times, once for each plugin registration, until we are stopping.
if callbackCount > 2 && atomic.LoadInt32(&stopping) <= 0 {
t.FailNow()
}
callbackCount++
callbackChan <- callbackCount
}
m, p1 := setup(t, devs, callback)
p1.Register(socketName, testResourceName)
// Wait for the first callback to be issued.
<-callbackChan
// Wait till the endpoint is added to the manager.
for i := 0; i < 20; i++ {
if len(m.Devices()) > 0 {
break
}
time.Sleep(100 * time.Millisecond)
}
devices := m.Devices()
require.Equal(t, 2, len(devices[testResourceName]), "Devices are not updated.")
p2 := NewDevicePluginStub(devs, pluginSocketName+".new")
err := p2.Start()
require.NoError(t, err)
p2.Register(socketName, testResourceName)
// Wait for the second callback to be issued.
<-callbackChan
devices2 := m.Devices()
require.Equal(t, 2, len(devices2[testResourceName]), "Devices shouldn't change.")
// Test the scenario that a plugin re-registers with different devices.
p3 := NewDevicePluginStub(devsForRegistration, pluginSocketName+".third")
err = p3.Start()
require.NoError(t, err)
p3.Register(socketName, testResourceName)
// Wait for the third callback to be issued.
<-callbackChan
devices3 := m.Devices()
require.Equal(t, 1, len(devices3[testResourceName]), "Devices of plugin previously registered should be removed.")
// Wait long enough to catch unexpected callbacks.
time.Sleep(5 * time.Second)
atomic.StoreInt32(&stopping, 1)
p2.Stop()
p3.Stop()
cleanup(t, m, p1)
}
func setup(t *testing.T, devs []*pluginapi.Device, callback monitorCallback) (Manager, *Stub) {
m, err := newManagerImpl(socketName)
require.NoError(t, err)
m.callback = callback
activePods := func() []*v1.Pod {
return []*v1.Pod{}
}
err = m.Start(activePods, &sourcesReadyStub{})
require.NoError(t, err)
p := NewDevicePluginStub(devs, pluginSocketName)
err = p.Start()
require.NoError(t, err)
return m, p
}
func cleanup(t *testing.T, m Manager, p *Stub) {
p.Stop()
m.Stop()
}
func TestUpdateCapacity(t *testing.T) {
testManager, err := newManagerImpl(socketName)
as := assert.New(t)
as.NotNil(testManager)
as.Nil(err)
devs := []pluginapi.Device{
{ID: "Device1", Health: pluginapi.Healthy},
{ID: "Device2", Health: pluginapi.Healthy},
{ID: "Device3", Health: pluginapi.Unhealthy},
}
callback := testManager.genericDeviceUpdateCallback
// Adds three devices for resource1, two healthy and one unhealthy.
// Expects capacity for resource1 to be 2.
resourceName1 := "domain1.com/resource1"
testManager.endpoints[resourceName1] = &endpointImpl{devices: make(map[string]pluginapi.Device)}
callback(resourceName1, devs, []pluginapi.Device{}, []pluginapi.Device{})
capacity, removedResources := testManager.GetCapacity()
resource1Capacity, ok := capacity[v1.ResourceName(resourceName1)]
as.True(ok)
as.Equal(int64(2), resource1Capacity.Value())
as.Equal(0, len(removedResources))
// Deletes an unhealthy device should NOT change capacity.
callback(resourceName1, []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[2]})
capacity, removedResources = testManager.GetCapacity()
resource1Capacity, ok = capacity[v1.ResourceName(resourceName1)]
as.True(ok)
as.Equal(int64(2), resource1Capacity.Value())
as.Equal(0, len(removedResources))
// Updates a healthy device to unhealthy should reduce capacity by 1.
dev2 := devs[1]
dev2.Health = pluginapi.Unhealthy
callback(resourceName1, []pluginapi.Device{}, []pluginapi.Device{dev2}, []pluginapi.Device{})
capacity, removedResources = testManager.GetCapacity()
resource1Capacity, ok = capacity[v1.ResourceName(resourceName1)]
as.True(ok)
as.Equal(int64(1), resource1Capacity.Value())
as.Equal(0, len(removedResources))
// Deletes a healthy device should reduce capacity by 1.
callback(resourceName1, []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[0]})
capacity, removedResources = testManager.GetCapacity()
resource1Capacity, ok = capacity[v1.ResourceName(resourceName1)]
as.True(ok)
as.Equal(int64(0), resource1Capacity.Value())
as.Equal(0, len(removedResources))
// Tests adding another resource.
resourceName2 := "resource2"
testManager.endpoints[resourceName2] = &endpointImpl{devices: make(map[string]pluginapi.Device)}
callback(resourceName2, devs, []pluginapi.Device{}, []pluginapi.Device{})
capacity, removedResources = testManager.GetCapacity()
as.Equal(2, len(capacity))
resource2Capacity, ok := capacity[v1.ResourceName(resourceName2)]
as.True(ok)
as.Equal(int64(2), resource2Capacity.Value())
as.Equal(0, len(removedResources))
// Removes resourceName1 endpoint. Verifies testManager.GetCapacity() reports that resourceName1
// is removed from capacity and it no longer exists in allDevices after the call.
delete(testManager.endpoints, resourceName1)
capacity, removed := testManager.GetCapacity()
as.Equal([]string{resourceName1}, removed)
_, ok = capacity[v1.ResourceName(resourceName1)]
as.False(ok)
val, ok := capacity[v1.ResourceName(resourceName2)]
as.True(ok)
as.Equal(int64(2), val.Value())
_, ok = testManager.allDevices[resourceName1]
as.False(ok)
}
type stringPairType struct {
value1 string
value2 string
}
func constructDevices(devices []string) sets.String {
ret := sets.NewString()
for _, dev := range devices {
ret.Insert(dev)
}
return ret
}
func constructAllocResp(devices, mounts, envs map[string]string) *pluginapi.AllocateResponse {
resp := &pluginapi.AllocateResponse{}
for k, v := range devices {
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
HostPath: k,
ContainerPath: v,
Permissions: "mrw",
})
}
for k, v := range mounts {
resp.Mounts = append(resp.Mounts, &pluginapi.Mount{
ContainerPath: k,
HostPath: v,
ReadOnly: true,
})
}
resp.Envs = make(map[string]string)
for k, v := range envs {
resp.Envs[k] = v
}
return resp
}
func TestCheckpoint(t *testing.T) {
resourceName1 := "domain1.com/resource1"
resourceName2 := "domain2.com/resource2"
as := assert.New(t)
tmpDir, err := ioutil.TempDir("", "checkpoint")
as.Nil(err)
defer os.RemoveAll(tmpDir)
testManager := &ManagerImpl{
socketdir: tmpDir,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
testManager.podDevices.insert("pod1", "con1", resourceName1,
constructDevices([]string{"dev1", "dev2"}),
constructAllocResp(map[string]string{"/dev/r1dev1": "/dev/r1dev1", "/dev/r1dev2": "/dev/r1dev2"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testManager.podDevices.insert("pod1", "con1", resourceName2,
constructDevices([]string{"dev1", "dev2"}),
constructAllocResp(map[string]string{"/dev/r2dev1": "/dev/r2dev1", "/dev/r2dev2": "/dev/r2dev2"},
map[string]string{"/home/r2lib1": "/usr/r2lib1"},
map[string]string{"r2devices": "dev1 dev2"}))
testManager.podDevices.insert("pod1", "con2", resourceName1,
constructDevices([]string{"dev3"}),
constructAllocResp(map[string]string{"/dev/r1dev3": "/dev/r1dev3"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testManager.podDevices.insert("pod2", "con1", resourceName1,
constructDevices([]string{"dev4"}),
constructAllocResp(map[string]string{"/dev/r1dev4": "/dev/r1dev4"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testManager.allDevices[resourceName1] = sets.NewString()
testManager.allDevices[resourceName1].Insert("dev1")
testManager.allDevices[resourceName1].Insert("dev2")
testManager.allDevices[resourceName1].Insert("dev3")
testManager.allDevices[resourceName1].Insert("dev4")
testManager.allDevices[resourceName1].Insert("dev5")
testManager.allDevices[resourceName2] = sets.NewString()
testManager.allDevices[resourceName2].Insert("dev1")
testManager.allDevices[resourceName2].Insert("dev2")
expectedPodDevices := testManager.podDevices
expectedAllocatedDevices := testManager.podDevices.devices()
expectedAllDevices := testManager.allDevices
err = testManager.writeCheckpoint()
as.Nil(err)
testManager.podDevices = make(podDevices)
err = testManager.readCheckpoint()
as.Nil(err)
as.Equal(len(expectedPodDevices), len(testManager.podDevices))
for podUID, containerDevices := range expectedPodDevices {
for conName, resources := range containerDevices {
for resource := range resources {
as.True(reflect.DeepEqual(
expectedPodDevices.containerDevices(podUID, conName, resource),
testManager.podDevices.containerDevices(podUID, conName, resource)))
opts1 := expectedPodDevices.deviceRunContainerOptions(podUID, conName)
opts2 := testManager.podDevices.deviceRunContainerOptions(podUID, conName)
as.Equal(len(opts1.Envs), len(opts2.Envs))
as.Equal(len(opts1.Mounts), len(opts2.Mounts))
as.Equal(len(opts1.Devices), len(opts2.Devices))
}
}
}
as.True(reflect.DeepEqual(expectedAllocatedDevices, testManager.allocatedDevices))
as.True(reflect.DeepEqual(expectedAllDevices, testManager.allDevices))
}
type activePodsStub struct {
activePods []*v1.Pod
}
func (a *activePodsStub) getActivePods() []*v1.Pod {
return a.activePods
}
func (a *activePodsStub) updateActivePods(newPods []*v1.Pod) {
a.activePods = newPods
}
type MockEndpoint struct {
allocateFunc func(devs []string) (*pluginapi.AllocateResponse, error)
}
func (m *MockEndpoint) stop() {}
func (m *MockEndpoint) run() {}
func (m *MockEndpoint) getDevices() []pluginapi.Device {
return []pluginapi.Device{}
}
func (m *MockEndpoint) callback(resourceName string, added, updated, deleted []pluginapi.Device) {}
func (m *MockEndpoint) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
if m.allocateFunc != nil {
return m.allocateFunc(devs)
}
return nil, nil
}
func TestPodContainerDeviceAllocation(t *testing.T) {
flag.Set("alsologtostderr", fmt.Sprintf("%t", true))
var logLevel string
flag.StringVar(&logLevel, "logLevel", "4", "test")
flag.Lookup("v").Value.Set(logLevel)
resourceName1 := "domain1.com/resource1"
resourceQuantity1 := *resource.NewQuantity(int64(2), resource.DecimalSI)
devID1 := "dev1"
devID2 := "dev2"
resourceName2 := "domain2.com/resource2"
resourceQuantity2 := *resource.NewQuantity(int64(1), resource.DecimalSI)
devID3 := "dev3"
devID4 := "dev4"
as := require.New(t)
monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {}
podsStub := activePodsStub{
activePods: []*v1.Pod{},
}
cachedNode := &v1.Node{
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{},
},
}
nodeInfo := &schedulercache.NodeInfo{}
nodeInfo.SetNode(cachedNode)
tmpDir, err := ioutil.TempDir("", "checkpoint")
as.Nil(err)
defer os.RemoveAll(tmpDir)
testManager := &ManagerImpl{
socketdir: tmpDir,
callback: monitorCallback,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
endpoints: make(map[string]endpoint),
podDevices: make(podDevices),
activePods: podsStub.getActivePods,
sourcesReady: &sourcesReadyStub{},
}
testManager.allDevices[resourceName1] = sets.NewString()
testManager.allDevices[resourceName1].Insert(devID1)
testManager.allDevices[resourceName1].Insert(devID2)
testManager.allDevices[resourceName2] = sets.NewString()
testManager.allDevices[resourceName2].Insert(devID3)
testManager.allDevices[resourceName2].Insert(devID4)
testManager.endpoints[resourceName1] = &MockEndpoint{
allocateFunc: func(devs []string) (*pluginapi.AllocateResponse, error) {
resp := new(pluginapi.AllocateResponse)
resp.Envs = make(map[string]string)
for _, dev := range devs {
switch dev {
case "dev1":
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
ContainerPath: "/dev/aaa",
HostPath: "/dev/aaa",
Permissions: "mrw",
})
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
ContainerPath: "/dev/bbb",
HostPath: "/dev/bbb",
Permissions: "mrw",
})
resp.Mounts = append(resp.Mounts, &pluginapi.Mount{
ContainerPath: "/container_dir1/file1",
HostPath: "host_dir1/file1",
ReadOnly: true,
})
case "dev2":
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
ContainerPath: "/dev/ccc",
HostPath: "/dev/ccc",
Permissions: "mrw",
})
resp.Mounts = append(resp.Mounts, &pluginapi.Mount{
ContainerPath: "/container_dir1/file2",
HostPath: "host_dir1/file2",
ReadOnly: true,
})
resp.Envs["key1"] = "val1"
}
}
return resp, nil
},
}
testManager.endpoints[resourceName2] = &MockEndpoint{
allocateFunc: func(devs []string) (*pluginapi.AllocateResponse, error) {
resp := new(pluginapi.AllocateResponse)
resp.Envs = make(map[string]string)
for _, dev := range devs {
switch dev {
case "dev3":
resp.Envs["key2"] = "val2"
case "dev4":
resp.Envs["key2"] = "val3"
}
}
return resp, nil
},
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity1,
v1.ResourceName("cpu"): resourceQuantity1,
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
podsStub.updateActivePods([]*v1.Pod{pod})
err = testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: pod})
as.Nil(err)
runContainerOpts := testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0])
as.NotNil(runContainerOpts)
as.Equal(len(runContainerOpts.Devices), 3)
as.Equal(len(runContainerOpts.Mounts), 2)
as.Equal(len(runContainerOpts.Envs), 2)
// Requesting to create a pod without enough resources should fail.
as.Equal(2, testManager.allocatedDevices[resourceName1].Len())
failPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity2,
},
},
},
},
},
}
err = testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: failPod})
as.NotNil(err)
runContainerOpts2 := testManager.GetDeviceRunContainerOptions(failPod, &failPod.Spec.Containers[0])
as.Nil(runContainerOpts2)
// Requesting to create a new pod with a single resourceName2 should succeed.
newPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
err = testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: newPod})
as.Nil(err)
runContainerOpts3 := testManager.GetDeviceRunContainerOptions(newPod, &newPod.Spec.Containers[0])
as.Equal(1, len(runContainerOpts3.Envs))
// Requesting to create a pod that requests resourceName1 in init containers and normal containers
// should succeed with devices allocated to init containers reallocated to normal containers.
podWithPluginResourcesInInitContainers := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
InitContainers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity2,
},
},
},
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity1,
},
},
},
},
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity2,
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity2,
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
podsStub.updateActivePods([]*v1.Pod{podWithPluginResourcesInInitContainers})
err = testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: podWithPluginResourcesInInitContainers})
as.Nil(err)
podUID := string(podWithPluginResourcesInInitContainers.UID)
initCont1 := podWithPluginResourcesInInitContainers.Spec.InitContainers[0].Name
initCont2 := podWithPluginResourcesInInitContainers.Spec.InitContainers[1].Name
normalCont1 := podWithPluginResourcesInInitContainers.Spec.Containers[0].Name
normalCont2 := podWithPluginResourcesInInitContainers.Spec.Containers[1].Name
initCont1Devices := testManager.podDevices.containerDevices(podUID, initCont1, resourceName1)
initCont2Devices := testManager.podDevices.containerDevices(podUID, initCont2, resourceName1)
normalCont1Devices := testManager.podDevices.containerDevices(podUID, normalCont1, resourceName1)
normalCont2Devices := testManager.podDevices.containerDevices(podUID, normalCont2, resourceName1)
as.True(initCont2Devices.IsSuperset(initCont1Devices))
as.True(initCont2Devices.IsSuperset(normalCont1Devices))
as.True(initCont2Devices.IsSuperset(normalCont2Devices))
as.Equal(0, normalCont1Devices.Intersection(normalCont2Devices).Len())
}
func TestSanitizeNodeAllocatable(t *testing.T) {
resourceName1 := "domain1.com/resource1"
devID1 := "dev1"
resourceName2 := "domain2.com/resource2"
devID2 := "dev2"
as := assert.New(t)
monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {}
testManager := &ManagerImpl{
callback: monitorCallback,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
// require one of resource1 and one of resource2
testManager.allocatedDevices[resourceName1] = sets.NewString()
testManager.allocatedDevices[resourceName1].Insert(devID1)
testManager.allocatedDevices[resourceName2] = sets.NewString()
testManager.allocatedDevices[resourceName2].Insert(devID2)
cachedNode := &v1.Node{
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{
// has no resource1 and two of resource2
v1.ResourceName(resourceName2): *resource.NewQuantity(int64(2), resource.DecimalSI),
},
},
}
nodeInfo := &schedulercache.NodeInfo{}
nodeInfo.SetNode(cachedNode)
testManager.sanitizeNodeAllocatable(nodeInfo)
allocatableScalarResources := nodeInfo.AllocatableResource().ScalarResources
// allocatable in nodeInfo is less than needed, should update
as.Equal(1, int(allocatableScalarResources[v1.ResourceName(resourceName1)]))
// allocatable in nodeInfo is more than needed, should skip updating
as.Equal(2, int(allocatableScalarResources[v1.ResourceName(resourceName2)]))
}


@ -0,0 +1,257 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/util/sets"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
type deviceAllocateInfo struct {
// deviceIds contains device Ids allocated to this container for the given resourceName.
deviceIds sets.String
// allocResp contains cached rpc AllocateResponse.
allocResp *pluginapi.AllocateResponse
}
type resourceAllocateInfo map[string]deviceAllocateInfo // Keyed by resourceName.
type containerDevices map[string]resourceAllocateInfo // Keyed by containerName.
type podDevices map[string]containerDevices // Keyed by podUID.
func (pdev podDevices) pods() sets.String {
ret := sets.NewString()
for k := range pdev {
ret.Insert(k)
}
return ret
}
func (pdev podDevices) insert(podUID, contName, resource string, devices sets.String, resp *pluginapi.AllocateResponse) {
if _, podExists := pdev[podUID]; !podExists {
pdev[podUID] = make(containerDevices)
}
if _, contExists := pdev[podUID][contName]; !contExists {
pdev[podUID][contName] = make(resourceAllocateInfo)
}
pdev[podUID][contName][resource] = deviceAllocateInfo{
deviceIds: devices,
allocResp: resp,
}
}
func (pdev podDevices) delete(pods []string) {
for _, uid := range pods {
delete(pdev, uid)
}
}
// Returns list of device Ids allocated to the given container for the given resource.
// Returns nil if we don't have cached state for the given <podUID, contName, resource>.
func (pdev podDevices) containerDevices(podUID, contName, resource string) sets.String {
if _, podExists := pdev[podUID]; !podExists {
return nil
}
if _, contExists := pdev[podUID][contName]; !contExists {
return nil
}
devs, resourceExists := pdev[podUID][contName][resource]
if !resourceExists {
return nil
}
return devs.deviceIds
}
// Populates allocatedResources with the device resources allocated to the specified <podUID, contName>.
func (pdev podDevices) addContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.String) {
containers, exists := pdev[podUID]
if !exists {
return
}
resources, exists := containers[contName]
if !exists {
return
}
for resource, devices := range resources {
allocatedResources[resource] = allocatedResources[resource].Union(devices.deviceIds)
}
}
// Removes the device resources allocated to the specified <podUID, contName> from allocatedResources.
func (pdev podDevices) removeContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.String) {
containers, exists := pdev[podUID]
if !exists {
return
}
resources, exists := containers[contName]
if !exists {
return
}
for resource, devices := range resources {
allocatedResources[resource] = allocatedResources[resource].Difference(devices.deviceIds)
}
}
// Returns all devices allocated to the pods being tracked, keyed by resourceName.
func (pdev podDevices) devices() map[string]sets.String {
ret := make(map[string]sets.String)
for _, containerDevices := range pdev {
for _, resources := range containerDevices {
for resource, devices := range resources {
if _, exists := ret[resource]; !exists {
ret[resource] = sets.NewString()
}
ret[resource] = ret[resource].Union(devices.deviceIds)
}
}
}
return ret
}
// podDevicesCheckpointEntry is used to record <pod, container> to device allocation information.
type podDevicesCheckpointEntry struct {
PodUID string
ContainerName string
ResourceName string
DeviceIDs []string
AllocResp []byte
}
// Turns podDevices to checkpointData.
func (pdev podDevices) toCheckpointData() []podDevicesCheckpointEntry {
var data []podDevicesCheckpointEntry
for podUID, containerDevices := range pdev {
for conName, resources := range containerDevices {
for resource, devices := range resources {
devIds := devices.deviceIds.UnsortedList()
if devices.allocResp == nil {
glog.Errorf("Can't marshal allocResp for %v %v %v: allocation response is missing", podUID, conName, resource)
continue
}
allocResp, err := devices.allocResp.Marshal()
if err != nil {
glog.Errorf("Can't marshal allocResp for %v %v %v: %v", podUID, conName, resource, err)
continue
}
data = append(data, podDevicesCheckpointEntry{podUID, conName, resource, devIds, allocResp})
}
}
}
return data
}
// Populates podDevices from the passed in checkpointData.
func (pdev podDevices) fromCheckpointData(data []podDevicesCheckpointEntry) {
for _, entry := range data {
glog.V(2).Infof("Get checkpoint entry: %v %v %v %v %v\n",
entry.PodUID, entry.ContainerName, entry.ResourceName, entry.DeviceIDs, entry.AllocResp)
devIDs := sets.NewString()
for _, devID := range entry.DeviceIDs {
devIDs.Insert(devID)
}
allocResp := &pluginapi.AllocateResponse{}
err := allocResp.Unmarshal(entry.AllocResp)
if err != nil {
glog.Errorf("Can't unmarshal allocResp for %v %v %v: %v", entry.PodUID, entry.ContainerName, entry.ResourceName, err)
continue
}
pdev.insert(entry.PodUID, entry.ContainerName, entry.ResourceName, devIDs, allocResp)
}
}
// Returns combined container runtime settings to consume the container's allocated devices.
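// Duplicate env variables, container device paths and container mount paths
// reported by different resources are emitted only once; conflicting values
// for the same key are logged and the value seen first wins.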
func (pdev podDevices) deviceRunContainerOptions(podUID, contName string) *DeviceRunContainerOptions {
containers, exists := pdev[podUID]
if !exists {
return nil
}
resources, exists := containers[contName]
if !exists {
return nil
}
opts := &DeviceRunContainerOptions{}
// Maps to detect duplicate settings.
devsMap := make(map[string]string)
mountsMap := make(map[string]string)
envsMap := make(map[string]string)
// Loops through AllocationResponses of all cached device resources.
for _, devices := range resources {
resp := devices.allocResp
// Each Allocate response has the following artifacts:
//   - environment variables
//   - mount points
//   - device files
// These artifacts are per resource per container.
// Updates RunContainerOptions.Envs.
for k, v := range resp.Envs {
if e, ok := envsMap[k]; ok {
glog.V(3).Infof("skip existing env %s %s", k, v)
if e != v {
glog.Errorf("Environment variable %s has conflicting setting: %s and %s", k, e, v)
}
continue
}
glog.V(4).Infof("add env %s %s", k, v)
envsMap[k] = v
opts.Envs = append(opts.Envs, kubecontainer.EnvVar{Name: k, Value: v})
}
// Updates RunContainerOptions.Devices.
for _, dev := range resp.Devices {
if d, ok := devsMap[dev.ContainerPath]; ok {
glog.V(3).Infof("skip existing device %s %s", dev.ContainerPath, dev.HostPath)
if d != dev.HostPath {
glog.Errorf("Container device %s has conflicting mapping host devices: %s and %s",
dev.ContainerPath, d, dev.HostPath)
}
continue
}
glog.V(4).Infof("add device %s %s", dev.ContainerPath, dev.HostPath)
devsMap[dev.ContainerPath] = dev.HostPath
opts.Devices = append(opts.Devices, kubecontainer.DeviceInfo{
PathOnHost: dev.HostPath,
PathInContainer: dev.ContainerPath,
Permissions: dev.Permissions,
})
}
// Updates RunContainerOptions.Mounts.
for _, mount := range resp.Mounts {
if m, ok := mountsMap[mount.ContainerPath]; ok {
glog.V(3).Infof("skip existing mount %s %s", mount.ContainerPath, mount.HostPath)
if m != mount.HostPath {
glog.Errorf("Container mount %s has conflicting mapping host mounts: %s and %s",
mount.ContainerPath, m, mount.HostPath)
}
continue
}
glog.V(4).Infof("add mount %s %s", mount.ContainerPath, mount.HostPath)
mountsMap[mount.ContainerPath] = mount.HostPath
opts.Mounts = append(opts.Mounts, kubecontainer.Mount{
Name: mount.ContainerPath,
ContainerPath: mount.ContainerPath,
HostPath: mount.HostPath,
ReadOnly: mount.ReadOnly,
// TODO: This may need to be part of Device plugin API.
SELinuxRelabel: false,
})
}
}
return opts
}


@ -0,0 +1,96 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"k8s.io/api/core/v1"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// Manager manages all the Device Plugins running on a node.
type Manager interface {
// Start starts device plugin registration service.
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error
// Devices is the map of devices that have registered themselves
// against the manager.
// The map key is the ResourceName of the device plugins.
Devices() map[string][]pluginapi.Device
// Allocate configures and assigns devices to pods. The pods are provided
// through the pod admission attributes in the attrs argument. From the
// requested device resources, Allocate will communicate with the owning
// device plugin to allow setup procedures to take place, and for the
// device plugin to provide runtime settings to use the device (environment
// variables, mount points and device files). The node object is provided
// for the device manager to update the node capacity to reflect the
// currently available devices.
Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error
// Stop stops the manager.
Stop() error
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. Nil is returned in case no cached state is found.
GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions
// GetCapacity returns the amount of available device plugin resource capacity
// and inactive device plugin resources previously registered on the node.
GetCapacity() (v1.ResourceList, []string)
}
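// Illustrative usage sketch (not part of the upstream code): a hypothetical
// caller holding a Manager value, here the stub implementation, could drive
// the interface roughly like this (pod, node and attrs assumed to exist in
// the caller):
//
//	stub, _ := NewManagerStub()
//	var mgr Manager = stub
//	capacity, removed := mgr.GetCapacity()
//	if err := mgr.Allocate(node, attrs); err != nil {
//		return err
//	}
//	opts := mgr.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0])
//	_, _, _ = capacity, removed, opts
//
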
// DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices.
type DeviceRunContainerOptions struct {
// The environment variables list.
Envs []kubecontainer.EnvVar
// The mounts for the container.
Mounts []kubecontainer.Mount
// The host devices mapped into the container.
Devices []kubecontainer.DeviceInfo
}
// TODO: evaluate whether we need these error definitions.
const (
// errFailedToDialDevicePlugin is the error raised when the device plugin could not be
// reached on the registered socket
errFailedToDialDevicePlugin = "failed to dial device plugin:"
// errUnsuportedVersion is the error raised when the device plugin uses an API version not
// supported by the Kubelet registry
errUnsuportedVersion = "requested API version %q is not supported by kubelet. Supported version is %q"
// errDevicePluginAlreadyExists is the error raised when a device plugin with the
// same Resource Name tries to register itself
errDevicePluginAlreadyExists = "another device plugin already registered this Resource Name"
// errInvalidResourceName is the error raised when a device plugin is registering
// itself with an invalid ResourceName
errInvalidResourceName = "the ResourceName %q is invalid"
// errEmptyResourceName is the error raised when the resource name field is empty
errEmptyResourceName = "invalid Empty ResourceName"
// errBadSocket is the error raised when the registry socket path is not absolute
errBadSocket = "bad socketPath, must be an absolute path:"
// errRemoveSocket is the error raised when the registry could not remove the existing socket
errRemoveSocket = "failed to remove socket while starting device plugin registry, with error"
// errListenSocket is the error raised when the registry could not listen on the socket
errListenSocket = "failed to listen to socket while starting device plugin registry, with error"
// errListAndWatch is the error raised when ListAndWatch ended unsuccessfully
errListAndWatch = "listAndWatch ended unexpectedly for device plugin %s with error %v"
)


@ -0,0 +1,39 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
)
func NewFakeInternalContainerLifecycle() *fakeInternalContainerLifecycle {
return &fakeInternalContainerLifecycle{}
}
type fakeInternalContainerLifecycle struct{}
func (f *fakeInternalContainerLifecycle) PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
return nil
}
func (f *fakeInternalContainerLifecycle) PreStopContainer(containerID string) error {
return nil
}
func (f *fakeInternalContainerLifecycle) PostStopContainer(containerID string) error {
return nil
}


@ -0,0 +1,230 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/api/v1/resource"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
)
const (
// Taken from lmctfy https://github.com/google/lmctfy/blob/master/lmctfy/controllers/cpu_controller.cc
MinShares = 2
SharesPerCPU = 1024
MilliCPUToCPU = 1000
// 100000 is equivalent to 100ms
QuotaPeriod = 100000
MinQuotaPeriod = 1000
)
// MilliCPUToQuota converts milliCPU to CFS quota and period values.
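// For example (illustrative): 250 milliCPU yields quota = 250*100000/1000 =
// 25000 (25ms of CPU time per 100ms period), i.e. at most 25% of one CPU.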
func MilliCPUToQuota(milliCPU int64) (quota int64, period uint64) {
// CFS quota is measured in two values:
// - cfs_period_us=100ms (the amount of time to measure usage across)
// - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
// so in the above example, you are limited to 20% of a single CPU
// for multi-cpu environments, you just scale equivalent amounts
if milliCPU == 0 {
return
}
// we set the period to 100ms by default
period = QuotaPeriod
// we then convert your milliCPU to a value normalized over a period
quota = (milliCPU * QuotaPeriod) / MilliCPUToCPU
// quota needs to be a minimum of 1ms.
if quota < MinQuotaPeriod {
quota = MinQuotaPeriod
}
return
}
// MilliCPUToShares converts the milliCPU to CFS shares.
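// For example (illustrative): 500 milliCPU yields 500*1024/1000 = 512 shares,
// while 0 milliCPU yields MinShares (2).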
func MilliCPUToShares(milliCPU int64) uint64 {
if milliCPU == 0 {
// Docker converts zero milliCPU to unset, which maps to kernel default
// for unset: 1024. Return 2 here to really match kernel default for
// zero milliCPU.
return MinShares
}
// Conceptually (milliCPU / MilliCPUToCPU) * SharesPerCPU, but factored to improve rounding.
shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU
if shares < MinShares {
return MinShares
}
return uint64(shares)
}
// HugePageLimits converts the API representation to a map
// from huge page size (in bytes) to huge page limit (in bytes).
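// For example (illustrative): a resource list with hugepages-2Mi: 4Mi and
// hugepages-1Gi: 2Gi yields {2097152: 4194304, 1073741824: 2147483648}.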
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
hugePageLimits := map[int64]int64{}
for k, v := range resourceList {
if v1helper.IsHugePageResourceName(k) {
pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
if value, exists := hugePageLimits[pageSize.Value()]; exists {
hugePageLimits[pageSize.Value()] = value + v.Value()
} else {
hugePageLimits[pageSize.Value()] = v.Value()
}
}
}
return hugePageLimits
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
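// Illustrative example (hypothetical pod): a Burstable pod whose single
// container requests 250m CPU and is limited to 500m CPU and 128Mi memory gets
// CpuShares=256, CpuQuota=50000, CpuPeriod=100000 and Memory=134217728, while
// a BestEffort pod gets only CpuShares=MinShares.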
func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
// sum requests and limits.
reqs, limits := resource.PodRequestsAndLimits(pod)
cpuRequests := int64(0)
cpuLimits := int64(0)
memoryLimits := int64(0)
if request, found := reqs[v1.ResourceCPU]; found {
cpuRequests = request.MilliValue()
}
if limit, found := limits[v1.ResourceCPU]; found {
cpuLimits = limit.MilliValue()
}
if limit, found := limits[v1.ResourceMemory]; found {
memoryLimits = limit.Value()
}
// convert to CFS values
cpuShares := MilliCPUToShares(cpuRequests)
cpuQuota, cpuPeriod := MilliCPUToQuota(cpuLimits)
// track if limits were applied for each resource.
memoryLimitsDeclared := true
cpuLimitsDeclared := true
// map hugepage pagesize (bytes) to limits (bytes)
hugePageLimits := map[int64]int64{}
for _, container := range pod.Spec.Containers {
if container.Resources.Limits.Cpu().IsZero() {
cpuLimitsDeclared = false
}
if container.Resources.Limits.Memory().IsZero() {
memoryLimitsDeclared = false
}
containerHugePageLimits := HugePageLimits(container.Resources.Requests)
for k, v := range containerHugePageLimits {
if value, exists := hugePageLimits[k]; exists {
hugePageLimits[k] = value + v
} else {
hugePageLimits[k] = v
}
}
}
// determine the qos class
qosClass := v1qos.GetPodQOS(pod)
// build the result
result := &ResourceConfig{}
if qosClass == v1.PodQOSGuaranteed {
result.CpuShares = &cpuShares
result.CpuQuota = &cpuQuota
result.CpuPeriod = &cpuPeriod
result.Memory = &memoryLimits
} else if qosClass == v1.PodQOSBurstable {
result.CpuShares = &cpuShares
if cpuLimitsDeclared {
result.CpuQuota = &cpuQuota
result.CpuPeriod = &cpuPeriod
}
if memoryLimitsDeclared {
result.Memory = &memoryLimits
}
} else {
shares := uint64(MinShares)
result.CpuShares = &shares
}
result.HugePageLimit = hugePageLimits
return result
}
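// Editorial sketch of the QoS-dependent output: a Burstable pod with a single
// container requesting 100m CPU and declaring no limits gets only CpuShares
// set; Guaranteed pods also get CpuQuota, CpuPeriod and Memory (the pod
// variable below is hypothetical; see the tests that follow for concrete
// fixtures):
//
//	cfg := ResourceConfigForPod(burstablePod)
//	// *cfg.CpuShares == 102; cfg.CpuQuota, cfg.CpuPeriod and cfg.Memory are nil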
// GetCgroupSubsystems returns information about the mounted cgroup subsystems
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
// get all cgroup mounts.
allCgroups, err := libcontainercgroups.GetCgroupMounts(true)
if err != nil {
return &CgroupSubsystems{}, err
}
if len(allCgroups) == 0 {
return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts")
}
mountPoints := make(map[string]string, len(allCgroups))
for _, mount := range allCgroups {
for _, subsystem := range mount.Subsystems {
mountPoints[subsystem] = mount.Mountpoint
}
}
return &CgroupSubsystems{
Mounts: allCgroups,
MountPoints: mountPoints,
}, nil
}
// getCgroupProcs takes a cgroup directory name as an argument,
// reads through the cgroup's procs file and returns a list of tgids.
// It returns an empty list if the procs file doesn't exist.
func getCgroupProcs(dir string) ([]int, error) {
procsFile := filepath.Join(dir, "cgroup.procs")
f, err := os.Open(procsFile)
if err != nil {
if os.IsNotExist(err) {
// The procsFile does not exist, so no pids are attached to this directory.
return []int{}, nil
}
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
out := []int{}
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
if err != nil {
return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err)
}
out = append(out, pid)
}
}
return out, nil
}
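// Editorial sketch (the mount path below is an assumption for illustration):
//
//	pids, err := getCgroupProcs("/sys/fs/cgroup/cpu/kubepods")
//	if err == nil {
//		fmt.Printf("pids attached: %v\n", pids)
//	}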
// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
func GetPodCgroupNameSuffix(podUID types.UID) string {
return podCgroupNamePrefix + string(podUID)
}

View File

@ -0,0 +1,199 @@
// +build linux
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"reflect"
"testing"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
// getResourceList returns a ResourceList with the
// specified cpu and memory resource values
func getResourceList(cpu, memory string) v1.ResourceList {
res := v1.ResourceList{}
if cpu != "" {
res[v1.ResourceCPU] = resource.MustParse(cpu)
}
if memory != "" {
res[v1.ResourceMemory] = resource.MustParse(memory)
}
return res
}
// getResourceRequirements returns a ResourceRequirements object
func getResourceRequirements(requests, limits v1.ResourceList) v1.ResourceRequirements {
res := v1.ResourceRequirements{}
res.Requests = requests
res.Limits = limits
return res
}
func TestResourceConfigForPod(t *testing.T) {
minShares := uint64(MinShares)
burstableShares := MilliCPUToShares(100)
memoryQuantity := resource.MustParse("200Mi")
burstableMemory := memoryQuantity.Value()
burstablePartialShares := MilliCPUToShares(200)
burstableQuota, burstablePeriod := MilliCPUToQuota(200)
guaranteedShares := MilliCPUToShares(100)
guaranteedQuota, guaranteedPeriod := MilliCPUToQuota(100)
memoryQuantity = resource.MustParse("100Mi")
guaranteedMemory := memoryQuantity.Value()
testCases := map[string]struct {
pod *v1.Pod
expected *ResourceConfig
}{
"besteffort": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("", ""), getResourceList("", "")),
},
},
},
},
expected: &ResourceConfig{CpuShares: &minShares},
},
"burstable-no-limits": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
},
},
},
},
expected: &ResourceConfig{CpuShares: &burstableShares},
},
"burstable-with-limits": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
},
},
},
expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &burstablePeriod, Memory: &burstableMemory},
},
"burstable-partial-limits": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
},
},
},
},
expected: &ResourceConfig{CpuShares: &burstablePartialShares},
},
"guaranteed": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
},
},
},
},
expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedPeriod, Memory: &guaranteedMemory},
},
}
for testName, testCase := range testCases {
actual := ResourceConfigForPod(testCase.pod)
if !reflect.DeepEqual(actual.CpuPeriod, testCase.expected.CpuPeriod) {
t.Errorf("unexpected result, test: %v, cpu period not as expected", testName)
}
if !reflect.DeepEqual(actual.CpuQuota, testCase.expected.CpuQuota) {
t.Errorf("unexpected result, test: %v, cpu quota not as expected", testName)
}
if !reflect.DeepEqual(actual.CpuShares, testCase.expected.CpuShares) {
t.Errorf("unexpected result, test: %v, cpu shares not as expected", testName)
}
if !reflect.DeepEqual(actual.Memory, testCase.expected.Memory) {
t.Errorf("unexpected result, test: %v, memory not as expected", testName)
}
}
}
func TestMilliCPUToQuota(t *testing.T) {
testCases := []struct {
input int64
quota int64
period uint64
}{
{
input: int64(0),
quota: int64(0),
period: uint64(0),
},
{
input: int64(5),
quota: int64(1000),
period: uint64(100000),
},
{
input: int64(9),
quota: int64(1000),
period: uint64(100000),
},
{
input: int64(10),
quota: int64(1000),
period: uint64(100000),
},
{
input: int64(200),
quota: int64(20000),
period: uint64(100000),
},
{
input: int64(500),
quota: int64(50000),
period: uint64(100000),
},
{
input: int64(1000),
quota: int64(100000),
period: uint64(100000),
},
{
input: int64(1500),
quota: int64(150000),
period: uint64(100000),
},
}
for _, testCase := range testCases {
quota, period := MilliCPUToQuota(testCase.input)
if quota != testCase.quota || period != testCase.period {
t.Errorf("Input %v, expected quota %v period %v, but got quota %v period %v", testCase.input, testCase.quota, testCase.period, quota, period)
}
}
}

View File

@ -0,0 +1,62 @@
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
const (
MinShares = 0
SharesPerCPU = 0
MilliCPUToCPU = 0
QuotaPeriod = 0
MinQuotaPeriod = 0
)
// MilliCPUToQuota converts milliCPU to CFS quota and period values.
func MilliCPUToQuota(milliCPU int64) (int64, int64) {
return 0, 0
}
// MilliCPUToShares converts the milliCPU to CFS shares.
func MilliCPUToShares(milliCPU int64) int64 {
return 0
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
return nil
}
// GetCgroupSubsystems returns information about the mounted cgroup subsystems
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
return nil, nil
}
func getCgroupProcs(dir string) ([]int, error) {
return nil, nil
}
// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
func GetPodCgroupNameSuffix(podUID types.UID) string {
return ""
}

View File

@ -0,0 +1,57 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
)
type InternalContainerLifecycle interface {
PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error
PreStopContainer(containerID string) error
PostStopContainer(containerID string) error
}
// Implements InternalContainerLifecycle interface.
type internalContainerLifecycleImpl struct {
cpuManager cpumanager.Manager
}
func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
return i.cpuManager.AddContainer(pod, container, containerID)
}
return nil
}
func (i *internalContainerLifecycleImpl) PreStopContainer(containerID string) error {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
return i.cpuManager.RemoveContainer(containerID)
}
return nil
}
func (i *internalContainerLifecycleImpl) PostStopContainer(containerID string) error {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
return i.cpuManager.RemoveContainer(containerID)
}
return nil
}

View File

@ -0,0 +1,252 @@
// +build linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"strings"
"time"
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/events"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
const (
defaultNodeAllocatableCgroupName = "kubepods"
)
func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
cgroupConfig := &CgroupConfig{
Name: CgroupName(cm.cgroupRoot),
// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
ResourceParameters: getCgroupConfig(cm.capacity),
}
if cm.cgroupManager.Exists(cgroupConfig.Name) {
return nil
}
if err := cm.cgroupManager.Create(cgroupConfig); err != nil {
glog.Errorf("Failed to create %q cgroup", cm.cgroupRoot)
return err
}
return nil
}
// enforceNodeAllocatableCgroups enforces Node Allocatable cgroup settings.
func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
nc := cm.NodeConfig.NodeAllocatableConfig
// We need to update limits on node allocatable cgroup no matter what because
// default cpu shares on cgroups are low and can cause cpu starvation.
nodeAllocatable := cm.capacity
// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(NodeAllocatableEnforcementKey) {
nodeAllocatable = cm.getNodeAllocatableAbsolute()
}
glog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc)
cgroupConfig := &CgroupConfig{
Name: CgroupName(cm.cgroupRoot),
ResourceParameters: getCgroupConfig(nodeAllocatable),
}
// Use an ObjectReference for events, as the node may not be cached; refer to #42701 for details.
nodeRef := &v1.ObjectReference{
Kind: "Node",
Name: cm.nodeInfo.Name,
UID: types.UID(cm.nodeInfo.Name),
Namespace: "",
}
// If Node Allocatable is enforced on a node that has not been drained, or is updated on an existing node to a lower value,
// existing memory usage across pods might be higher than the current Node Allocatable memory limits.
// Pod evictions are expected to bring down memory usage to below Node Allocatable limits.
// Until evictions happen, retry cgroup updates.
// Update limits on the non-root cgroup root to be safe, since the default limits for CPU can be too low.
if cm.cgroupRoot != "/" {
go func() {
for {
err := cm.cgroupManager.Update(cgroupConfig)
if err == nil {
cm.recorder.Event(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated Node Allocatable limit across pods")
return
}
message := fmt.Sprintf("Failed to update Node Allocatable Limits %q: %v", cm.cgroupRoot, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
time.Sleep(time.Minute)
}
}()
}
// Now apply kube reserved and system reserved limits if required.
if nc.EnforceNodeAllocatable.Has(SystemReservedEnforcementKey) {
glog.V(2).Infof("Enforcing System reserved on cgroup %q with limits: %+v", nc.SystemReservedCgroupName, nc.SystemReserved)
if err := enforceExistingCgroup(cm.cgroupManager, nc.SystemReservedCgroupName, nc.SystemReserved); err != nil {
message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return fmt.Errorf(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName)
}
if nc.EnforceNodeAllocatable.Has(KubeReservedEnforcementKey) {
glog.V(2).Infof("Enforcing kube reserved on cgroup %q with limits: %+v", nc.KubeReservedCgroupName, nc.KubeReserved)
if err := enforceExistingCgroup(cm.cgroupManager, nc.KubeReservedCgroupName, nc.KubeReserved); err != nil {
message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return fmt.Errorf(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName)
}
return nil
}
// enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
func enforceExistingCgroup(cgroupManager CgroupManager, cName string, rl v1.ResourceList) error {
cgroupConfig := &CgroupConfig{
Name: CgroupName(cName),
ResourceParameters: getCgroupConfig(rl),
}
glog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares and %d bytes of memory", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory)
if !cgroupManager.Exists(cgroupConfig.Name) {
return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name)
}
if err := cgroupManager.Update(cgroupConfig); err != nil {
return err
}
return nil
}
// getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
// TODO(vishh): Set CPU Quota if necessary.
if rl == nil {
return nil
}
var rc ResourceConfig
if q, exists := rl[v1.ResourceMemory]; exists {
// Memory is defined in bytes.
val := q.Value()
rc.Memory = &val
}
if q, exists := rl[v1.ResourceCPU]; exists {
// CPU is defined in milli-cores.
val := MilliCPUToShares(q.MilliValue())
rc.CpuShares = &val
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
rc.HugePageLimit = HugePageLimits(rl)
}
return &rc
}
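// Editorial sketch: a reservation of 500m CPU and 1Gi memory converts to 512
// CPU shares and a 1073741824-byte memory limit:
//
//	rl := v1.ResourceList{
//		v1.ResourceCPU:    resource.MustParse("500m"),
//		v1.ResourceMemory: resource.MustParse("1Gi"),
//	}
//	rc := getCgroupConfig(rl) // *rc.CpuShares == 512, *rc.Memory == 1073741824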
// getNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
// Note that not all resources that are available on the node are included in the returned list of resources.
// Returns a ResourceList.
func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
result := make(v1.ResourceList)
for k, v := range cm.capacity {
value := *(v.Copy())
if cm.NodeConfig.SystemReserved != nil {
value.Sub(cm.NodeConfig.SystemReserved[k])
}
if cm.NodeConfig.KubeReserved != nil {
value.Sub(cm.NodeConfig.KubeReserved[k])
}
if value.Sign() < 0 {
// Negative Allocatable resources don't make sense.
value.Set(0)
}
result[k] = value
}
return result
}
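// Worked example (editorial): with a capacity of 10 CPU / 10Gi, system-reserved
// 50m / 50Mi and kube-reserved 100m / 100Mi, the enforced allocatable works out
// to 9850m CPU and 10090Mi memory (10240Mi - 150Mi), matching the enforcement
// test cases further below.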
// GetNodeAllocatableReservation returns the amount of compute and storage resources that have to be reserved on this node from scheduling.
func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
result := make(v1.ResourceList)
for k := range cm.capacity {
value := resource.NewQuantity(0, resource.DecimalSI)
if cm.NodeConfig.SystemReserved != nil {
value.Add(cm.NodeConfig.SystemReserved[k])
}
if cm.NodeConfig.KubeReserved != nil {
value.Add(cm.NodeConfig.KubeReserved[k])
}
if evictionReservation != nil {
value.Add(evictionReservation[k])
}
if !value.IsZero() {
result[k] = *value
}
}
return result
}
// hardEvictionReservation returns a resourcelist that includes reservation of resources based on hard eviction thresholds.
func hardEvictionReservation(thresholds []evictionapi.Threshold, capacity v1.ResourceList) v1.ResourceList {
if len(thresholds) == 0 {
return nil
}
ret := v1.ResourceList{}
for _, threshold := range thresholds {
if threshold.Operator != evictionapi.OpLessThan {
continue
}
switch threshold.Signal {
case evictionapi.SignalMemoryAvailable:
memoryCapacity := capacity[v1.ResourceMemory]
value := evictionapi.GetThresholdQuantity(threshold.Value, &memoryCapacity)
ret[v1.ResourceMemory] = *value
case evictionapi.SignalNodeFsAvailable:
storageCapacity := capacity[v1.ResourceEphemeralStorage]
value := evictionapi.GetThresholdQuantity(threshold.Value, &storageCapacity)
ret[v1.ResourceEphemeralStorage] = *value
}
}
return ret
}
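// Editorial sketch (hundredMi is a hypothetical resource.MustParse("100Mi")
// quantity): a memory.available hard threshold reserves exactly that quantity:
//
//	thresholds := []evictionapi.Threshold{{
//		Signal:   evictionapi.SignalMemoryAvailable,
//		Operator: evictionapi.OpLessThan,
//		Value:    evictionapi.ThresholdValue{Quantity: &hundredMi},
//	}}
//	hardEvictionReservation(thresholds, capacity) // reserves 100Mi of memory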
// validateNodeAllocatable ensures that the user specified Node Allocatable Configuration doesn't reserve more than the node capacity.
// Returns error if the configuration is invalid, nil otherwise.
func (cm *containerManagerImpl) validateNodeAllocatable() error {
var errors []string
nar := cm.GetNodeAllocatableReservation()
for k, v := range nar {
value := cm.capacity[k].DeepCopy()
value.Sub(v)
if value.Sign() < 0 {
errors = append(errors, fmt.Sprintf("Resource %q has an allocatable of %v, capacity of %v", k, v, value))
}
}
if len(errors) > 0 {
return fmt.Errorf("Invalid Node Allocatable configuration. %s", strings.Join(errors, " "))
}
return nil
}

View File

@ -0,0 +1,406 @@
// +build linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
func TestNodeAllocatableReservationForScheduling(t *testing.T) {
memoryEvictionThreshold := resource.MustParse("100Mi")
cpuMemCases := []struct {
kubeReserved v1.ResourceList
systemReserved v1.ResourceList
expected v1.ResourceList
capacity v1.ResourceList
hardThreshold evictionapi.ThresholdValue
}{
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("150m", "150Mi"),
},
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &memoryEvictionThreshold,
},
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("150m", "250Mi"),
},
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
hardThreshold: evictionapi.ThresholdValue{
Percentage: 0.05,
},
expected: getResourceList("150m", "694157320"),
},
{
kubeReserved: v1.ResourceList{},
systemReserved: v1.ResourceList{},
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("", ""),
},
{
kubeReserved: getResourceList("", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("50m", "150Mi"),
},
{
kubeReserved: getResourceList("50m", "100Mi"),
systemReserved: getResourceList("", "50Mi"),
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("50m", "150Mi"),
},
{
kubeReserved: getResourceList("", "100Mi"),
systemReserved: getResourceList("", "50Mi"),
capacity: getResourceList("10", ""),
expected: getResourceList("", "150Mi"),
},
}
for idx, tc := range cpuMemCases {
nc := NodeConfig{
NodeAllocatableConfig: NodeAllocatableConfig{
KubeReserved: tc.kubeReserved,
SystemReserved: tc.systemReserved,
HardEvictionThresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: tc.hardThreshold,
},
},
},
}
cm := &containerManagerImpl{
NodeConfig: nc,
capacity: tc.capacity,
}
for k, v := range cm.GetNodeAllocatableReservation() {
expected, exists := tc.expected[k]
assert.True(t, exists, "test case %d expected resource %q", idx+1, k)
assert.Equal(t, expected.MilliValue(), v.MilliValue(), "test case %d failed for resource %q", idx+1, k)
}
}
ephemeralStorageEvictionThreshold := resource.MustParse("100Mi")
ephemeralStorageTestCases := []struct {
kubeReserved v1.ResourceList
expected v1.ResourceList
capacity v1.ResourceList
hardThreshold evictionapi.ThresholdValue
}{
{
kubeReserved: getEphemeralStorageResourceList("100Mi"),
capacity: getEphemeralStorageResourceList("10Gi"),
expected: getEphemeralStorageResourceList("100Mi"),
},
{
kubeReserved: getEphemeralStorageResourceList("100Mi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &ephemeralStorageEvictionThreshold,
},
capacity: getEphemeralStorageResourceList("10Gi"),
expected: getEphemeralStorageResourceList("200Mi"),
},
{
kubeReserved: getEphemeralStorageResourceList("150Mi"),
capacity: getEphemeralStorageResourceList("10Gi"),
hardThreshold: evictionapi.ThresholdValue{
Percentage: 0.05,
},
expected: getEphemeralStorageResourceList("694157320"),
},
{
kubeReserved: v1.ResourceList{},
capacity: getEphemeralStorageResourceList("10Gi"),
expected: getEphemeralStorageResourceList(""),
},
}
for idx, tc := range ephemeralStorageTestCases {
nc := NodeConfig{
NodeAllocatableConfig: NodeAllocatableConfig{
KubeReserved: tc.kubeReserved,
HardEvictionThresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: tc.hardThreshold,
},
},
},
}
cm := &containerManagerImpl{
NodeConfig: nc,
capacity: tc.capacity,
}
for k, v := range cm.GetNodeAllocatableReservation() {
expected, exists := tc.expected[k]
assert.True(t, exists, "test case %d expected resource %q", idx+1, k)
assert.Equal(t, expected.MilliValue(), v.MilliValue(), "test case %d failed for resource %q", idx+1, k)
}
}
}
func TestNodeAllocatableForEnforcement(t *testing.T) {
memoryEvictionThreshold := resource.MustParse("100Mi")
testCases := []struct {
kubeReserved v1.ResourceList
systemReserved v1.ResourceList
capacity v1.ResourceList
expected v1.ResourceList
hardThreshold evictionapi.ThresholdValue
}{
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("9850m", "10090Mi"),
},
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &memoryEvictionThreshold,
},
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("9850m", "10090Mi"),
},
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
hardThreshold: evictionapi.ThresholdValue{
Percentage: 0.05,
},
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("9850m", "10090Mi"),
},
{
kubeReserved: v1.ResourceList{},
systemReserved: v1.ResourceList{},
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("9950m", "10090Mi"),
},
{
kubeReserved: getResourceList("50m", "100Mi"),
systemReserved: getResourceList("", "50Mi"),
capacity: getResourceList("10", "10Gi"),
expected: getResourceList("9950m", "10090Mi"),
},
{
kubeReserved: getResourceList("", "100Mi"),
systemReserved: getResourceList("", "50Mi"),
capacity: getResourceList("10", ""),
expected: getResourceList("10", ""),
},
}
for idx, tc := range testCases {
nc := NodeConfig{
NodeAllocatableConfig: NodeAllocatableConfig{
KubeReserved: tc.kubeReserved,
SystemReserved: tc.systemReserved,
HardEvictionThresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: tc.hardThreshold,
},
},
},
}
cm := &containerManagerImpl{
NodeConfig: nc,
capacity: tc.capacity,
}
for k, v := range cm.getNodeAllocatableAbsolute() {
expected, exists := tc.expected[k]
assert.True(t, exists)
assert.Equal(t, expected.MilliValue(), v.MilliValue(), "test case %d failed for resource %q", idx+1, k)
}
}
}
func TestNodeAllocatableInputValidation(t *testing.T) {
memoryEvictionThreshold := resource.MustParse("100Mi")
highMemoryEvictionThreshold := resource.MustParse("2Gi")
cpuMemTestCases := []struct {
kubeReserved v1.ResourceList
systemReserved v1.ResourceList
capacity v1.ResourceList
hardThreshold evictionapi.ThresholdValue
invalidConfiguration bool
}{
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &memoryEvictionThreshold,
},
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
hardThreshold: evictionapi.ThresholdValue{
Percentage: 0.05,
},
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: v1.ResourceList{},
systemReserved: v1.ResourceList{},
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("50m", "100Mi"),
systemReserved: getResourceList("", "50Mi"),
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("", "100Mi"),
systemReserved: getResourceList("", "50Mi"),
capacity: getResourceList("10", ""),
},
{
kubeReserved: getResourceList("5", "10Gi"),
systemReserved: getResourceList("5", "10Gi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &highMemoryEvictionThreshold,
},
capacity: getResourceList("10", "11Gi"),
invalidConfiguration: true,
},
}
for _, tc := range cpuMemTestCases {
nc := NodeConfig{
NodeAllocatableConfig: NodeAllocatableConfig{
KubeReserved: tc.kubeReserved,
SystemReserved: tc.systemReserved,
HardEvictionThresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: tc.hardThreshold,
},
},
},
}
cm := &containerManagerImpl{
NodeConfig: nc,
capacity: tc.capacity,
}
err := cm.validateNodeAllocatable()
if err == nil && tc.invalidConfiguration {
t.Logf("Expected invalid node allocatable configuration")
t.FailNow()
} else if err != nil && !tc.invalidConfiguration {
t.Logf("Expected valid node allocatable configuration: %v", err)
t.FailNow()
}
}
ephemeralStorageEvictionThreshold := resource.MustParse("100Mi")
ephemeralStorageTestCases := []struct {
kubeReserved v1.ResourceList
capacity v1.ResourceList
hardThreshold evictionapi.ThresholdValue
invalidConfiguration bool
}{
{
kubeReserved: getEphemeralStorageResourceList("100Mi"),
capacity: getEphemeralStorageResourceList("500Mi"),
},
{
kubeReserved: getEphemeralStorageResourceList("20Gi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &ephemeralStorageEvictionThreshold,
},
capacity: getEphemeralStorageResourceList("20Gi"),
invalidConfiguration: true,
},
}
for _, tc := range ephemeralStorageTestCases {
nc := NodeConfig{
NodeAllocatableConfig: NodeAllocatableConfig{
KubeReserved: tc.kubeReserved,
HardEvictionThresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: tc.hardThreshold,
},
},
},
}
cm := &containerManagerImpl{
NodeConfig: nc,
capacity: tc.capacity,
}
err := cm.validateNodeAllocatable()
if err == nil && tc.invalidConfiguration {
t.Logf("Expected invalid node allocatable configuration")
t.FailNow()
} else if err != nil && !tc.invalidConfiguration {
t.Logf("Expected valid node allocatable configuration: %v", err)
t.FailNow()
}
}
}
// getEphemeralStorageResourceList returns a ResourceList with the
// specified ephemeral storage resource values
func getEphemeralStorageResourceList(storage string) v1.ResourceList {
res := v1.ResourceList{}
if storage != "" {
res[v1.ResourceEphemeralStorage] = resource.MustParse(storage)
}
return res
}

View File

@ -0,0 +1,271 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"io/ioutil"
"os"
"path"
"strings"
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
)
const (
podCgroupNamePrefix = "pod"
)
// podContainerManagerImpl implements the podContainerManager interface.
// It is the general implementation, which allows pod level container
// management when the QoS cgroup hierarchy is enabled.
type podContainerManagerImpl struct {
// qosContainersInfo holds the absolute paths of the top level qos containers
qosContainersInfo QOSContainersInfo
// Stores the mounted cgroup subsystems
subsystems *CgroupSubsystems
// cgroupManager is the cgroup Manager Object responsible for managing all
// pod cgroups.
cgroupManager CgroupManager
}
// Make sure that podContainerManagerImpl implements the PodContainerManager interface
var _ PodContainerManager = &podContainerManagerImpl{}
// applyLimits sets pod cgroup resource limits
// It also updates the resource limits on top level qos containers.
func (m *podContainerManagerImpl) applyLimits(pod *v1.Pod) error {
// This function will house the logic for setting the resource parameters
// on the pod container config and updating top level qos container configs
return nil
}
// Exists checks if the pod's cgroup already exists
func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
podContainerName, _ := m.GetPodContainerName(pod)
return m.cgroupManager.Exists(podContainerName)
}
// EnsureExists takes a pod as an argument and makes sure that the
// pod's cgroup exists if the qos cgroup hierarchy flag is enabled.
// If the pod level container doesn't already exist, it is created.
func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
podContainerName, _ := m.GetPodContainerName(pod)
// check if the container already exists
alreadyExists := m.Exists(pod)
if !alreadyExists {
// Create the pod container
containerConfig := &CgroupConfig{
Name: podContainerName,
ResourceParameters: ResourceConfigForPod(pod),
}
if err := m.cgroupManager.Create(containerConfig); err != nil {
return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
}
}
// Apply appropriate resource limits on the pod container.
// Top level qos container limits are not updated
// until we figure out how to maintain the desired state in the kubelet,
// because maintaining the desired state is difficult without checkpointing.
if err := m.applyLimits(pod); err != nil {
return fmt.Errorf("failed to apply resource limits on container for %v : %v", podContainerName, err)
}
return nil
}
// GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
podQOS := v1qos.GetPodQOS(pod)
// Get the parent QOS container name
var parentContainer string
switch podQOS {
case v1.PodQOSGuaranteed:
parentContainer = m.qosContainersInfo.Guaranteed
case v1.PodQOSBurstable:
parentContainer = m.qosContainersInfo.Burstable
case v1.PodQOSBestEffort:
parentContainer = m.qosContainersInfo.BestEffort
}
podContainer := GetPodCgroupNameSuffix(pod.UID)
// Get the absolute path of the cgroup
cgroupName := (CgroupName)(path.Join(parentContainer, podContainer))
// Get the literal cgroupfs name
cgroupfsName := m.cgroupManager.Name(cgroupName)
return cgroupName, cgroupfsName
}
// Scan through the whole cgroup directory and kill all processes either
// attached to the pod cgroup or to a container cgroup under the pod cgroup
func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error {
pidsToKill := m.cgroupManager.Pids(podCgroup)
// No pids are charged to the terminated pod cgroup; return early.
if len(pidsToKill) == 0 {
return nil
}
var errlist []error
// os.Kill often errors out,
// so we try killing all the pids multiple times.
for i := 0; i < 5; i++ {
if i != 0 {
glog.V(3).Infof("Attempt %v failed to kill all unwanted process. Retyring", i)
}
errlist = []error{}
for _, pid := range pidsToKill {
p, err := os.FindProcess(pid)
if err != nil {
// Process not running anymore, do nothing
continue
}
glog.V(3).Infof("Attempt to kill process with pid: %v", pid)
if err := p.Kill(); err != nil {
glog.V(3).Infof("failed to kill process with pid: %v", pid)
errlist = append(errlist, err)
}
}
if len(errlist) == 0 {
glog.V(3).Infof("successfully killed all unwanted processes.")
return nil
}
}
return utilerrors.NewAggregate(errlist)
}
// Destroy destroys the pod container cgroup paths
func (m *podContainerManagerImpl) Destroy(podCgroup CgroupName) error {
// Try killing all the processes attached to the pod cgroup
if err := m.tryKillingCgroupProcesses(podCgroup); err != nil {
glog.V(3).Infof("failed to kill all the processes attached to the %v cgroups", podCgroup)
return fmt.Errorf("failed to kill all the processes attached to the %v cgroups : %v", podCgroup, err)
}
// Now it's safe to remove the pod's cgroup.
containerConfig := &CgroupConfig{
Name: podCgroup,
ResourceParameters: &ResourceConfig{},
}
if err := m.cgroupManager.Destroy(containerConfig); err != nil {
return fmt.Errorf("failed to delete cgroup paths for %v : %v", podCgroup, err)
}
return nil
}
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error {
return m.cgroupManager.ReduceCPULimits(podCgroup)
}
// GetAllPodsFromCgroups scans through all the subsystems of pod cgroups
// and gets the list of pods whose cgroups still exist on the cgroup mounts.
func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
// Map for storing all the found pods on the disk
foundPods := make(map[types.UID]CgroupName)
qosContainersList := [3]string{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
// Scan through all the subsystem mounts
// and through each QoS cgroup directory for each subsystem mount
// If a pod cgroup exists in even a single subsystem mount
// we will attempt to delete it
for _, val := range m.subsystems.MountPoints {
for _, qosContainerName := range qosContainersList {
// get the subsystem's QoS cgroup absolute name
qcConversion := m.cgroupManager.Name(CgroupName(qosContainerName))
qc := path.Join(val, qcConversion)
dirInfo, err := ioutil.ReadDir(qc)
if err != nil {
if os.IsNotExist(err) {
continue
}
return nil, fmt.Errorf("failed to read the cgroup directory %v : %v", qc, err)
}
for i := range dirInfo {
// it's not a directory, so continue on...
if !dirInfo[i].IsDir() {
continue
}
// convert the concrete cgroupfs name back to an internal identifier
// this is needed to handle path conversion for systemd environments.
// we pass the fully qualified path so decoding can work as expected
// since systemd encodes the path in each segment.
cgroupfsPath := path.Join(qcConversion, dirInfo[i].Name())
internalPath := m.cgroupManager.CgroupName(cgroupfsPath)
// we only care about base segment of the converted path since that
// is what we are reading currently to know if it is a pod or not.
basePath := path.Base(string(internalPath))
if !strings.Contains(basePath, podCgroupNamePrefix) {
continue
}
// we then split the name on the pod prefix to determine the uid
parts := strings.Split(basePath, podCgroupNamePrefix)
// the uid is missing; log and skip the unexpected cgroup that is not of the form pod<uid>
if len(parts) != 2 {
glog.Errorf("pod cgroup manager ignoring unexpected cgroup %v because it is not a pod", cgroupfsPath)
continue
}
podUID := parts[1]
foundPods[types.UID(podUID)] = internalPath
}
}
}
return foundPods, nil
}
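// Editorial sketch: a cgroupfs directory such as
// kubepods/burstable/pod8dbc5577 (UID shortened for illustration) decodes to
// the base segment "pod8dbc5577", which splits on the "pod" prefix to recover
// the pod UID "8dbc5577".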
// podContainerManagerNoop implements the podContainerManager interface.
// It is a no-op implementation that does nothing.
// podContainerManagerNoop is used when the QoS cgroup hierarchy is not
// enabled, so Exists() always returns true, as the cgroupRoot
// is expected to always exist.
type podContainerManagerNoop struct {
cgroupRoot CgroupName
}
// Make sure that podContainerManagerNoop implements the PodContainerManager interface
var _ PodContainerManager = &podContainerManagerNoop{}
func (m *podContainerManagerNoop) Exists(_ *v1.Pod) bool {
return true
}
func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error {
return nil
}
func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return m.cgroupRoot, string(m.cgroupRoot)
}
func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string {
return ""
}
// Destroy destroys the pod container cgroup paths
func (m *podContainerManagerNoop) Destroy(_ CgroupName) error {
return nil
}
func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error {
return nil
}
func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}

View File

@ -0,0 +1,51 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
type podContainerManagerStub struct {
}
var _ PodContainerManager = &podContainerManagerStub{}
func (m *podContainerManagerStub) Exists(_ *v1.Pod) bool {
return true
}
func (m *podContainerManagerStub) EnsureExists(_ *v1.Pod) error {
return nil
}
func (m *podContainerManagerStub) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return "", ""
}
func (m *podContainerManagerStub) Destroy(_ CgroupName) error {
return nil
}
func (m *podContainerManagerStub) ReduceCPULimits(_ CgroupName) error {
return nil
}
func (m *podContainerManagerStub) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}

View File

@ -0,0 +1,53 @@
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
type unsupportedPodContainerManager struct {
}
var _ PodContainerManager = &unsupportedPodContainerManager{}
func (m *unsupportedPodContainerManager) Exists(_ *v1.Pod) bool {
return true
}
func (m *unsupportedPodContainerManager) EnsureExists(_ *v1.Pod) error {
return nil
}
func (m *unsupportedPodContainerManager) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return "", ""
}
func (m *unsupportedPodContainerManager) ReduceCPULimits(_ CgroupName) error {
return nil
}
func (m *unsupportedPodContainerManager) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}
func (m *unsupportedPodContainerManager) Destroy(name CgroupName) error {
return nil
}

View File

@ -0,0 +1,359 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"path"
"strings"
"sync"
"time"
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/util/wait"
units "github.com/docker/go-units"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
"k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/api/v1/resource"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
)
const (
// how often the qos cgroup manager will perform periodic update
// of the qos level cgroup resource constraints
periodicQOSCgroupUpdateInterval = 1 * time.Minute
)
type QOSContainerManager interface {
Start(func() v1.ResourceList, ActivePodsFunc) error
GetQOSContainersInfo() QOSContainersInfo
UpdateCgroups() error
}
type qosContainerManagerImpl struct {
sync.Mutex
nodeInfo *v1.Node
qosContainersInfo QOSContainersInfo
subsystems *CgroupSubsystems
cgroupManager CgroupManager
activePods ActivePodsFunc
getNodeAllocatable func() v1.ResourceList
cgroupRoot CgroupName
qosReserved map[v1.ResourceName]int64
}
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
if !nodeConfig.CgroupsPerQOS {
return &qosContainerManagerNoop{
cgroupRoot: CgroupName(cgroupRoot),
}, nil
}
return &qosContainerManagerImpl{
subsystems: subsystems,
cgroupManager: cgroupManager,
cgroupRoot: CgroupName(cgroupRoot),
qosReserved: nodeConfig.ExperimentalQOSReserved,
}, nil
}
func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
return m.qosContainersInfo
}
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
cm := m.cgroupManager
rootContainer := string(m.cgroupRoot)
if !cm.Exists(CgroupName(rootContainer)) {
return fmt.Errorf("root container %s doesn't exist", rootContainer)
}
// Top level QoS containers are created only for the Burstable
// and Best Effort classes
qosClasses := map[v1.PodQOSClass]string{
v1.PodQOSBurstable: path.Join(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
v1.PodQOSBestEffort: path.Join(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
}
// Create containers for both qos classes
for qosClass, containerName := range qosClasses {
// get the container's absolute name
absoluteContainerName := CgroupName(containerName)
resourceParameters := &ResourceConfig{}
// the BestEffort QoS class has a statically configured minShares value
if qosClass == v1.PodQOSBestEffort {
minShares := uint64(MinShares)
resourceParameters.CpuShares = &minShares
}
// containerConfig object stores the cgroup specifications
containerConfig := &CgroupConfig{
Name: absoluteContainerName,
ResourceParameters: resourceParameters,
}
// for each enumerated huge page size, the qos tiers are unbounded
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
m.setHugePagesUnbounded(containerConfig)
}
// check if it exists
if !cm.Exists(absoluteContainerName) {
if err := cm.Create(containerConfig); err != nil {
return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
}
} else {
// to ensure we actually have the right state, we update the config on startup
if err := cm.Update(containerConfig); err != nil {
return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
}
}
}
// Store the top level qos container names
m.qosContainersInfo = QOSContainersInfo{
Guaranteed: rootContainer,
Burstable: qosClasses[v1.PodQOSBurstable],
BestEffort: qosClasses[v1.PodQOSBestEffort],
}
m.getNodeAllocatable = getNodeAllocatable
m.activePods = activePods
// update qos cgroup tiers on startup and in periodic intervals
// to ensure desired state is in sync with actual state.
go wait.Until(func() {
err := m.UpdateCgroups()
if err != nil {
glog.Warningf("[ContainerManager] Failed to reserve QoS requests: %v", err)
}
}, periodicQOSCgroupUpdateInterval, wait.NeverStop)
return nil
}
// setHugePagesUnbounded ensures hugetlb is effectively unbounded
func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
hugePageLimit := map[int64]int64{}
for _, pageSize := range cgroupfs.HugePageSizes {
pageSizeBytes, err := units.RAMInBytes(pageSize)
if err != nil {
return err
}
hugePageLimit[pageSizeBytes] = int64(1 << 62)
}
cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
return nil
}
func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
for _, v := range configs {
if err := m.setHugePagesUnbounded(v); err != nil {
return err
}
}
return nil
}
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
pods := m.activePods()
burstablePodCPURequest := int64(0)
for i := range pods {
pod := pods[i]
qosClass := v1qos.GetPodQOS(pod)
if qosClass != v1.PodQOSBurstable {
// we only care about the burstable qos tier
continue
}
req, _ := resource.PodRequestsAndLimits(pod)
if request, found := req[v1.ResourceCPU]; found {
burstablePodCPURequest += request.MilliValue()
}
}
// make sure best effort is always 2 shares
bestEffortCPUShares := uint64(MinShares)
configs[v1.PodQOSBestEffort].ResourceParameters.CpuShares = &bestEffortCPUShares
// set burstable shares based on the currently observed state
burstableCPUShares := MilliCPUToShares(burstablePodCPURequest)
configs[v1.PodQOSBurstable].ResourceParameters.CpuShares = &burstableCPUShares
return nil
}
// setMemoryReserve sums the memory requests of all pods in a QOS class,
// calculates the QOS class memory limits, and sets those limits in the
// CgroupConfig for each QOS class.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
qosMemoryRequests := map[v1.PodQOSClass]int64{
v1.PodQOSGuaranteed: 0,
v1.PodQOSBurstable: 0,
}
// Sum the memory requests of pods in each QOS class
pods := m.activePods()
for _, pod := range pods {
podMemoryRequest := int64(0)
qosClass := v1qos.GetPodQOS(pod)
if qosClass == v1.PodQOSBestEffort {
// requests are not set for Best Effort pods
continue
}
req, _ := resource.PodRequestsAndLimits(pod)
if request, found := req[v1.ResourceMemory]; found {
podMemoryRequest += request.Value()
}
qosMemoryRequests[qosClass] += podMemoryRequest
}
resources := m.getNodeAllocatable()
allocatableResource, ok := resources[v1.ResourceMemory]
if !ok {
glog.V(2).Infof("[Container Manager] Allocatable memory value could not be determined. Not setting QOS memory limts.")
return
}
allocatable := allocatableResource.Value()
if allocatable == 0 {
glog.V(2).Infof("[Container Manager] Memory allocatable reported as 0, might be in standalone mode. Not setting QOS memory limts.")
return
}
for qos, limits := range qosMemoryRequests {
glog.V(2).Infof("[Container Manager] %s pod requests total %d bytes (reserve %d%%)", qos, limits, percentReserve)
}
// Calculate QOS memory limits
burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
}
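// Worked example (editorial): with 10Gi allocatable, 1Gi of Guaranteed
// requests, 2Gi of Burstable requests and a 50% reserve, the Burstable limit
// becomes 10Gi - 0.5Gi = 9.5Gi and the BestEffort limit 9.5Gi - 1Gi = 8.5Gi.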
// retrySetMemoryReserve checks for any QoS cgroups over the limit
// that was attempted to be set in the first Update() and adjusts
// their memory limit to the usage to prevent further growth.
func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
// Unreclaimable memory usage may have already exceeded the desired limit.
// Attempt to set the limit near the current usage to put pressure
// on the cgroup and prevent further growth.
for qos, config := range configs {
stats, err := m.cgroupManager.GetResourceStats(config.Name)
if err != nil {
glog.V(2).Infof("[Container Manager] %v", err)
return
}
usage := stats.MemoryStats.Usage
// Because there is no good way to determine if the original Update()
// on the memory resource was successful, we determine failure of the
// first attempt by checking if the usage is above the limit we attempted
// to set. If it is, we assume the first attempt to set the limit failed
// and try again, setting the limit to the usage. Otherwise we leave
// the CgroupConfig as is.
if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
configs[qos].ResourceParameters.Memory = &usage
}
}
}
func (m *qosContainerManagerImpl) UpdateCgroups() error {
m.Lock()
defer m.Unlock()
qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
v1.PodQOSBurstable: {
Name: CgroupName(m.qosContainersInfo.Burstable),
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBestEffort: {
Name: CgroupName(m.qosContainersInfo.BestEffort),
ResourceParameters: &ResourceConfig{},
},
}
// update the qos level cgroup settings for cpu shares
if err := m.setCPUCgroupConfig(qosConfigs); err != nil {
return err
}
// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
if err := m.setHugePagesConfig(qosConfigs); err != nil {
return err
}
}
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.setMemoryReserve(qosConfigs, percentReserve)
}
}
updateSuccess := true
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
updateSuccess = false
}
}
if updateSuccess {
glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
return nil
}
// If the resource can adjust the ResourceConfig to increase likelihood of
// success, call the adjustment function here. Otherwise, the Update() will
// be called again with the same values.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.retrySetMemoryReserve(qosConfigs, percentReserve)
}
}
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
glog.Errorf("[ContainerManager]: Failed to update QoS cgroup configuration")
return err
}
}
glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration on retry")
return nil
}
type qosContainerManagerNoop struct {
cgroupRoot CgroupName
}
var _ QOSContainerManager = &qosContainerManagerNoop{}
func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
return nil
}
func (m *qosContainerManagerNoop) UpdateCgroups() error {
return nil
}

123
vendor/k8s.io/kubernetes/pkg/kubelet/cm/types.go generated vendored Normal file
View File

@ -0,0 +1,123 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
// ResourceConfig holds information about all the supported cgroup resource parameters.
type ResourceConfig struct {
// Memory limit (in bytes).
Memory *int64
// CPU shares (relative weight vs. other containers).
CpuShares *uint64
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota *int64
// CPU quota period.
CpuPeriod *uint64
// HugePageLimit map from page size (in bytes) to limit (in bytes)
HugePageLimit map[int64]int64
}
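// Editorial sketch: a config limiting a cgroup to half a CPU of quota and
// 128Mi of memory (values chosen for illustration):
//
//	var (
//		shares uint64 = 512
//		quota  int64  = 50000
//		period uint64 = 100000
//		mem    int64  = 128 * 1024 * 1024
//	)
//	_ = ResourceConfig{CpuShares: &shares, CpuQuota: &quota, CpuPeriod: &period, Memory: &mem}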
// CgroupName is the abstract name of a cgroup prior to any driver specific conversion.
type CgroupName string
// CgroupConfig holds the cgroup configuration information.
// This is a common object which is used to specify
// cgroup information to both the systemd and raw cgroup fs
// implementations of the Cgroup Manager interface.
type CgroupConfig struct {
// Fully qualified name prior to any driver specific conversions.
Name CgroupName
// ResourceParameters contains various cgroups settings to apply.
ResourceParameters *ResourceConfig
}
// MemoryStats holds the on-demand statistics from the memory cgroup
type MemoryStats struct {
// Memory usage (in bytes).
Usage int64
}
// ResourceStats holds on-demand statistics from various cgroup subsystems
type ResourceStats struct {
// Memory statistics.
MemoryStats *MemoryStats
}
// CgroupManager allows for cgroup management.
// Supports cgroup creation, deletion and updates.
type CgroupManager interface {
// Create creates and applies the cgroup configurations on the cgroup.
// It just creates the leaf cgroups.
// It expects the parent cgroup to already exist.
Create(*CgroupConfig) error
// Destroy the cgroup.
Destroy(*CgroupConfig) error
// Update cgroup configuration.
Update(*CgroupConfig) error
// Exists checks if the cgroup already exists
Exists(name CgroupName) bool
// Name returns the literal cgroupfs name on the host after any driver-specific conversions.
// We would expect the systemd implementation to make the appropriate name conversion.
// For example, if we pass /foo/bar
// then systemd should convert the name to something like
// foo.slice/foo-bar.slice
Name(name CgroupName) string
// CgroupName converts the literal cgroupfs name on the host to an internal identifier.
CgroupName(name string) CgroupName
// Pids scans through all subsystems to find the pids associated with the specified cgroup.
Pids(name CgroupName) []int
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
ReduceCPULimits(cgroupName CgroupName) error
// GetResourceStats returns statistics of the specified cgroup as read from the cgroup fs.
GetResourceStats(name CgroupName) (*ResourceStats, error)
}
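A sketch of how a caller might drive this interface, assuming mgr is some concrete CgroupManager implementation and config is a *CgroupConfig like the one sketched above; neither name is part of this file:
// Hypothetical usage; mgr and config are assumed to exist.
name := config.Name
if !mgr.Exists(name) {
	if err := mgr.Create(config); err != nil {
		return err
	}
} else if err := mgr.Update(config); err != nil {
	return err
}
// The driver-specific host name; a systemd manager might render
// /foo/bar as foo.slice/foo-bar.slice.
hostName := mgr.Name(name)
_ = hostName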
// QOSContainersInfo stores the names of the containers per QoS class.
type QOSContainersInfo struct {
Guaranteed string
BestEffort string
Burstable string
}
// PodContainerManager stores and manages pod level containers
// The Pod workers interact with the PodContainerManager to create and destroy
// containers for the pod.
type PodContainerManager interface {
// GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
GetPodContainerName(*v1.Pod) (CgroupName, string)
// EnsureExists takes a pod as an argument and makes sure that the
// pod's cgroup exists if the qos cgroup hierarchy flag is enabled.
// If the pod cgroup doesn't already exist, this method creates it.
EnsureExists(*v1.Pod) error
// Exists returns true if the pod cgroup exists.
Exists(*v1.Pod) bool
// Destroy takes a pod cgroup name as an argument and destroys the pod's cgroup.
Destroy(name CgroupName) error
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
ReduceCPULimits(name CgroupName) error
// GetAllPodsFromCgroups maps pod UIDs to their associated cgroups, based on the state of the cgroupfs system.
GetAllPodsFromCgroups() (map[types.UID]CgroupName, error)
}
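A sketch of the flow a pod worker might follow against this interface; pcm and pod are assumptions standing in for the surrounding kubelet machinery:
// Hypothetical flow; pcm is a PodContainerManager, pod is a *v1.Pod.
if err := pcm.EnsureExists(pod); err != nil {
	return err
}
cgroupName, cgroupfsName := pcm.GetPodContainerName(pod)
_ = cgroupfsName // literal cgroupfs path on the host
// ... run the pod's containers ...
if err := pcm.ReduceCPULimits(cgroupName); err != nil {
	return err
}
if err := pcm.Destroy(cgroupName); err != nil {
	return err
}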

39
vendor/k8s.io/kubernetes/pkg/kubelet/cm/util/BUILD generated vendored Normal file
View File

@ -0,0 +1,39 @@
package(default_visibility = ["//visibility:public"])
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
)
go_library(
name = "go_default_library",
srcs = [
"cgroups_unsupported.go",
] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"cgroups_linux.go",
],
"//conditions:default": [],
}),
importpath = "k8s.io/kubernetes/pkg/kubelet/cm/util",
deps = select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/utils:go_default_library",
],
"//conditions:default": [],
}),
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
)

76
vendor/k8s.io/kubernetes/pkg/kubelet/cm/util/cgroups_linux.go generated vendored Normal file
View File

@ -0,0 +1,76 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"path/filepath"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainerutils "github.com/opencontainers/runc/libcontainer/utils"
)
// GetPids gets the pids of the desired cgroup.
// Forked from opencontainers/runc/libcontainer/cgroups/fs.Manager.GetPids().
func GetPids(cgroupPath string) ([]int, error) {
dir, err := getCgroupPath(cgroupPath)
if err != nil {
return nil, err
}
return libcontainercgroups.GetPids(dir)
}
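A small sketch of a caller; the "/kubepods" path is an illustrative example, not a value from this commit:
// Hypothetical caller of GetPids.
pids, err := GetPids("/kubepods")
if err != nil {
	return err
}
for _, pid := range pids {
	_ = pid // e.g. signal or inspect each process in the cgroup
}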
// getCgroupPath gets the file path to the "devices" subsystem of the desired cgroup.
// cgroupPath is the path in the cgroup hierarchy.
func getCgroupPath(cgroupPath string) (string, error) {
cgroupPath = libcontainerutils.CleanPath(cgroupPath)
mnt, root, err := libcontainercgroups.FindCgroupMountpointAndRoot("devices")
// If the subsystem is not mounted, there is no point in building the path.
if err != nil {
return "", err
}
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(cgroupPath) {
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(root, mnt, cgroupPath), nil
}
parentPath, err := getCgroupParentPath(mnt, root)
if err != nil {
return "", err
}
return filepath.Join(parentPath, cgroupPath), nil
}
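For example, assuming the devices subsystem is mounted at /sys/fs/cgroup/devices with root /, an absolute cgroupPath such as /kubepods resolves directly to /sys/fs/cgroup/devices/kubepods, while a relative path is joined under the calling process's own cgroup directory as computed by getCgroupParentPath below.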
// getCgroupParentPath gets the parent filepath to this cgroup, for resolving relative cgroup paths.
func getCgroupParentPath(mountpoint, root string) (string, error) {
// Use the calling process's own cgroup (GetOwnCgroup) rather than the init
// process's cgroup, because the calling process could be running in a
// container that shares a pid namespace with the host, in which case
// /proc/1/cgroup could point to a whole other world of cgroups.
initPath, err := libcontainercgroups.GetOwnCgroup("devices")
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see paths from the host, which don't exist in the container.
relDir, err := filepath.Rel(root, initPath)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, relDir), nil
}
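Concretely, if the mountpoint is /sys/fs/cgroup/devices, root is /, and the calling process's own devices cgroup is /kubepods/burstable, then relDir comes out as kubepods/burstable and the parent path is /sys/fs/cgroup/devices/kubepods/burstable (these paths are illustrative).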

23
vendor/k8s.io/kubernetes/pkg/kubelet/cm/util/cgroups_unsupported.go generated vendored Normal file
View File

@ -0,0 +1,23 @@
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
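// GetPids is a stub on non-Linux platforms; it always returns no pids and no error.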
func GetPids(cgroupPath string) ([]int, error) {
return nil, nil
}