build: move e2e dependencies into e2e/go.mod

Several packages are only used while running the e2e suite. These
packages are less important to update, as they cannot influence the
final executable that is part of the Ceph-CSI container image.

By moving these dependencies out of the main Ceph-CSI go.mod, it is
easier to identify if a reported CVE affects Ceph-CSI, or only the
testing (like most of the Kubernetes CVEs).

Signed-off-by: Niels de Vos <ndevos@ibm.com>
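
As a rough sketch of the approach (the module path, Go version, and dependency versions below are illustrative assumptions, not taken from this commit), the dedicated e2e module could look like:

module github.com/ceph/ceph-csi/e2e

go 1.23

require (
	github.com/onsi/ginkgo/v2 v2.22.0
	github.com/onsi/gomega v1.36.0
	k8s.io/api v0.32.0
	k8s.io/apimachinery v0.32.0
	k8s.io/client-go v0.32.0
	k8s.io/kubernetes v1.32.0
)

With this layout, building the driver resolves only the main go.mod, so a CVE reported against a module that appears solely in e2e/go.mod can be ruled out for the shipped binary.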
Author: Niels de Vos
Date: 2025-03-04 08:57:28 +01:00
Committed by: mergify[bot]
Parent: 15da101b1b
Commit: bec6090996
8047 changed files with 1,407,827 additions and 3,453 deletions


@@ -0,0 +1,11 @@
---
dir: testing
filename: "mock_{{.InterfaceName | snakecase}}.go"
boilerplate-file: ../../../hack/boilerplate/boilerplate.generatego.txt
outpkg: testing
with-expecter: true
packages:
k8s.io/kubernetes/pkg/kubelet/cm:
interfaces:
ContainerManager:
PodContainerManager:
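
The config above generates expecter-style mocks for the two listed kubelet interfaces into a `testing` sub-package. A test could then use them roughly as follows (a sketch: the `NewMockContainerManager` constructor and `EXPECT()` helpers are the shape mockery's `with-expecter` mode typically produces, and the import path is assumed):

package cm_test

import (
	"testing"

	cmtesting "k8s.io/kubernetes/pkg/kubelet/cm/testing" // generated package (path assumed)
)

func TestPodCgroupRoot(t *testing.T) {
	// the generated mock fails the test on unexpected calls
	mgr := cmtesting.NewMockContainerManager(t)
	mgr.EXPECT().GetPodCgroupRoot().Return("/kubepods")

	if got := mgr.GetPodCgroupRoot(); got != "/kubepods" {
		t.Fatalf("unexpected pod cgroup root: %q", got)
	}
}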

e2e/vendor/k8s.io/kubernetes/pkg/kubelet/cm/OWNERS (generated, vendored, new file, 13 lines)

@@ -0,0 +1,13 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- Random-Liu
- dchen1107
- derekwaynecarr
- yujuhong
- klueska
reviewers:
- sig-node-reviewers
emeritus_approvers:
- ConnorDoyle
- vishh


@@ -0,0 +1,62 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package admission
import (
"errors"
"fmt"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)
const (
ErrorReasonUnexpected = "UnexpectedAdmissionError"
)
type Error interface {
Error() string
Type() string
}
type unexpectedAdmissionError struct{ Err error }
var _ Error = (*unexpectedAdmissionError)(nil)
func (e *unexpectedAdmissionError) Error() string {
return fmt.Sprintf("Allocate failed due to %v, which is unexpected", e.Err)
}
func (e *unexpectedAdmissionError) Type() string {
return ErrorReasonUnexpected
}
func GetPodAdmitResult(err error) lifecycle.PodAdmitResult {
if err == nil {
return lifecycle.PodAdmitResult{Admit: true}
}
var admissionErr Error
if !errors.As(err, &admissionErr) {
admissionErr = &unexpectedAdmissionError{err}
}
return lifecycle.PodAdmitResult{
Message: admissionErr.Error(),
Reason: admissionErr.Type(),
Admit: false,
}
}
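
A short sketch of how callers consume this helper: a nil error admits the pod, while a plain error that does not implement the admission Error interface is wrapped and reported with the UnexpectedAdmissionError reason (self-contained example, assuming the vendored import path shown above):

package main

import (
	"errors"
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/admission"
)

func main() {
	// nil error: the pod is admitted
	fmt.Println(admission.GetPodAdmitResult(nil).Admit) // true

	// a plain error does not implement admission.Error, so it is wrapped
	res := admission.GetPodAdmitResult(errors.New("device busy"))
	fmt.Println(res.Admit, res.Reason) // false UnexpectedAdmissionError
}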


@@ -0,0 +1,485 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"os"
"path"
"path/filepath"
"strings"
"sync"
"time"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
libcontainercgroupmanager "github.com/opencontainers/runc/libcontainer/cgroups/manager"
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
const (
// systemdSuffix is the cgroup name suffix for systemd
systemdSuffix string = ".slice"
// Cgroup2MemoryMin is memory.min for cgroup v2
Cgroup2MemoryMin string = "memory.min"
// Cgroup2MemoryHigh is memory.high for cgroup v2
Cgroup2MemoryHigh string = "memory.high"
Cgroup2MaxCpuLimit string = "max"
Cgroup2MaxSwapFilename string = "memory.swap.max"
)
var RootCgroupName = CgroupName([]string{})
// NewCgroupName composes a new cgroup name.
// Use RootCgroupName as base to start at the root.
// This function does some basic checks for invalid characters in the name.
func NewCgroupName(base CgroupName, components ...string) CgroupName {
for _, component := range components {
// Forbid using "_" in internal names. When remapping internal
// names to systemd cgroup driver, we want to remap "-" => "_",
// so we forbid "_" so that we can always reverse the mapping.
if strings.Contains(component, "/") || strings.Contains(component, "_") {
panic(fmt.Errorf("invalid character in component [%q] of CgroupName", component))
}
}
return CgroupName(append(append([]string{}, base...), components...))
}
func escapeSystemdCgroupName(part string) string {
return strings.Replace(part, "-", "_", -1)
}
func unescapeSystemdCgroupName(part string) string {
return strings.Replace(part, "_", "-", -1)
}
// cgroupName.ToSystemd converts the internal cgroup name to a systemd name.
// For example, the name {"kubepods", "burstable", "pod1234-abcd-5678-efgh"} becomes
// "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd_5678_efgh.slice"
// This function always expands the systemd name into the cgroupfs form. If only
// the last part is needed, use path.Base(...) on it to discard the rest.
func (cgroupName CgroupName) ToSystemd() string {
if len(cgroupName) == 0 || (len(cgroupName) == 1 && cgroupName[0] == "") {
return "/"
}
newparts := []string{}
for _, part := range cgroupName {
part = escapeSystemdCgroupName(part)
newparts = append(newparts, part)
}
result, err := cgroupsystemd.ExpandSlice(strings.Join(newparts, "-") + systemdSuffix)
if err != nil {
// Should never happen...
panic(fmt.Errorf("error converting cgroup name [%v] to systemd format: %v", cgroupName, err))
}
return result
}
func ParseSystemdToCgroupName(name string) CgroupName {
driverName := path.Base(name)
driverName = strings.TrimSuffix(driverName, systemdSuffix)
parts := strings.Split(driverName, "-")
result := []string{}
for _, part := range parts {
result = append(result, unescapeSystemdCgroupName(part))
}
return CgroupName(result)
}
func (cgroupName CgroupName) ToCgroupfs() string {
return "/" + path.Join(cgroupName...)
}
func ParseCgroupfsToCgroupName(name string) CgroupName {
components := strings.Split(strings.TrimPrefix(name, "/"), "/")
if len(components) == 1 && components[0] == "" {
components = []string{}
}
return CgroupName(components)
}
func IsSystemdStyleName(name string) bool {
return strings.HasSuffix(name, systemdSuffix)
}
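// Example (illustrative sketch, not part of the upstream file): a pod-level
// cgroup name round-trips between the internal, cgroupfs and systemd forms:
//
//	name := NewCgroupName(RootCgroupName, "kubepods", "burstable", "pod1234-abcd")
//	name.ToCgroupfs() // "/kubepods/burstable/pod1234-abcd"
//	name.ToSystemd()  // "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd.slice"
//	ParseSystemdToCgroupName("kubepods-burstable-pod1234_abcd.slice")
//	// -> ["kubepods", "burstable", "pod1234-abcd"]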
// CgroupSubsystems holds information about the mounted cgroup subsystems
type CgroupSubsystems struct {
// Cgroup subsystem mounts.
// e.g.: "/sys/fs/cgroup/cpu" -> ["cpu", "cpuacct"]
Mounts []libcontainercgroups.Mount
// Cgroup subsystem to their mount location.
// e.g.: "cpu" -> "/sys/fs/cgroup/cpu"
MountPoints map[string]string
}
// cgroupCommon implements common tasks
// that are valid for both cgroup v1 and v2.
// This prevents duplicating the code between
// v1 and v2 specific implementations.
type cgroupCommon struct {
// subsystems holds information about all the
// mounted cgroup subsystems on the node
subsystems *CgroupSubsystems
// useSystemd tells if systemd cgroup manager should be used.
useSystemd bool
}
// Make sure that cgroupV1impl and cgroupV2impl implement the CgroupManager interface
var _ CgroupManager = &cgroupV1impl{}
var _ CgroupManager = &cgroupV2impl{}
// NewCgroupManager is a factory method that returns a CgroupManager
func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
if libcontainercgroups.IsCgroup2UnifiedMode() {
return NewCgroupV2Manager(cs, cgroupDriver)
}
return NewCgroupV1Manager(cs, cgroupDriver)
}
func newCgroupCommon(cs *CgroupSubsystems, cgroupDriver string) cgroupCommon {
return cgroupCommon{
subsystems: cs,
useSystemd: cgroupDriver == "systemd",
}
}
// Name converts the cgroup to the driver specific value in cgroupfs form.
// This always returns a valid cgroupfs path even when systemd driver is in use!
func (m *cgroupCommon) Name(name CgroupName) string {
if m.useSystemd {
return name.ToSystemd()
}
return name.ToCgroupfs()
}
// CgroupName converts the literal cgroupfs name on the host to an internal identifier.
func (m *cgroupCommon) CgroupName(name string) CgroupName {
if m.useSystemd {
return ParseSystemdToCgroupName(name)
}
return ParseCgroupfsToCgroupName(name)
}
// buildCgroupPaths builds a path to each cgroup subsystem for the specified name.
func (m *cgroupCommon) buildCgroupPaths(name CgroupName) map[string]string {
cgroupFsAdaptedName := m.Name(name)
cgroupPaths := make(map[string]string, len(m.subsystems.MountPoints))
for key, val := range m.subsystems.MountPoints {
cgroupPaths[key] = path.Join(val, cgroupFsAdaptedName)
}
return cgroupPaths
}
// libctCgroupConfig converts CgroupConfig to libcontainer's Cgroup config.
func (m *cgroupCommon) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainerconfigs.Cgroup {
config := &libcontainerconfigs.Cgroup{
Systemd: m.useSystemd,
}
if needResources {
config.Resources = m.toResources(in.ResourceParameters)
} else {
config.Resources = &libcontainerconfigs.Resources{}
}
if !config.Systemd {
// For fs cgroup manager, we can either set Path or Name and Parent.
// Setting Path is easier.
config.Path = in.Name.ToCgroupfs()
return config
}
// For systemd, we have to set Name and Parent, as they are needed to talk to systemd.
// Setting Path is optional as it can be deduced from Name and Parent.
// TODO(filbranden): This logic belongs in libcontainer/cgroup/systemd instead.
// It should take a libcontainerconfigs.Cgroup.Path field (rather than Name and Parent)
// and split it appropriately, using essentially the logic below.
// This was done for cgroupfs in opencontainers/runc#497 but a counterpart
// for systemd was never introduced.
dir, base := path.Split(in.Name.ToSystemd())
if dir == "/" {
dir = "-.slice"
} else {
dir = path.Base(dir)
}
config.Parent = dir
config.Name = base
return config
}
// Destroy destroys the specified cgroup
func (m *cgroupCommon) Destroy(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerDuration.WithLabelValues("destroy").Observe(metrics.SinceInSeconds(start))
}()
libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, false)
manager, err := libcontainercgroupmanager.New(libcontainerCgroupConfig)
if err != nil {
return err
}
// Delete cgroups using libcontainers Managers Destroy() method
if err = manager.Destroy(); err != nil {
return fmt.Errorf("unable to destroy cgroup paths for cgroup %v : %v", cgroupConfig.Name, err)
}
return nil
}
func (m *cgroupCommon) SetCgroupConfig(name CgroupName, resourceConfig *ResourceConfig) error {
containerConfig := &CgroupConfig{
Name: name,
ResourceParameters: resourceConfig,
}
return m.Update(containerConfig)
}
// getCPUWeight converts from the range [2, 262144] to [1, 10000]
func getCPUWeight(cpuShares *uint64) uint64 {
if cpuShares == nil {
return 0
}
if *cpuShares >= 262144 {
return 10000
}
return 1 + ((*cpuShares-2)*9999)/262142
}
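// Worked values for the mapping above (derived from the formula; shown as
// comments because getCPUWeight is unexported):
//
//	shares 2      -> weight 1     (cgroup v1 minimum maps to the v2 minimum)
//	shares 1024   -> weight 39    (the default shares for one CPU request)
//	shares 262144 -> weight 10000 (clamped to the cgroup v2 maximum)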
var (
availableRootControllersOnce sync.Once
availableRootControllers sets.Set[string]
)
func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
resources := &libcontainerconfigs.Resources{
SkipDevices: true,
SkipFreezeOnSet: true,
}
if resourceConfig == nil {
return resources
}
if resourceConfig.Memory != nil {
resources.Memory = *resourceConfig.Memory
}
if resourceConfig.CPUShares != nil {
if libcontainercgroups.IsCgroup2UnifiedMode() {
resources.CpuWeight = getCPUWeight(resourceConfig.CPUShares)
} else {
resources.CpuShares = *resourceConfig.CPUShares
}
}
if resourceConfig.CPUQuota != nil {
resources.CpuQuota = *resourceConfig.CPUQuota
}
if resourceConfig.CPUPeriod != nil {
resources.CpuPeriod = *resourceConfig.CPUPeriod
}
if resourceConfig.PidsLimit != nil {
resources.PidsLimit = *resourceConfig.PidsLimit
}
if !resourceConfig.CPUSet.IsEmpty() {
resources.CpusetCpus = resourceConfig.CPUSet.String()
}
m.maybeSetHugetlb(resourceConfig, resources)
// Ideally unified is used for all the resources when running on cgroup v2.
// It doesn't make a difference for the memory.max limit, but for e.g. the cpu controller
// you can specify the correct setting without relying on the conversions performed by the OCI runtime.
if resourceConfig.Unified != nil && libcontainercgroups.IsCgroup2UnifiedMode() {
resources.Unified = make(map[string]string)
for k, v := range resourceConfig.Unified {
resources.Unified[k] = v
}
}
return resources
}
func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainerconfigs.Resources) {
// Check if hugetlb is supported.
if libcontainercgroups.IsCgroup2UnifiedMode() {
if !getSupportedUnifiedControllers().Has("hugetlb") {
klog.V(6).InfoS("Optional subsystem not supported: hugetlb")
return
}
} else if _, ok := m.subsystems.MountPoints["hugetlb"]; !ok {
klog.V(6).InfoS("Optional subsystem not supported: hugetlb")
return
}
// For each page size enumerated, set that value.
pageSizes := sets.New[string]()
for pageSize, limit := range resourceConfig.HugePageLimit {
sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize)
if err != nil {
klog.InfoS("Invalid pageSize", "err", err)
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: sizeString,
Limit: uint64(limit),
})
pageSizes.Insert(sizeString)
}
// for each page size omitted, limit to 0
for _, pageSize := range libcontainercgroups.HugePageSizes() {
if pageSizes.Has(pageSize) {
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: pageSize,
Limit: uint64(0),
})
}
}
// Update updates the cgroup with the specified Cgroup Configuration
func (m *cgroupCommon) Update(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerDuration.WithLabelValues("update").Observe(metrics.SinceInSeconds(start))
}()
libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, true)
manager, err := libcontainercgroupmanager.New(libcontainerCgroupConfig)
if err != nil {
return fmt.Errorf("failed to create cgroup manager: %v", err)
}
return manager.Set(libcontainerCgroupConfig.Resources)
}
// Create creates the specified cgroup
func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerDuration.WithLabelValues("create").Observe(metrics.SinceInSeconds(start))
}()
libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, true)
manager, err := libcontainercgroupmanager.New(libcontainerCgroupConfig)
if err != nil {
return err
}
// Apply(-1) is a hack to create the cgroup directories for each resource
// subsystem. The function [cgroups.Manager.apply()] applies cgroup
// configuration to the process with the specified pid.
// It creates cgroup files for each subsystem and writes the pid
// in the tasks file. We use the function to create all the required
// cgroup files but not attach any "real" pid to the cgroup.
if err := manager.Apply(-1); err != nil {
return err
}
// It may seem odd that we call Set after Apply, but runc follows a similar
// pattern; the extra Set is needed to ensure the cpu quota is set properly.
if err := manager.Set(libcontainerCgroupConfig.Resources); err != nil {
utilruntime.HandleError(fmt.Errorf("cgroup manager.Set failed: %w", err))
}
return nil
}
// Scans through all subsystems to find pids associated with specified cgroup.
func (m *cgroupCommon) Pids(name CgroupName) []int {
// we need the driver specific name
cgroupFsName := m.Name(name)
// Get a list of processes that we need to kill
pidsToKill := sets.New[int]()
var pids []int
for _, val := range m.subsystems.MountPoints {
dir := path.Join(val, cgroupFsName)
_, err := os.Stat(dir)
if os.IsNotExist(err) {
// The subsystem pod cgroup is already deleted
// do nothing, continue
continue
}
// Get a list of pids that are still charged to the pod's cgroup
pids, err = getCgroupProcs(dir)
if err != nil {
continue
}
pidsToKill.Insert(pids...)
// WalkFunc which is called for each file and directory in the pod cgroup dir
visitor := func(path string, info os.FileInfo, err error) error {
if err != nil {
klog.V(4).InfoS("Cgroup manager encountered error scanning cgroup path", "path", path, "err", err)
return filepath.SkipDir
}
if !info.IsDir() {
return nil
}
pids, err = getCgroupProcs(path)
if err != nil {
klog.V(4).InfoS("Cgroup manager encountered error getting procs for cgroup path", "path", path, "err", err)
return filepath.SkipDir
}
pidsToKill.Insert(pids...)
return nil
}
// Walk through the pod cgroup directory to check if
// container cgroups haven't been GCed yet. Get attached processes to
// all such unwanted containers under the pod cgroup
if err = filepath.Walk(dir, visitor); err != nil {
klog.V(4).InfoS("Cgroup manager encountered error scanning pids for directory", "path", dir, "err", err)
}
}
return sets.List(pidsToKill)
}
// ReduceCPULimits reduces the cgroup's cpu shares to the lowest possible value
func (m *cgroupCommon) ReduceCPULimits(cgroupName CgroupName) error {
// Set lowest possible CpuShares value for the cgroup
minimumCPUShares := uint64(MinShares)
resources := &ResourceConfig{
CPUShares: &minimumCPUShares,
}
containerConfig := &CgroupConfig{
Name: cgroupName,
ResourceParameters: resources,
}
return m.Update(containerConfig)
}
func readCgroupMemoryConfig(cgroupPath string, memLimitFile string) (*ResourceConfig, error) {
memLimit, err := fscommon.GetCgroupParamUint(cgroupPath, memLimitFile)
if err != nil {
return nil, fmt.Errorf("failed to read %s for cgroup %v: %v", memLimitFile, cgroupPath, err)
}
mLim := int64(memLimit)
//TODO(vinaykul,InPlacePodVerticalScaling): Add memory request support
return &ResourceConfig{Memory: &mLim}, nil
}


@@ -0,0 +1,120 @@
//go:build !linux
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"errors"
v1 "k8s.io/api/core/v1"
)
type unsupportedCgroupManager struct{}
var errNotSupported = errors.New("Cgroup Manager is not supported in this build")
// Make sure that unsupportedCgroupManager implements the CgroupManager interface
var _ CgroupManager = &unsupportedCgroupManager{}
type CgroupSubsystems struct {
Mounts []interface{}
MountPoints map[string]string
}
func NewCgroupManager(_ interface{}) CgroupManager {
return &unsupportedCgroupManager{}
}
func (m *unsupportedCgroupManager) Version() int {
return 0
}
func (m *unsupportedCgroupManager) Name(_ CgroupName) string {
return ""
}
func (m *unsupportedCgroupManager) Validate(_ CgroupName) error {
return errNotSupported
}
func (m *unsupportedCgroupManager) Exists(_ CgroupName) bool {
return false
}
func (m *unsupportedCgroupManager) Destroy(_ *CgroupConfig) error {
return nil
}
func (m *unsupportedCgroupManager) Update(_ *CgroupConfig) error {
return nil
}
func (m *unsupportedCgroupManager) Create(_ *CgroupConfig) error {
return errNotSupported
}
func (m *unsupportedCgroupManager) MemoryUsage(_ CgroupName) (int64, error) {
return -1, errNotSupported
}
func (m *unsupportedCgroupManager) Pids(_ CgroupName) []int {
return nil
}
func (m *unsupportedCgroupManager) CgroupName(name string) CgroupName {
return CgroupName([]string{})
}
func (m *unsupportedCgroupManager) ReduceCPULimits(cgroupName CgroupName) error {
return nil
}
func (m *unsupportedCgroupManager) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) {
return nil, errNotSupported
}
func (m *unsupportedCgroupManager) SetCgroupConfig(name CgroupName, resourceConfig *ResourceConfig) error {
return errNotSupported
}
var RootCgroupName = CgroupName([]string{})
func NewCgroupName(base CgroupName, components ...string) CgroupName {
return append(append([]string{}, base...), components...)
}
func (cgroupName CgroupName) ToSystemd() string {
return ""
}
func ParseSystemdToCgroupName(name string) CgroupName {
return nil
}
func (cgroupName CgroupName) ToCgroupfs() string {
return ""
}
func ParseCgroupfsToCgroupName(name string) CgroupName {
return nil
}
func IsSystemdStyleName(name string) bool {
return false
}


@@ -0,0 +1,145 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"errors"
"fmt"
"strconv"
"strings"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
)
const cgroupv1MemLimitFile string = "memory.limit_in_bytes"
// cgroupV1impl implements the CgroupManager interface
// for cgroup v1.
// It's a stateless object which can be used to
// update, create or delete any number of cgroups
// It relies on runc/libcontainer cgroup managers.
type cgroupV1impl struct {
cgroupCommon
}
func NewCgroupV1Manager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
return &cgroupV1impl{
cgroupCommon: newCgroupCommon(cs, cgroupDriver),
}
}
// Version of the cgroup implementation on the host
func (c *cgroupV1impl) Version() int {
return 1
}
// Validate checks if all subsystem cgroups are valid
func (c *cgroupV1impl) Validate(name CgroupName) error {
// Get map of all cgroup paths on the system for the particular cgroup
cgroupPaths := c.buildCgroupPaths(name)
// the presence of alternative control groups not known to runc confuses
// the kubelet existence checks.
// ideally, we would have a mechanism in runc to support Exists() logic
// scoped to the set control groups it understands. this is being discussed
// in https://github.com/opencontainers/runc/issues/1440
// once resolved, we can remove this code.
allowlistControllers := sets.New[string]("cpu", "cpuacct", "cpuset", "memory", "systemd", "pids")
if _, ok := c.subsystems.MountPoints["hugetlb"]; ok {
allowlistControllers.Insert("hugetlb")
}
var missingPaths []string
// If even one cgroup path doesn't exist, then the cgroup doesn't exist.
for controller, path := range cgroupPaths {
// ignore mounts we don't care about
if !allowlistControllers.Has(controller) {
continue
}
if !libcontainercgroups.PathExists(path) {
missingPaths = append(missingPaths, path)
}
}
if len(missingPaths) > 0 {
return fmt.Errorf("cgroup %q has some missing paths: %v", name, strings.Join(missingPaths, ", "))
}
return nil
}
// Exists checks if all subsystem cgroups already exist
func (c *cgroupV1impl) Exists(name CgroupName) bool {
return c.Validate(name) == nil
}
// MemoryUsage returns the current memory usage of the specified cgroup,
// as read from cgroupfs.
func (c *cgroupV1impl) MemoryUsage(name CgroupName) (int64, error) {
var path, file string
mp, ok := c.subsystems.MountPoints["memory"]
if !ok { // should not happen
return -1, errors.New("no cgroup v1 mountpoint for memory controller found")
}
path = mp + "/" + c.Name(name)
file = "memory.usage_in_bytes"
val, err := fscommon.GetCgroupParamUint(path, file)
return int64(val), err
}
// Get the resource config values applied to the cgroup for specified resource type
func (c *cgroupV1impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) {
cgroupPaths := c.buildCgroupPaths(name)
cgroupResourcePath, found := cgroupPaths[string(resource)]
if !found {
return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name)
}
switch resource {
case v1.ResourceCPU:
return c.getCgroupCPUConfig(cgroupResourcePath)
case v1.ResourceMemory:
return c.getCgroupMemoryConfig(cgroupResourcePath)
}
return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name)
}
func (c *cgroupV1impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) {
cpuQuotaStr, errQ := fscommon.GetCgroupParamString(cgroupPath, "cpu.cfs_quota_us")
if errQ != nil {
return nil, fmt.Errorf("failed to read CPU quota for cgroup %v: %w", cgroupPath, errQ)
}
cpuQuota, errInt := strconv.ParseInt(cpuQuotaStr, 10, 64)
if errInt != nil {
return nil, fmt.Errorf("failed to convert CPU quota as integer for cgroup %v: %w", cgroupPath, errInt)
}
cpuPeriod, errP := fscommon.GetCgroupParamUint(cgroupPath, "cpu.cfs_period_us")
if errP != nil {
return nil, fmt.Errorf("failed to read CPU period for cgroup %v: %w", cgroupPath, errP)
}
cpuShares, errS := fscommon.GetCgroupParamUint(cgroupPath, "cpu.shares")
if errS != nil {
return nil, fmt.Errorf("failed to read CPU shares for cgroup %v: %w", cgroupPath, errS)
}
return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuQuota, CPUPeriod: &cpuPeriod}, nil
}
func (c *cgroupV1impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
return readCgroupMemoryConfig(cgroupPath, cgroupv1MemLimitFile)
}


@@ -0,0 +1,177 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
)
const cgroupv2MemLimitFile string = "memory.max"
// cgroupV2impl implements the CgroupManager interface
// for cgroup v2.
// It's a stateless object which can be used to
// update, create or delete any number of cgroups
// It relies on runc/libcontainer cgroup managers.
type cgroupV2impl struct {
cgroupCommon
}
func NewCgroupV2Manager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
return &cgroupV2impl{
cgroupCommon: newCgroupCommon(cs, cgroupDriver),
}
}
// Version of the cgroup implementation on the host
func (c *cgroupV2impl) Version() int {
return 2
}
// Validate checks if all subsystem cgroups are valid
func (c *cgroupV2impl) Validate(name CgroupName) error {
cgroupPath := c.buildCgroupUnifiedPath(name)
neededControllers := getSupportedUnifiedControllers()
enabledControllers, err := readUnifiedControllers(cgroupPath)
if err != nil {
return fmt.Errorf("could not read controllers for cgroup %q: %w", name, err)
}
difference := neededControllers.Difference(enabledControllers)
if difference.Len() > 0 {
return fmt.Errorf("cgroup %q has some missing controllers: %v", name, strings.Join(sets.List(difference), ", "))
}
return nil
}
// Exists checks if all subsystem cgroups already exist
func (c *cgroupV2impl) Exists(name CgroupName) bool {
return c.Validate(name) == nil
}
// MemoryUsage returns the current memory usage of the specified cgroup,
// as read from cgroupfs.
func (c *cgroupV2impl) MemoryUsage(name CgroupName) (int64, error) {
var path, file string
path = c.buildCgroupUnifiedPath(name)
file = "memory.current"
val, err := fscommon.GetCgroupParamUint(path, file)
return int64(val), err
}
// Get the resource config values applied to the cgroup for specified resource type
func (c *cgroupV2impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) {
cgroupPaths := c.buildCgroupPaths(name)
cgroupResourcePath, found := cgroupPaths[string(resource)]
if !found {
return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name)
}
switch resource {
case v1.ResourceCPU:
return c.getCgroupCPUConfig(cgroupResourcePath)
case v1.ResourceMemory:
return c.getCgroupMemoryConfig(cgroupResourcePath)
}
return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name)
}
func (c *cgroupV2impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) {
var cpuLimitStr, cpuPeriodStr string
cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, "cpu.max")
if err != nil {
return nil, fmt.Errorf("failed to read cpu.max file for cgroup %v: %w", cgroupPath, err)
}
numItems, errScan := fmt.Sscanf(cpuLimitAndPeriod, "%s %s", &cpuLimitStr, &cpuPeriodStr)
if errScan != nil || numItems != 2 {
return nil, fmt.Errorf("failed to correctly parse content of cpu.max file ('%s') for cgroup %v: %w",
cpuLimitAndPeriod, cgroupPath, errScan)
}
cpuLimit := int64(-1)
if cpuLimitStr != Cgroup2MaxCpuLimit {
cpuLimit, err = strconv.ParseInt(cpuLimitStr, 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to convert CPU limit as integer for cgroup %v: %w", cgroupPath, err)
}
}
cpuPeriod, errPeriod := strconv.ParseUint(cpuPeriodStr, 10, 64)
if errPeriod != nil {
return nil, fmt.Errorf("failed to convert CPU period as integer for cgroup %v: %w", cgroupPath, errPeriod)
}
cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, "cpu.weight")
if errWeight != nil {
return nil, fmt.Errorf("failed to read CPU weight for cgroup %v: %w", cgroupPath, errWeight)
}
cpuShares := cpuWeightToCPUShares(cpuWeight)
return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuLimit, CPUPeriod: &cpuPeriod}, nil
}
func (c *cgroupV2impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
return readCgroupMemoryConfig(cgroupPath, cgroupv2MemLimitFile)
}
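// For reference (values follow getCgroupCPUConfig above): cpu.max holds
// "<limit> <period>", with the literal "max" meaning no limit:
//
//	cpu.max "50000 100000" -> CPUQuota=50000, CPUPeriod=100000 (half a CPU)
//	cpu.max "max 100000"   -> CPUQuota=-1 (unlimited), CPUPeriod=100000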
// getSupportedUnifiedControllers returns a set of supported controllers when running on cgroup v2
func getSupportedUnifiedControllers() sets.Set[string] {
// This is the set of controllers used by the Kubelet
supportedControllers := sets.New("cpu", "cpuset", "memory", "hugetlb", "pids")
// Memoize the set of controllers that are present in the root cgroup
availableRootControllersOnce.Do(func() {
var err error
availableRootControllers, err = readUnifiedControllers(cmutil.CgroupRoot)
if err != nil {
panic(fmt.Errorf("cannot read cgroup controllers at %s", cmutil.CgroupRoot))
}
})
// Return the set of controllers that are supported both by the Kubelet and by the kernel
return supportedControllers.Intersection(availableRootControllers)
}
// readUnifiedControllers reads the controllers available at the specified cgroup
func readUnifiedControllers(path string) (sets.Set[string], error) {
controllersFileContent, err := os.ReadFile(filepath.Join(path, "cgroup.controllers"))
if err != nil {
return nil, err
}
controllers := strings.Fields(string(controllersFileContent))
return sets.New(controllers...), nil
}
// buildCgroupUnifiedPath builds a path to the specified name.
func (c *cgroupV2impl) buildCgroupUnifiedPath(name CgroupName) string {
cgroupFsAdaptedName := c.Name(name)
return path.Join(cmutil.CgroupRoot, cgroupFsAdaptedName)
}
// Convert cgroup v1 cpu.shares value to cgroup v2 cpu.weight
// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2
func cpuSharesToCPUWeight(cpuShares uint64) uint64 {
return uint64((((cpuShares - 2) * 9999) / 262142) + 1)
}
// Convert cgroup v2 cpu.weight value to cgroup v1 cpu.shares
// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2
func cpuWeightToCPUShares(cpuWeight uint64) uint64 {
return uint64((((cpuWeight - 1) * 262142) / 9999) + 2)
}
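
The two conversions above are only approximate inverses: the integer division discards remainders, so mid-range values do not round-trip exactly, while the boundary values do. A few values computed from the formulas:

cpuSharesToCPUWeight(1024)   == 39
cpuWeightToCPUShares(39)     == 998    // not 1024: the mapping is lossy
cpuSharesToCPUWeight(2)      == 1      // minimum round-trips exactly (weight 1 -> shares 2)
cpuSharesToCPUWeight(262144) == 10000  // maximum round-trips exactly (weight 10000 -> shares 262144)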


@@ -0,0 +1,283 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
//go:generate mockery
package cm
import (
"context"
"fmt"
"strconv"
"strings"
"time"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
// TODO: Migrate kubelet to either use its own internal objects or client library.
v1 "k8s.io/api/core/v1"
"k8s.io/apiserver/pkg/server/healthz"
internalapi "k8s.io/cri-api/pkg/apis"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/status"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/utils/cpuset"
)
const (
// Warning message for users still using cgroup v1
CgroupV1MaintenanceModeWarning = "cgroup v1 support is in maintenance mode, please migrate to cgroup v2"
// Warning message for users running cgroup v2 on a kernel that doesn't support root `cpu.stat`.
// `cpu.stat` was added to the root cgroup in kernel 5.8.
// (ref: https://github.com/torvalds/linux/commit/936f2a70f2077f64fab1dcb3eca71879e82ecd3f)
CgroupV2KernelWarning = "cgroup v2 is being used on a kernel, which doesn't support root `cpu.stat`. " +
"Kubelet will continue, but may experience instability or wrong behavior"
)
type ActivePodsFunc func() []*v1.Pod
type GetNodeFunc func() (*v1.Node, error)
// Manages the containers running on a machine.
type ContainerManager interface {
// Runs the container manager's housekeeping.
// - Ensures that the Docker daemon is in a container.
// - Creates the system container where all non-containerized processes run.
Start(context.Context, *v1.Node, ActivePodsFunc, GetNodeFunc, config.SourcesReady, status.PodStatusProvider, internalapi.RuntimeService, bool) error
// SystemCgroupsLimit returns resources allocated to system cgroups in the machine.
// These cgroups include the system and Kubernetes services.
SystemCgroupsLimit() v1.ResourceList
// GetNodeConfig returns a NodeConfig that is being used by the container manager.
GetNodeConfig() NodeConfig
// Status returns internal Status.
Status() Status
// NewPodContainerManager is a factory method which returns a podContainerManager object
// Returns a noop implementation if qos cgroup hierarchy is not enabled
NewPodContainerManager() PodContainerManager
// GetMountedSubsystems returns the mounted cgroup subsystems on the node
GetMountedSubsystems() *CgroupSubsystems
// GetQOSContainersInfo returns the names of top level QoS containers
GetQOSContainersInfo() QOSContainersInfo
// GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved from scheduling.
GetNodeAllocatableReservation() v1.ResourceList
// GetCapacity returns the amount of compute resources tracked by container manager available on the node.
GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList
// GetDevicePluginResourceCapacity returns the node capacity (amount of total device plugin resources),
// node allocatable (amount of total healthy resources reported by device plugin),
// and inactive device plugin resources previously registered on the node.
GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string)
// UpdateQOSCgroups performs housekeeping updates to ensure that the top
// level QoS containers have their desired state in a thread-safe way
UpdateQOSCgroups() error
// GetResources returns RunContainerOptions with devices, mounts, and env fields populated for
// extended resources required by container.
GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error)
// UpdatePluginResources calls Allocate of device plugin handler for potential
// requests for device plugin resources, and returns an error if fails.
// Otherwise, it updates allocatableResource in nodeInfo if necessary,
// to make sure it is at least equal to the pod's requested capacity for
// any registered device plugin resource
UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error
InternalContainerLifecycle() InternalContainerLifecycle
// GetPodCgroupRoot returns the cgroup which contains all pods.
GetPodCgroupRoot() string
// GetPluginRegistrationHandlers returns a set of plugin registration handlers
// The pluginwatcher's Handlers allow to have a single module for handling
// registration.
GetPluginRegistrationHandlers() map[string]cache.PluginHandler
// GetHealthCheckers returns a set of health checkers for all plugins.
// These checkers are integrated into the systemd watchdog to monitor the service's health.
GetHealthCheckers() []healthz.HealthChecker
// ShouldResetExtendedResourceCapacity returns whether or not the extended resources should be zeroed,
// due to node recreation.
ShouldResetExtendedResourceCapacity() bool
// GetAllocateResourcesPodAdmitHandler returns an instance of a PodAdmitHandler responsible for allocating pod resources.
GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler
// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
GetNodeAllocatableAbsolute() v1.ResourceList
// PrepareDynamicResource prepares dynamic pod resources
PrepareDynamicResources(context.Context, *v1.Pod) error
// UnprepareDynamicResources unprepares dynamic pod resources
UnprepareDynamicResources(context.Context, *v1.Pod) error
// PodMightNeedToUnprepareResources returns true if the pod with the given UID
// might need to unprepare resources.
PodMightNeedToUnprepareResources(UID types.UID) bool
// UpdateAllocatedResourcesStatus updates the status of allocated resources for the pod.
UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus)
// Updates returns a channel that receives an Update when the device changed its status.
Updates() <-chan resourceupdates.Update
// Implements the PodResources Provider API
podresources.CPUsProvider
podresources.DevicesProvider
podresources.MemoryProvider
podresources.DynamicResourcesProvider
}
type NodeConfig struct {
NodeName types.NodeName
RuntimeCgroupsName string
SystemCgroupsName string
KubeletCgroupsName string
KubeletOOMScoreAdj int32
ContainerRuntime string
CgroupsPerQOS bool
CgroupRoot string
CgroupDriver string
KubeletRootDir string
ProtectKernelDefaults bool
NodeAllocatableConfig
QOSReserved map[v1.ResourceName]int64
CPUManagerPolicy string
CPUManagerPolicyOptions map[string]string
TopologyManagerScope string
CPUManagerReconcilePeriod time.Duration
ExperimentalMemoryManagerPolicy string
ExperimentalMemoryManagerReservedMemory []kubeletconfig.MemoryReservation
PodPidsLimit int64
EnforceCPULimits bool
CPUCFSQuotaPeriod time.Duration
TopologyManagerPolicy string
TopologyManagerPolicyOptions map[string]string
CgroupVersion int
}
type NodeAllocatableConfig struct {
KubeReservedCgroupName string
SystemReservedCgroupName string
ReservedSystemCPUs cpuset.CPUSet
EnforceNodeAllocatable sets.Set[string]
KubeReserved v1.ResourceList
SystemReserved v1.ResourceList
HardEvictionThresholds []evictionapi.Threshold
}
type Status struct {
// Any soft requirements that were unsatisfied.
SoftRequirements error
}
func int64Slice(in []int) []int64 {
out := make([]int64, len(in))
for i := range in {
out[i] = int64(in[i])
}
return out
}
// parsePercentage parses the percentage string to numeric value.
func parsePercentage(v string) (int64, error) {
if !strings.HasSuffix(v, "%") {
return 0, fmt.Errorf("percentage expected, got '%s'", v)
}
percentage, err := strconv.ParseInt(strings.TrimRight(v, "%"), 10, 0)
if err != nil {
return 0, fmt.Errorf("invalid number in percentage '%s'", v)
}
if percentage < 0 || percentage > 100 {
return 0, fmt.Errorf("percentage must be between 0 and 100")
}
return percentage, nil
}
// ParseQOSReserved parses the --qos-reserved option
func ParseQOSReserved(m map[string]string) (*map[v1.ResourceName]int64, error) {
reservations := make(map[v1.ResourceName]int64)
for k, v := range m {
switch v1.ResourceName(k) {
// Only memory resources are supported.
case v1.ResourceMemory:
q, err := parsePercentage(v)
if err != nil {
return nil, fmt.Errorf("failed to parse percentage %q for %q resource: %w", v, k, err)
}
reservations[v1.ResourceName(k)] = q
default:
return nil, fmt.Errorf("cannot reserve %q resource", k)
}
}
return &reservations, nil
}
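// Example (illustrative sketch, not part of the upstream file): parsing the
// value of --qos-reserved=memory=50%:
//
//	reserved, err := ParseQOSReserved(map[string]string{"memory": "50%"})
//	// err == nil; (*reserved)[v1.ResourceMemory] == 50
//	// "cpu": "50%" would fail: only the memory resource is supported;
//	// "memory": "150%" would fail: the percentage must be within [0, 100]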
func containerDevicesFromResourceDeviceInstances(devs devicemanager.ResourceDeviceInstances) []*podresourcesapi.ContainerDevices {
var respDevs []*podresourcesapi.ContainerDevices
for resourceName, resourceDevs := range devs {
for devID, dev := range resourceDevs {
topo := dev.GetTopology()
if topo == nil {
// Some device plugins do not report topology information.
// This is legal, so we report the devices anyway and
// let the client decide what to do.
respDevs = append(respDevs, &podresourcesapi.ContainerDevices{
ResourceName: resourceName,
DeviceIds: []string{devID},
})
continue
}
for _, node := range topo.GetNodes() {
respDevs = append(respDevs, &podresourcesapi.ContainerDevices{
ResourceName: resourceName,
DeviceIds: []string{devID},
Topology: &podresourcesapi.TopologyInfo{
Nodes: []*podresourcesapi.NUMANode{
{
ID: node.GetID(),
},
},
},
})
}
}
}
return respDevs
}

File diff suppressed because it is too large


@@ -0,0 +1,211 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/server/healthz"
internalapi "k8s.io/cri-api/pkg/apis"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/status"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
type containerManagerStub struct {
shouldResetExtendedResourceCapacity bool
extendedPluginResources v1.ResourceList
}
var _ ContainerManager = &containerManagerStub{}
func (cm *containerManagerStub) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ GetNodeFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
klog.V(2).InfoS("Starting stub container manager")
return nil
}
func (cm *containerManagerStub) SystemCgroupsLimit() v1.ResourceList {
return v1.ResourceList{}
}
func (cm *containerManagerStub) GetNodeConfig() NodeConfig {
return NodeConfig{}
}
func (cm *containerManagerStub) GetMountedSubsystems() *CgroupSubsystems {
return &CgroupSubsystems{}
}
func (cm *containerManagerStub) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (cm *containerManagerStub) UpdateQOSCgroups() error {
return nil
}
func (cm *containerManagerStub) Status() Status {
return Status{}
}
func (cm *containerManagerStub) GetNodeAllocatableReservation() v1.ResourceList {
return nil
}
func (cm *containerManagerStub) GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList {
if !localStorageCapacityIsolation {
return v1.ResourceList{}
}
c := v1.ResourceList{
v1.ResourceEphemeralStorage: *resource.NewQuantity(
int64(0),
resource.BinarySI),
}
return c
}
func (cm *containerManagerStub) GetPluginRegistrationHandlers() map[string]cache.PluginHandler {
return nil
}
func (cm *containerManagerStub) GetHealthCheckers() []healthz.HealthChecker {
return []healthz.HealthChecker{}
}
func (cm *containerManagerStub) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
return cm.extendedPluginResources, cm.extendedPluginResources, []string{}
}
func (m *podContainerManagerStub) GetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName) (*ResourceConfig, error) {
return nil, fmt.Errorf("not implemented")
}
func (m *podContainerManagerStub) SetPodCgroupConfig(pod *v1.Pod, resourceConfig *ResourceConfig) error {
return fmt.Errorf("not implemented")
}
func (cm *containerManagerStub) NewPodContainerManager() PodContainerManager {
return &podContainerManagerStub{}
}
func (cm *containerManagerStub) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
return &kubecontainer.RunContainerOptions{}, nil
}
func (cm *containerManagerStub) UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error {
return nil
}
func (cm *containerManagerStub) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cpumanager.NewFakeManager(), memorymanager.NewFakeManager(), topologymanager.NewFakeManager()}
}
func (cm *containerManagerStub) GetPodCgroupRoot() string {
return ""
}
func (cm *containerManagerStub) GetDevices(_, _ string) []*podresourcesapi.ContainerDevices {
return nil
}
func (cm *containerManagerStub) GetAllocatableDevices() []*podresourcesapi.ContainerDevices {
return nil
}
func (cm *containerManagerStub) ShouldResetExtendedResourceCapacity() bool {
return cm.shouldResetExtendedResourceCapacity
}
func (cm *containerManagerStub) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {
return topologymanager.NewFakeManager()
}
func (cm *containerManagerStub) UpdateAllocatedDevices() {
return
}
func (cm *containerManagerStub) GetCPUs(_, _ string) []int64 {
return nil
}
func (cm *containerManagerStub) GetAllocatableCPUs() []int64 {
return nil
}
func (cm *containerManagerStub) GetMemory(_, _ string) []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerStub) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerStub) GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource {
return nil
}
func (cm *containerManagerStub) GetNodeAllocatableAbsolute() v1.ResourceList {
return nil
}
func (cm *containerManagerStub) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *containerManagerStub) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *containerManagerStub) PodMightNeedToUnprepareResources(UID types.UID) bool {
return false
}
func (cm *containerManagerStub) UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus) {
}
func (cm *containerManagerStub) Updates() <-chan resourceupdates.Update {
return nil
}
func NewStubContainerManager() ContainerManager {
return &containerManagerStub{shouldResetExtendedResourceCapacity: false}
}
func NewStubContainerManagerWithExtendedResource(shouldResetExtendedResourceCapacity bool) ContainerManager {
return &containerManagerStub{shouldResetExtendedResourceCapacity: shouldResetExtendedResourceCapacity}
}
func NewStubContainerManagerWithDevicePluginResource(extendedPluginResources v1.ResourceList) ContainerManager {
return &containerManagerStub{
shouldResetExtendedResourceCapacity: false,
extendedPluginResources: extendedPluginResources,
}
}


@@ -0,0 +1,49 @@
//go:build !linux && !windows
// +build !linux,!windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"context"
"fmt"
"k8s.io/mount-utils"
v1 "k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
internalapi "k8s.io/cri-api/pkg/apis"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
)
type unsupportedContainerManager struct {
containerManagerStub
}
var _ ContainerManager = &unsupportedContainerManager{}
func (unsupportedContainerManager) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ GetNodeFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
return fmt.Errorf("Container Manager is unsupported in this build")
}
func NewContainerManager(_ mount.Interface, _ cadvisor.Interface, _ NodeConfig, failSwapOn bool, recorder record.EventRecorder, kubeClient clientset.Interface) (ContainerManager, error) {
return &unsupportedContainerManager{}, nil
}


@@ -0,0 +1,371 @@
//go:build windows
// +build windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// containerManagerImpl implements container manager on Windows.
// Only GetNodeAllocatableReservation() and GetCapacity() are implemented now.
package cm
import (
"context"
"fmt"
"sync"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
"k8s.io/klog/v2"
"k8s.io/mount-utils"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/server/healthz"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
internalapi "k8s.io/cri-api/pkg/apis"
pluginwatcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/status"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
type containerManagerImpl struct {
// Capacity of this node.
capacity v1.ResourceList
// Interface for cadvisor.
cadvisorInterface cadvisor.Interface
// Config of this node.
nodeConfig NodeConfig
// Interface for exporting and allocating devices reported by device plugins.
deviceManager devicemanager.Manager
// Interface for Topology resource co-ordination
topologyManager topologymanager.Manager
cpuManager cpumanager.Manager
memoryManager memorymanager.Manager
nodeInfo *v1.Node
sync.RWMutex
}
func (cm *containerManagerImpl) Start(ctx context.Context, node *v1.Node,
activePods ActivePodsFunc,
getNode GetNodeFunc,
sourcesReady config.SourcesReady,
podStatusProvider status.PodStatusProvider,
runtimeService internalapi.RuntimeService,
localStorageCapacityIsolation bool) error {
klog.V(2).InfoS("Starting Windows container manager")
cm.nodeInfo = node
if localStorageCapacityIsolation {
rootfs, err := cm.cadvisorInterface.RootFsInfo()
if err != nil {
return fmt.Errorf("failed to get rootfs info: %v", err)
}
for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
cm.capacity[rName] = rCap
}
}
containerMap, containerRunningSet := buildContainerMapAndRunningSetFromRuntime(ctx, runtimeService)
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
err := cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap.Clone())
if err != nil {
return fmt.Errorf("start cpu manager error: %v", err)
}
// Initialize memory manager
err = cm.memoryManager.Start(memorymanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap.Clone())
if err != nil {
return fmt.Errorf("start memory manager error: %v", err)
}
}
// Starts device manager.
if err := cm.deviceManager.Start(devicemanager.ActivePodsFunc(activePods), sourcesReady, containerMap.Clone(), containerRunningSet); err != nil {
return err
}
return nil
}
// NewContainerManager creates windows container manager.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, recorder record.EventRecorder, kubeClient clientset.Interface) (ContainerManager, error) {
// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
// machine info is computed and cached once as part of cAdvisor object creation.
// But `RootFsInfo` and `ImagesFsInfo` are not available at this moment, so they will be called later during manager start
machineInfo, err := cadvisorInterface.MachineInfo()
if err != nil {
return nil, err
}
capacity := cadvisor.CapacityFromMachineInfo(machineInfo)
cm := &containerManagerImpl{
capacity: capacity,
nodeConfig: nodeConfig,
cadvisorInterface: cadvisorInterface,
}
cm.topologyManager = topologymanager.NewFakeManager()
cm.cpuManager = cpumanager.NewFakeManager()
cm.memoryManager = memorymanager.NewFakeManager()
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
klog.InfoS("Creating topology manager")
cm.topologyManager, err = topologymanager.NewManager(machineInfo.Topology,
nodeConfig.TopologyManagerPolicy,
nodeConfig.TopologyManagerScope,
nodeConfig.TopologyManagerPolicyOptions)
if err != nil {
klog.ErrorS(err, "Failed to initialize topology manager")
return nil, err
}
klog.InfoS("Creating cpu manager")
cm.cpuManager, err = cpumanager.NewManager(
nodeConfig.CPUManagerPolicy,
nodeConfig.CPUManagerPolicyOptions,
nodeConfig.CPUManagerReconcilePeriod,
machineInfo,
nodeConfig.NodeAllocatableConfig.ReservedSystemCPUs,
cm.GetNodeAllocatableReservation(),
nodeConfig.KubeletRootDir,
cm.topologyManager,
)
if err != nil {
klog.ErrorS(err, "Failed to initialize cpu manager")
return nil, err
}
cm.topologyManager.AddHintProvider(cm.cpuManager)
klog.InfoS("Creating memory manager")
cm.memoryManager, err = memorymanager.NewManager(
nodeConfig.ExperimentalMemoryManagerPolicy,
machineInfo,
cm.GetNodeAllocatableReservation(),
nodeConfig.ExperimentalMemoryManagerReservedMemory,
nodeConfig.KubeletRootDir,
cm.topologyManager,
)
if err != nil {
klog.ErrorS(err, "Failed to initialize memory manager")
return nil, err
}
cm.topologyManager.AddHintProvider(cm.memoryManager)
}
klog.InfoS("Creating device plugin manager")
cm.deviceManager, err = devicemanager.NewManagerImpl(nil, cm.topologyManager)
if err != nil {
return nil, err
}
cm.topologyManager.AddHintProvider(cm.deviceManager)
return cm, nil
}
func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {
return v1.ResourceList{}
}
func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
cm.RLock()
defer cm.RUnlock()
return cm.nodeConfig
}
func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
return &CgroupSubsystems{}
}
func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (cm *containerManagerImpl) UpdateQOSCgroups() error {
return nil
}
func (cm *containerManagerImpl) Status() Status {
return Status{}
}
func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
evictionReservation := hardEvictionReservation(cm.nodeConfig.HardEvictionThresholds, cm.capacity)
result := make(v1.ResourceList)
for k := range cm.capacity {
value := resource.NewQuantity(0, resource.DecimalSI)
if cm.nodeConfig.SystemReserved != nil {
value.Add(cm.nodeConfig.SystemReserved[k])
}
if cm.nodeConfig.KubeReserved != nil {
value.Add(cm.nodeConfig.KubeReserved[k])
}
if evictionReservation != nil {
value.Add(evictionReservation[k])
}
if !value.IsZero() {
result[k] = *value
}
}
return result
}
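// The helper below is a minimal illustrative sketch, not part of upstream;
// its name and the 500m values are hypothetical. It mirrors the arithmetic
// above: with 500m system-reserved and 500m kube-reserved CPU and no hard
// eviction threshold, the reported reservation adds up to one full CPU.
func exampleReservationMath() v1.ResourceList {
value := resource.NewQuantity(0, resource.DecimalSI)
value.Add(resource.MustParse("500m")) // hypothetical system-reserved CPU
value.Add(resource.MustParse("500m")) // hypothetical kube-reserved CPU
return v1.ResourceList{v1.ResourceCPU: *value}
}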
func (cm *containerManagerImpl) GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList {
return cm.capacity
}
func (cm *containerManagerImpl) GetPluginRegistrationHandlers() map[string]cache.PluginHandler {
// DRA is not supported on Windows, only device plugin is supported
return map[string]cache.PluginHandler{pluginwatcherapi.DevicePlugin: cm.deviceManager.GetWatcherHandler()}
}
func (cm *containerManagerImpl) GetHealthCheckers() []healthz.HealthChecker {
return []healthz.HealthChecker{cm.deviceManager.GetHealthChecker()}
}
func (cm *containerManagerImpl) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
return cm.deviceManager.GetCapacity()
}
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
return &podContainerManagerStub{}
}
func (cm *containerManagerImpl) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
opts := &kubecontainer.RunContainerOptions{}
// Allocate should already have been called during predicateAdmitHandler.Admit();
// just try to fetch device runtime information from the cached state here.
devOpts, err := cm.deviceManager.GetDeviceRunContainerOptions(pod, container)
if err != nil {
return nil, err
} else if devOpts == nil {
return opts, nil
}
opts.Devices = append(opts.Devices, devOpts.Devices...)
opts.Mounts = append(opts.Mounts, devOpts.Mounts...)
opts.Envs = append(opts.Envs, devOpts.Envs...)
opts.Annotations = append(opts.Annotations, devOpts.Annotations...)
return opts, nil
}
func (cm *containerManagerImpl) UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus) {
// For now we only support Device Plugin
cm.deviceManager.UpdateAllocatedResourcesStatus(pod, status)
// TODO(SergeyKanzhelev, https://kep.k8s.io/4680): add support for DRA resources when DRA supports Windows
}
func (cm *containerManagerImpl) Updates() <-chan resourceupdates.Update {
// TODO(SergeyKanzhelev, https://kep.k8s.io/4680): add support for DRA resources, for now only use device plugin updates
return cm.deviceManager.Updates()
}
func (cm *containerManagerImpl) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return cm.deviceManager.UpdatePluginResources(node, attrs)
}
func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cm.cpuManager, cm.memoryManager, cm.topologyManager}
}
func (cm *containerManagerImpl) GetPodCgroupRoot() string {
return ""
}
func (cm *containerManagerImpl) GetDevices(podUID, containerName string) []*podresourcesapi.ContainerDevices {
return containerDevicesFromResourceDeviceInstances(cm.deviceManager.GetDevices(podUID, containerName))
}
func (cm *containerManagerImpl) GetAllocatableDevices() []*podresourcesapi.ContainerDevices {
return nil
}
func (cm *containerManagerImpl) ShouldResetExtendedResourceCapacity() bool {
return cm.deviceManager.ShouldResetExtendedResourceCapacity()
}
func (cm *containerManagerImpl) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {
return cm.topologyManager
}
func (cm *containerManagerImpl) UpdateAllocatedDevices() {
}
func (cm *containerManagerImpl) GetCPUs(podUID, containerName string) []int64 {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
if cm.cpuManager != nil {
return int64Slice(cm.cpuManager.GetExclusiveCPUs(podUID, containerName).UnsortedList())
}
return []int64{}
}
return nil
}
func (cm *containerManagerImpl) GetAllocatableCPUs() []int64 {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
if cm.cpuManager != nil {
return int64Slice(cm.cpuManager.GetAllocatableCPUs().UnsortedList())
}
return []int64{}
}
return nil
}
func (cm *containerManagerImpl) GetMemory(_, _ string) []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerImpl) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList {
return nil
}
func (cm *containerManagerImpl) GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource {
return nil
}
func (cm *containerManagerImpl) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *containerManagerImpl) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *containerManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
return false
}

View File

@ -0,0 +1,90 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package containermap
import (
"fmt"
)
// cmItem (ContainerMap ITEM) is a (podUID, containerName) pair
type cmItem struct {
podUID string
containerName string
}
// ContainerMap maps (containerID)->(podUID, containerName)
type ContainerMap map[string]cmItem
// NewContainerMap creates a new, empty ContainerMap
func NewContainerMap() ContainerMap {
return make(ContainerMap)
}
// Clone creates a deep copy of the ContainerMap
func (cm ContainerMap) Clone() ContainerMap {
ret := make(ContainerMap, len(cm))
for key, val := range cm {
ret[key] = val
}
return ret
}
// Add adds a mapping of (containerID)->(podUID, containerName) to the ContainerMap
func (cm ContainerMap) Add(podUID, containerName, containerID string) {
cm[containerID] = cmItem{
podUID: podUID,
containerName: containerName,
}
}
// RemoveByContainerID removes a mapping of (containerID)->(podUID, containerName) from the ContainerMap
func (cm ContainerMap) RemoveByContainerID(containerID string) {
delete(cm, containerID)
}
// RemoveByContainerRef removes a mapping of (containerID)->(podUID, containerName) from the ContainerMap
func (cm ContainerMap) RemoveByContainerRef(podUID, containerName string) {
containerID, err := cm.GetContainerID(podUID, containerName)
if err == nil {
cm.RemoveByContainerID(containerID)
}
}
// GetContainerID retrieves a ContainerID from the ContainerMap
func (cm ContainerMap) GetContainerID(podUID, containerName string) (string, error) {
for key, val := range cm {
if val.podUID == podUID && val.containerName == containerName {
return key, nil
}
}
return "", fmt.Errorf("container %s not in ContainerMap for pod %s", containerName, podUID)
}
// GetContainerRef retrieves a (podUID, containerName) pair from the ContainerMap
func (cm ContainerMap) GetContainerRef(containerID string) (string, string, error) {
if _, exists := cm[containerID]; !exists {
return "", "", fmt.Errorf("containerID %s not in ContainerMap", containerID)
}
return cm[containerID].podUID, cm[containerID].containerName, nil
}
// Visit invokes the visitor function to walk all of the entries in the container map
func (cm ContainerMap) Visit(visitor func(podUID, containerName, containerID string)) {
for k, v := range cm {
visitor(v.podUID, v.containerName, k)
}
}
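// The function below is a minimal usage sketch, not part of upstream; the
// function name and the IDs are hypothetical. It shows how a ContainerMap
// is typically populated, queried, and walked.
func exampleContainerMapUsage() {
cm := NewContainerMap()
cm.Add("pod-uid-1", "app", "containerd://abc123")
if containerID, err := cm.GetContainerID("pod-uid-1", "app"); err == nil {
fmt.Printf("resolved container ID: %s\n", containerID)
}
cm.Visit(func(podUID, containerName, containerID string) {
fmt.Printf("%s/%s -> %s\n", podUID, containerName, containerID)
})
}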

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- derekwaynecarr
reviewers:
- klueska
emeritus_approvers:
- balajismaniam
- ConnorDoyle
- vishh

File diff suppressed because it is too large

View File

@ -0,0 +1,525 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"context"
"fmt"
"math"
"sync"
"time"
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/utils/cpuset"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
type runtimeService interface {
UpdateContainerResources(ctx context.Context, id string, resources *runtimeapi.ContainerResources) error
}
type policyName string
// cpuManagerStateFileName is the file name where cpu manager stores its state
const cpuManagerStateFileName = "cpu_manager_state"
// Manager interface provides methods for Kubelet to manage pod cpus.
type Manager interface {
// Start is called during Kubelet initialization.
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error
// Called to trigger the allocation of CPUs to a container. This must be
// called at some point prior to the AddContainer() call for a container,
// e.g. at pod admission time.
Allocate(pod *v1.Pod, container *v1.Container) error
// AddContainer adds the mapping between container ID to pod UID and the container name
// The mapping is used to remove the CPU allocation during container removal
AddContainer(p *v1.Pod, c *v1.Container, containerID string)
// RemoveContainer is called after Kubelet decides to kill or delete a
// container. After this call, the CPU manager stops trying to reconcile
// that container and any CPUs dedicated to the container are freed.
RemoveContainer(containerID string) error
// State returns a read-only interface to the internal CPU manager state.
State() state.Reader
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetTopologyHints(*v1.Pod, *v1.Container) map[string][]topologymanager.TopologyHint
// GetExclusiveCPUs implements the podresources.CPUsProvider interface to provide
// exclusively allocated cpus for the container
GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment per Pod
// among this and other resource controllers.
GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint
// GetAllocatableCPUs returns the total set of CPUs available for allocation.
GetAllocatableCPUs() cpuset.CPUSet
// GetCPUAffinity returns cpuset which includes cpus from shared pools
// as well as exclusively allocated cpus
GetCPUAffinity(podUID, containerName string) cpuset.CPUSet
// GetAllCPUs returns all the CPUs known by cpumanager, as reported by the
// hardware discovery. Maps to the CPU capacity.
GetAllCPUs() cpuset.CPUSet
}
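// The function below is a minimal lifecycle sketch, not part of upstream;
// its name is hypothetical. It spells out the ordering the interface above
// requires: Allocate must run before AddContainer (e.g. at pod admission),
// and RemoveContainer frees the CPUs once the container is gone.
func exampleManagerLifecycle(mgr Manager, pod *v1.Pod, containerID string) error {
c := &pod.Spec.Containers[0]
if err := mgr.Allocate(pod, c); err != nil { // at pod admission time
return err
}
mgr.AddContainer(pod, c, containerID) // once the container ID is known
return mgr.RemoveContainer(containerID) // after the container is killed or deleted
}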
type manager struct {
sync.Mutex
policy Policy
// reconcilePeriod is the duration between calls to reconcileState.
reconcilePeriod time.Duration
// state allows pluggable CPU assignment policies while sharing a common
// representation of state for the system to inspect and reconcile.
state state.State
// lastUpdateState holds state for each container from the last time it was updated.
lastUpdateState state.State
// containerRuntime is the container runtime service interface needed
// to make UpdateContainerResources() calls against the containers.
containerRuntime runtimeService
// activePods is a method for listing active pods on the node
// so all the containers can be updated in the reconciliation loop.
activePods ActivePodsFunc
// podStatusProvider provides a method for obtaining pod statuses
// and the containerID of their containers
podStatusProvider status.PodStatusProvider
// containerMap provides a mapping from (pod, container) -> containerID
// for all containers in a pod
containerMap containermap.ContainerMap
topology *topology.CPUTopology
nodeAllocatableReservation v1.ResourceList
// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
// We use it to determine when we can purge inactive pods from checkpointed state.
sourcesReady config.SourcesReady
// stateFileDirectory holds the directory where the state file for checkpoints is held.
stateFileDirectory string
// allCPUs is the set of online CPUs as reported by the system
allCPUs cpuset.CPUSet
// allocatableCPUs is the set of online CPUs as reported by the system,
// and available for allocation, minus the reserved set
allocatableCPUs cpuset.CPUSet
}
var _ Manager = &manager{}
type sourcesReadyStub struct{}
func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// NewManager creates a new cpu manager based on the provided policy
func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) {
var topo *topology.CPUTopology
var policy Policy
var err error
topo, err = topology.Discover(machineInfo)
if err != nil {
return nil, err
}
switch policyName(cpuPolicyName) {
case PolicyNone:
policy, err = NewNonePolicy(cpuPolicyOptions)
if err != nil {
return nil, fmt.Errorf("new none policy error: %w", err)
}
case PolicyStatic:
klog.InfoS("Detected CPU topology", "topology", topo)
reservedCPUs, ok := nodeAllocatableReservation[v1.ResourceCPU]
if !ok {
// The static policy cannot initialize without this information.
return nil, fmt.Errorf("[cpumanager] unable to determine reserved CPU resources for static policy")
}
if reservedCPUs.IsZero() {
// The static policy requires this to be nonzero. Zero CPU reservation
// would allow the shared pool to be completely exhausted. At that point
// either we would violate our guarantee of exclusivity or need to evict
// any pod that has at least one container that requires zero CPUs.
// See the comments in policy_static.go for more details.
return nil, fmt.Errorf("[cpumanager] the static policy requires systemreserved.cpu + kubereserved.cpu to be greater than zero")
}
// Take the ceiling of the reservation, since fractional CPUs cannot be
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions)
if err != nil {
return nil, fmt.Errorf("new static policy error: %w", err)
}
default:
return nil, fmt.Errorf("unknown policy: \"%s\"", cpuPolicyName)
}
manager := &manager{
policy: policy,
reconcilePeriod: reconcilePeriod,
lastUpdateState: state.NewMemoryState(),
topology: topo,
nodeAllocatableReservation: nodeAllocatableReservation,
stateFileDirectory: stateFileDirectory,
allCPUs: topo.CPUDetails.CPUs(),
}
manager.sourcesReady = &sourcesReadyStub{}
return manager, nil
}
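// The helper below is a minimal sketch, not part of upstream; its name is
// hypothetical. It isolates the rounding performed above: fractional CPU
// reservations are rounded up, since fractional CPUs cannot be exclusively
// allocated, so a 1500m reservation reserves 2 whole CPUs.
func exampleReservedCPUsCeiling(reservedMilliCPU int64) int {
reservedCPUsFloat := float64(reservedMilliCPU) / 1000
return int(math.Ceil(reservedCPUsFloat))
}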
func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
klog.InfoS("Starting CPU manager", "policy", m.policy.Name())
klog.InfoS("Reconciling", "reconcilePeriod", m.reconcilePeriod)
m.sourcesReady = sourcesReady
m.activePods = activePods
m.podStatusProvider = podStatusProvider
m.containerRuntime = containerRuntime
m.containerMap = initialContainers
stateImpl, err := state.NewCheckpointState(m.stateFileDirectory, cpuManagerStateFileName, m.policy.Name(), m.containerMap)
if err != nil {
klog.ErrorS(err, "Could not initialize checkpoint manager, please drain node and remove policy state file")
return err
}
m.state = stateImpl
err = m.policy.Start(m.state)
if err != nil {
klog.ErrorS(err, "Policy start error")
return err
}
m.allocatableCPUs = m.policy.GetAllocatableCPUs(m.state)
if m.policy.Name() == string(PolicyNone) {
return nil
}
// Periodically call m.reconcileState() to keep the CPU sets of
// all pods in sync with the guaranteed CPUs handed out among them.
go wait.Until(func() { m.reconcileState() }, m.reconcilePeriod, wait.NeverStop)
return nil
}
func (m *manager) Allocate(p *v1.Pod, c *v1.Container) error {
// Garbage collect any stranded resources before allocating CPUs.
m.removeStaleState()
m.Lock()
defer m.Unlock()
// Call down into the policy to assign this container CPUs if required.
err := m.policy.Allocate(m.state, p, c)
if err != nil {
klog.ErrorS(err, "Allocate error")
return err
}
return nil
}
func (m *manager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
m.Lock()
defer m.Unlock()
if cset, exists := m.state.GetCPUSet(string(pod.UID), container.Name); exists {
m.lastUpdateState.SetCPUSet(string(pod.UID), container.Name, cset)
}
m.containerMap.Add(string(pod.UID), container.Name, containerID)
}
func (m *manager) RemoveContainer(containerID string) error {
m.Lock()
defer m.Unlock()
err := m.policyRemoveContainerByID(containerID)
if err != nil {
klog.ErrorS(err, "RemoveContainer error")
return err
}
return nil
}
func (m *manager) policyRemoveContainerByID(containerID string) error {
podUID, containerName, err := m.containerMap.GetContainerRef(containerID)
if err != nil {
return nil
}
err = m.policy.RemoveContainer(m.state, podUID, containerName)
if err == nil {
m.lastUpdateState.Delete(podUID, containerName)
m.containerMap.RemoveByContainerID(containerID)
}
return err
}
func (m *manager) policyRemoveContainerByRef(podUID string, containerName string) error {
err := m.policy.RemoveContainer(m.state, podUID, containerName)
if err == nil {
m.lastUpdateState.Delete(podUID, containerName)
m.containerMap.RemoveByContainerRef(podUID, containerName)
}
return err
}
func (m *manager) State() state.Reader {
return m.state
}
func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
// Delegate to active policy
return m.policy.GetTopologyHints(m.state, pod, container)
}
func (m *manager) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
// Delegate to active policy
return m.policy.GetPodTopologyHints(m.state, pod)
}
func (m *manager) GetAllocatableCPUs() cpuset.CPUSet {
return m.allocatableCPUs.Clone()
}
func (m *manager) GetAllCPUs() cpuset.CPUSet {
return m.allCPUs.Clone()
}
type reconciledContainer struct {
podName string
containerName string
containerID string
}
func (m *manager) removeStaleState() {
// Only once all sources are ready do we attempt to remove any stale state.
// This ensures that the call to `m.activePods()` below will succeed with
// the actual active pods list.
if !m.sourcesReady.AllReady() {
return
}
// We grab the lock to ensure that no new containers will grab CPUs while
// executing the code below. Without this lock, it's possible that we end up
// removing state that is newly added by an asynchronous call to
// AddContainer() during the execution of this code.
m.Lock()
defer m.Unlock()
// Get the list of active pods.
activePods := m.activePods()
// Build a list of (podUID, containerName) pairs for all containers in all active Pods.
activeContainers := make(map[string]map[string]struct{})
for _, pod := range activePods {
activeContainers[string(pod.UID)] = make(map[string]struct{})
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
activeContainers[string(pod.UID)][container.Name] = struct{}{}
}
}
// Loop through the CPUManager state. Remove any state for containers not
// in the `activeContainers` list built above.
assignments := m.state.GetCPUAssignments()
for podUID := range assignments {
for containerName := range assignments[podUID] {
if _, ok := activeContainers[podUID][containerName]; ok {
klog.V(5).InfoS("RemoveStaleState: container still active", "podUID", podUID, "containerName", containerName)
continue
}
klog.V(2).InfoS("RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName)
err := m.policyRemoveContainerByRef(podUID, containerName)
if err != nil {
klog.ErrorS(err, "RemoveStaleState: failed to remove container", "podUID", podUID, "containerName", containerName)
}
}
}
m.containerMap.Visit(func(podUID, containerName, containerID string) {
if _, ok := activeContainers[podUID][containerName]; ok {
klog.V(5).InfoS("RemoveStaleState: containerMap: container still active", "podUID", podUID, "containerName", containerName)
return
}
klog.V(2).InfoS("RemoveStaleState: containerMap: removing container", "podUID", podUID, "containerName", containerName)
err := m.policyRemoveContainerByRef(podUID, containerName)
if err != nil {
klog.ErrorS(err, "RemoveStaleState: containerMap: failed to remove container", "podUID", podUID, "containerName", containerName)
}
})
}
func (m *manager) reconcileState() (success []reconciledContainer, failure []reconciledContainer) {
ctx := context.Background()
success = []reconciledContainer{}
failure = []reconciledContainer{}
m.removeStaleState()
for _, pod := range m.activePods() {
pstatus, ok := m.podStatusProvider.GetPodStatus(pod.UID)
if !ok {
klog.V(5).InfoS("ReconcileState: skipping pod; status not found", "pod", klog.KObj(pod))
failure = append(failure, reconciledContainer{pod.Name, "", ""})
continue
}
allContainers := pod.Spec.InitContainers
allContainers = append(allContainers, pod.Spec.Containers...)
for _, container := range allContainers {
containerID, err := findContainerIDByName(&pstatus, container.Name)
if err != nil {
klog.V(5).InfoS("ReconcileState: skipping container; ID not found in pod status", "pod", klog.KObj(pod), "containerName", container.Name, "err", err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
continue
}
cstatus, err := findContainerStatusByName(&pstatus, container.Name)
if err != nil {
klog.V(5).InfoS("ReconcileState: skipping container; container status not found in pod status", "pod", klog.KObj(pod), "containerName", container.Name, "err", err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
continue
}
if cstatus.State.Waiting != nil ||
(cstatus.State.Waiting == nil && cstatus.State.Running == nil && cstatus.State.Terminated == nil) {
klog.V(4).InfoS("ReconcileState: skipping container; container still in the waiting state", "pod", klog.KObj(pod), "containerName", container.Name, "err", err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
continue
}
m.Lock()
if cstatus.State.Terminated != nil {
// The container is terminated but we can't call m.RemoveContainer()
// here because it could remove the allocated cpuset for the container
// which may be in the process of being restarted. That would result
// in the container losing any exclusively-allocated CPUs that it
// was allocated.
_, _, err := m.containerMap.GetContainerRef(containerID)
if err == nil {
klog.V(4).InfoS("ReconcileState: ignoring terminated container", "pod", klog.KObj(pod), "containerID", containerID)
}
m.Unlock()
continue
}
// Once we make it here we know we have a running container.
// Idempotently add it to the containerMap in case it is missing.
// This can happen after a kubelet restart, for example.
m.containerMap.Add(string(pod.UID), container.Name, containerID)
m.Unlock()
cset := m.state.GetCPUSetOrDefault(string(pod.UID), container.Name)
if cset.IsEmpty() {
// NOTE: This should not happen outside of tests.
klog.V(2).InfoS("ReconcileState: skipping container; assigned cpuset is empty", "pod", klog.KObj(pod), "containerName", container.Name)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
continue
}
lcset := m.lastUpdateState.GetCPUSetOrDefault(string(pod.UID), container.Name)
if !cset.Equals(lcset) {
klog.V(5).InfoS("ReconcileState: updating container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID, "cpuSet", cset)
err = m.updateContainerCPUSet(ctx, containerID, cset)
if err != nil {
klog.ErrorS(err, "ReconcileState: failed to update container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID, "cpuSet", cset)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
continue
}
m.lastUpdateState.SetCPUSet(string(pod.UID), container.Name, cset)
}
success = append(success, reconciledContainer{pod.Name, container.Name, containerID})
}
}
return success, failure
}
func findContainerIDByName(status *v1.PodStatus, name string) (string, error) {
allStatuses := status.InitContainerStatuses
allStatuses = append(allStatuses, status.ContainerStatuses...)
for _, container := range allStatuses {
if container.Name == name && container.ContainerID != "" {
cid := &kubecontainer.ContainerID{}
err := cid.ParseString(container.ContainerID)
if err != nil {
return "", err
}
return cid.ID, nil
}
}
return "", fmt.Errorf("unable to find ID for container with name %v in pod status (it may not be running)", name)
}
func findContainerStatusByName(status *v1.PodStatus, name string) (*v1.ContainerStatus, error) {
for _, containerStatus := range append(status.InitContainerStatuses, status.ContainerStatuses...) {
if containerStatus.Name == name {
return &containerStatus, nil
}
}
return nil, fmt.Errorf("unable to find status for container with name %v in pod status (it may not be running)", name)
}
func (m *manager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet {
if result, ok := m.state.GetCPUSet(podUID, containerName); ok {
return result
}
return cpuset.CPUSet{}
}
func (m *manager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet {
return m.state.GetCPUSetOrDefault(podUID, containerName)
}

View File

@ -0,0 +1,43 @@
//go:build !windows
// +build !windows
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"context"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/utils/cpuset"
)
func (m *manager) updateContainerCPUSet(ctx context.Context, containerID string, cpus cpuset.CPUSet) error {
// TODO: Consider adding a `ResourceConfigForContainer` helper in
// helpers_linux.go similar to what exists for pods.
// It would be better to pass the full container resources here instead of
// this patch-like partial resources.
return m.containerRuntime.UpdateContainerResources(
ctx,
containerID,
&runtimeapi.ContainerResources{
Linux: &runtimeapi.LinuxContainerResources{
CpusetCpus: cpus.String(),
},
})
}

View File

@ -0,0 +1,49 @@
//go:build windows
// +build windows
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"context"
utilfeature "k8s.io/apiserver/pkg/util/feature"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/winstats"
"k8s.io/utils/cpuset"
)
func (m *manager) updateContainerCPUSet(ctx context.Context, containerID string, cpus cpuset.CPUSet) error {
if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
return nil
}
affinities := winstats.CpusToGroupAffinity(cpus.List())
var cpuGroupAffinities []*runtimeapi.WindowsCpuGroupAffinity
for _, affinity := range affinities {
cpuGroupAffinities = append(cpuGroupAffinities, &runtimeapi.WindowsCpuGroupAffinity{
CpuGroup: uint32(affinity.Group),
CpuMask: uint64(affinity.Mask),
})
}
return m.containerRuntime.UpdateContainerResources(ctx, containerID, &runtimeapi.ContainerResources{
Windows: &runtimeapi.WindowsContainerResources{
AffinityCpus: cpuGroupAffinities,
},
})
}

View File

@ -0,0 +1,98 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/utils/cpuset"
)
type fakeManager struct {
state state.State
}
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
klog.InfoS("Start()")
return nil
}
func (m *fakeManager) Policy() Policy {
klog.InfoS("Policy()")
pol, _ := NewNonePolicy(nil)
return pol
}
func (m *fakeManager) Allocate(pod *v1.Pod, container *v1.Container) error {
klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
klog.InfoS("AddContainer", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID)
}
func (m *fakeManager) RemoveContainer(containerID string) error {
klog.InfoS("RemoveContainer", "containerID", containerID)
return nil
}
func (m *fakeManager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
klog.InfoS("Get container topology hints")
return map[string][]topologymanager.TopologyHint{}
}
func (m *fakeManager) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
klog.InfoS("Get pod topology hints")
return map[string][]topologymanager.TopologyHint{}
}
func (m *fakeManager) State() state.Reader {
return m.state
}
func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet {
klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName)
return cpuset.CPUSet{}
}
func (m *fakeManager) GetAllocatableCPUs() cpuset.CPUSet {
klog.InfoS("Get Allocatable CPUs")
return cpuset.CPUSet{}
}
func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet {
klog.InfoS("GetCPUAffinity", "podUID", podUID, "containerName", containerName)
return cpuset.CPUSet{}
}
func (m *fakeManager) GetAllCPUs() cpuset.CPUSet {
klog.InfoS("GetAllCPUs")
return cpuset.CPUSet{}
}
// NewFakeManager creates an empty/fake cpu manager
func NewFakeManager() Manager {
return &fakeManager{
state: state.NewMemoryState(),
}
}

View File

@ -0,0 +1,45 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/utils/cpuset"
)
// Policy implements logic for pod container to CPU assignment.
type Policy interface {
Name() string
Start(s state.State) error
// Allocate call is idempotent
Allocate(s state.State, pod *v1.Pod, container *v1.Container) error
// RemoveContainer call is idempotent
RemoveContainer(s state.State, podUID string, containerName string) error
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment per Pod
// among this and other resource controllers.
GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint
// GetAllocatableCPUs returns the total set of CPUs available for allocation.
GetAllocatableCPUs(m state.State) cpuset.CPUSet
}

View File

@ -0,0 +1,76 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/utils/cpuset"
)
type nonePolicy struct{}
var _ Policy = &nonePolicy{}
// PolicyNone is the name of the none policy
const PolicyNone policyName = "none"
// NewNonePolicy returns a cpuset manager policy that does nothing
func NewNonePolicy(cpuPolicyOptions map[string]string) (Policy, error) {
if len(cpuPolicyOptions) > 0 {
return nil, fmt.Errorf("None policy: received unsupported options=%v", cpuPolicyOptions)
}
return &nonePolicy{}, nil
}
func (p *nonePolicy) Name() string {
return string(PolicyNone)
}
func (p *nonePolicy) Start(s state.State) error {
klog.InfoS("None policy: Start")
return nil
}
func (p *nonePolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
return nil
}
func (p *nonePolicy) RemoveContainer(s state.State, podUID string, containerName string) error {
return nil
}
func (p *nonePolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
return nil
}
func (p *nonePolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
return nil
}
// Assignable CPUs are the ones that can be exclusively allocated to pods that meet the exclusivity requirement
// (i.e. guaranteed QoS class and integral CPU request).
// Assignability of CPUs as a concept only applies to the static policy, i.e. scenarios where workloads
// CAN get exclusive access to core(s).
// Hence, we return the empty set here: no CPUs are assignable according to the above definition with this policy.
func (p *nonePolicy) GetAllocatableCPUs(m state.State) cpuset.CPUSet {
return cpuset.New()
}
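// The function below is a minimal sketch, not part of upstream; its name is
// hypothetical. It demonstrates the contract documented above: under the
// none policy the allocatable set is always empty, regardless of state.
func exampleNonePolicyAllocatable() cpuset.CPUSet {
p, _ := NewNonePolicy(nil)
return p.GetAllocatableCPUs(state.NewMemoryState()) // always the empty set
}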

View File

@ -0,0 +1,185 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"strconv"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
// Names of the options, as part of the user interface.
const (
FullPCPUsOnlyOption string = "full-pcpus-only"
DistributeCPUsAcrossNUMAOption string = "distribute-cpus-across-numa"
AlignBySocketOption string = "align-by-socket"
DistributeCPUsAcrossCoresOption string = "distribute-cpus-across-cores"
StrictCPUReservationOption string = "strict-cpu-reservation"
PreferAlignByUnCoreCacheOption string = "prefer-align-cpus-by-uncorecache"
)
var (
alphaOptions = sets.New[string](
DistributeCPUsAcrossNUMAOption,
AlignBySocketOption,
DistributeCPUsAcrossCoresOption,
StrictCPUReservationOption,
PreferAlignByUnCoreCacheOption,
)
betaOptions = sets.New[string](
FullPCPUsOnlyOption,
)
stableOptions = sets.New[string]()
)
// CheckPolicyOptionAvailable verifies if the given option can be used depending on the Feature Gate Settings.
// It returns nil on success, or an error describing the failure.
func CheckPolicyOptionAvailable(option string) error {
if !alphaOptions.Has(option) && !betaOptions.Has(option) && !stableOptions.Has(option) {
return fmt.Errorf("unknown CPU Manager Policy option: %q", option)
}
if alphaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManagerPolicyAlphaOptions) {
return fmt.Errorf("CPU Manager Policy Alpha-level Options not enabled, but option %q provided", option)
}
if betaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManagerPolicyBetaOptions) {
return fmt.Errorf("CPU Manager Policy Beta-level Options not enabled, but option %q provided", option)
}
return nil
}
// StaticPolicyOptions holds the parsed value of the policy options, ready to be consumed internally.
type StaticPolicyOptions struct {
// flag to enable extra allocation restrictions to avoid
// different containers possibly ending up on the same core.
// we consider "core" and "physical CPU" synonyms here, leaning
// towards the terminology of the k8s hints. We acknowledge this is confusing.
//
// looking at https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/,
// any possible naming scheme will lead to ambiguity to some extent.
// We picked "pcpu" because the established docs already hint at vCPU.
FullPhysicalCPUsOnly bool
// Flag to evenly distribute CPUs across NUMA nodes in cases where more
// than one NUMA node is required to satisfy the allocation.
DistributeCPUsAcrossNUMA bool
// Flag to ensure CPUs are considered aligned at socket boundary rather than
// NUMA boundary
AlignBySocket bool
// flag to enable extra allocation restrictions to spread
// cpus (HT) across different physical cores.
// This is a preference rather than a hard requirement, so no error is raised if they have to be packed onto one physical core.
DistributeCPUsAcrossCores bool
// Flag to remove reserved cores from the list of available cores
StrictCPUReservation bool
// Flag that makes a best-effort attempt to align CPUs to an uncore cache boundary.
// As long as there are CPUs available, pods will be admitted even if the condition is not met.
PreferAlignByUncoreCacheOption bool
}
// NewStaticPolicyOptions creates a StaticPolicyOptions struct from the user configuration.
func NewStaticPolicyOptions(policyOptions map[string]string) (StaticPolicyOptions, error) {
opts := StaticPolicyOptions{}
for name, value := range policyOptions {
if err := CheckPolicyOptionAvailable(name); err != nil {
return opts, err
}
switch name {
case FullPCPUsOnlyOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.FullPhysicalCPUsOnly = optValue
case DistributeCPUsAcrossNUMAOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.DistributeCPUsAcrossNUMA = optValue
case AlignBySocketOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.AlignBySocket = optValue
case DistributeCPUsAcrossCoresOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.DistributeCPUsAcrossCores = optValue
case StrictCPUReservationOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.StrictCPUReservation = optValue
case PreferAlignByUnCoreCacheOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.PreferAlignByUncoreCacheOption = optValue
default:
// this should never be reached, we already detect unknown options,
// but we keep it as further safety.
return opts, fmt.Errorf("unsupported cpumanager option: %q (%s)", name, value)
}
}
if opts.FullPhysicalCPUsOnly && opts.DistributeCPUsAcrossCores {
return opts, fmt.Errorf("static policy options %s and %s can not be used at the same time", FullPCPUsOnlyOption, DistributeCPUsAcrossCoresOption)
}
// TODO(@Jeffwan): Remove this check after more compatibility tests are done.
if opts.DistributeCPUsAcrossNUMA && opts.DistributeCPUsAcrossCores {
return opts, fmt.Errorf("static policy options %s and %s can not be used at the same time", DistributeCPUsAcrossNUMAOption, DistributeCPUsAcrossCoresOption)
}
if opts.PreferAlignByUncoreCacheOption && opts.DistributeCPUsAcrossCores {
return opts, fmt.Errorf("static policy options %s and %s can not be used at the same time", PreferAlignByUnCoreCacheOption, DistributeCPUsAcrossCoresOption)
}
if opts.PreferAlignByUncoreCacheOption && opts.DistributeCPUsAcrossNUMA {
return opts, fmt.Errorf("static policy options %s and %s can not be used at the same time", PreferAlignByUnCoreCacheOption, DistributeCPUsAcrossNUMAOption)
}
return opts, nil
}
// ValidateStaticPolicyOptions ensures that the requested policy options are compatible with the machine on which the CPUManager is running.
func ValidateStaticPolicyOptions(opts StaticPolicyOptions, topology *topology.CPUTopology, topologyManager topologymanager.Store) error {
if opts.AlignBySocket {
// Not compatible with topology manager single-numa-node policy option.
if topologyManager.GetPolicy().Name() == topologymanager.PolicySingleNumaNode {
return fmt.Errorf("Topolgy manager %s policy is incompatible with CPUManager %s policy option", topologymanager.PolicySingleNumaNode, AlignBySocketOption)
}
// Not compatible with topology when the number of sockets is greater than the number of NUMA nodes.
if topology.NumSockets > topology.NumNUMANodes {
return fmt.Errorf("align by socket is not compatible with hardware where the number of sockets is greater than the number of NUMA nodes")
}
}
return nil
}
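// The function below is a minimal sketch, not part of upstream; its name is
// hypothetical. It shows how a kubelet option map is turned into a validated
// StaticPolicyOptions value. Since full-pcpus-only is a beta-level option,
// this returns an error unless the CPUManagerPolicyBetaOptions feature gate
// is enabled.
func exampleParsePolicyOptions() (StaticPolicyOptions, error) {
return NewStaticPolicyOptions(map[string]string{
FullPCPUsOnlyOption: "true",
})
}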

View File

@ -0,0 +1,766 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/utils/cpuset"
)
const (
// PolicyStatic is the name of the static policy.
// If options are given, they will be ignored and backward-compatible
// behaviour (up to and including 1.21) will be enforced
PolicyStatic policyName = "static"
// ErrorSMTAlignment represents the type of an SMTAlignmentError
ErrorSMTAlignment = "SMTAlignmentError"
)
// SMTAlignmentError represents an error due to SMT alignment
type SMTAlignmentError struct {
RequestedCPUs int
CpusPerCore int
AvailablePhysicalCPUs int
CausedByPhysicalCPUs bool
}
func (e SMTAlignmentError) Error() string {
if e.CausedByPhysicalCPUs {
return fmt.Sprintf("SMT Alignment Error: not enough free physical CPUs: available physical CPUs = %d, requested CPUs = %d, CPUs per core = %d", e.AvailablePhysicalCPUs, e.RequestedCPUs, e.CpusPerCore)
}
return fmt.Sprintf("SMT Alignment Error: requested %d cpus not multiple cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
}
// Type returns human-readable type of this error. Used in the admission control to populate Admission Failure reason.
func (e SMTAlignmentError) Type() string {
return ErrorSMTAlignment
}
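// The helper below is a minimal sketch, not part of upstream; its name is
// hypothetical. It captures the SMT alignment rule enforced when the
// full-pcpus-only option is set: a CPU request is admissible only if it is
// a multiple of the number of CPUs per physical core.
func exampleSMTCheck(requestedCPUs, cpusPerCore int) error {
if requestedCPUs%cpusPerCore != 0 {
return SMTAlignmentError{
RequestedCPUs: requestedCPUs,
CpusPerCore: cpusPerCore,
CausedByPhysicalCPUs: false,
}
}
return nil
}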
// staticPolicy is a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
//
// This policy allocates CPUs exclusively for a container if all the following
// conditions are met:
//
// - The pod QoS class is Guaranteed.
// - The CPU request is a positive integer.
//
// The static policy maintains the following sets of logical CPUs:
//
// - SHARED: Burstable, BestEffort, and non-integral Guaranteed containers
// run here. Initially this contains all CPU IDs on the system. As
// exclusive allocations are created and destroyed, this CPU set shrinks
// and grows, accordingly. This is stored in the state as the default
// CPU set.
//
// - RESERVED: A subset of the shared pool which is not exclusively
// allocatable. The membership of this pool is static for the lifetime of
// the Kubelet. The size of the reserved pool is
// ceil(systemreserved.cpu + kubereserved.cpu).
// Reserved CPUs are taken topologically starting with lowest-indexed
// physical core, as reported by cAdvisor.
//
// - ASSIGNABLE: Equal to SHARED - RESERVED. Exclusive CPUs are allocated
// from this pool.
//
// - EXCLUSIVE ALLOCATIONS: CPU sets assigned exclusively to one container.
// These are stored as explicit assignments in the state.
//
// When an exclusive allocation is made, the static policy also updates the
// default cpuset in the state abstraction. The CPU manager's periodic
// reconcile loop takes care of rewriting the cpuset in cgroupfs for any
// containers that may be running in the shared pool. For this reason,
// applications running within exclusively-allocated containers must tolerate
// potentially sharing their allocated CPUs for up to the CPU manager
// reconcile period.
type staticPolicy struct {
// cpu socket topology
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reservedCPUs cpuset.CPUSet
// Superset of reservedCPUs. It includes not just the reservedCPUs themselves,
// but also any siblings of those reservedCPUs on the same physical die.
// NOTE: If the reserved set includes full physical CPUs from the beginning
// (e.g. only reserved pairs of core siblings) this set is expected to be
// identical to the reserved set.
reservedPhysicalCPUs cpuset.CPUSet
// topology manager reference to get container Topology affinity
affinity topologymanager.Store
// set of CPUs to reuse across allocations in a pod
cpusToReuse map[string]cpuset.CPUSet
// options allow fine-tuning the behaviour of the policy
options StaticPolicyOptions
// we compute this value multiple times, and it's not supposed to change
// at runtime - the cpumanager can't deal with runtime topology changes anyway.
cpuGroupSize int
}
// Ensure staticPolicy implements Policy interface
var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string) (Policy, error) {
opts, err := NewStaticPolicyOptions(cpuPolicyOptions)
if err != nil {
return nil, err
}
err = ValidateStaticPolicyOptions(opts, topology, affinity)
if err != nil {
return nil, err
}
cpuGroupSize := topology.CPUsPerCore()
klog.InfoS("Static policy created with configuration", "options", opts, "cpuGroupSize", cpuGroupSize)
policy := &staticPolicy{
topology: topology,
affinity: affinity,
cpusToReuse: make(map[string]cpuset.CPUSet),
options: opts,
cpuGroupSize: cpuGroupSize,
}
allCPUs := topology.CPUDetails.CPUs()
var reserved cpuset.CPUSet
if reservedCPUs.Size() > 0 {
reserved = reservedCPUs
} else {
// takeByTopology allocates CPUs associated with low-numbered cores from
// allCPUs.
//
// For example: Given a system with 8 CPUs available and HT enabled,
// if numReservedCPUs=2, then reserved={0,4}
reserved, _ = policy.takeByTopology(allCPUs, numReservedCPUs)
}
if reserved.Size() != numReservedCPUs {
err := fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of %s did not equal %d)", reserved, numReservedCPUs)
return nil, err
}
var reservedPhysicalCPUs cpuset.CPUSet
for _, cpu := range reserved.UnsortedList() {
core, err := topology.CPUCoreID(cpu)
if err != nil {
return nil, fmt.Errorf("[cpumanager] unable to build the reserved physical CPUs from the reserved set: %w", err)
}
reservedPhysicalCPUs = reservedPhysicalCPUs.Union(topology.CPUDetails.CPUsInCores(core))
}
klog.InfoS("Reserved CPUs not available for exclusive assignment", "reservedSize", reserved.Size(), "reserved", reserved, "reservedPhysicalCPUs", reservedPhysicalCPUs)
policy.reservedCPUs = reserved
policy.reservedPhysicalCPUs = reservedPhysicalCPUs
return policy, nil
}
func (p *staticPolicy) Name() string {
return string(PolicyStatic)
}
func (p *staticPolicy) Start(s state.State) error {
if err := p.validateState(s); err != nil {
klog.ErrorS(err, "Static policy invalid state, please drain node and remove policy state file")
return err
}
p.initializeMetrics(s)
return nil
}
func (p *staticPolicy) validateState(s state.State) error {
tmpAssignments := s.GetCPUAssignments()
tmpDefaultCPUset := s.GetDefaultCPUSet()
allCPUs := p.topology.CPUDetails.CPUs()
if p.options.StrictCPUReservation {
allCPUs = allCPUs.Difference(p.reservedCPUs)
}
// Default cpuset cannot be empty when assignments exist
if tmpDefaultCPUset.IsEmpty() {
if len(tmpAssignments) != 0 {
return fmt.Errorf("default cpuset cannot be empty")
}
// state is empty initialize
s.SetDefaultCPUSet(allCPUs)
klog.InfoS("Static policy initialized", "defaultCPUSet", allCPUs)
return nil
}
// State has already been initialized from file (is not empty)
// 1. Check if the reserved cpuset is not part of default cpuset because:
// - kube/system reserved have changed (increased) - may lead to some containers not being able to start
// - user tampered with file
if p.options.StrictCPUReservation {
if !p.reservedCPUs.Intersection(tmpDefaultCPUset).IsEmpty() {
return fmt.Errorf("some of strictly reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
p.reservedCPUs.Intersection(tmpDefaultCPUset).String(), tmpDefaultCPUset.String())
}
} else {
if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) {
return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
p.reservedCPUs.String(), tmpDefaultCPUset.String())
}
}
// 2. Check if state for static policy is consistent
for pod := range tmpAssignments {
for container, cset := range tmpAssignments[pod] {
// None of the cpu in DEFAULT cset should be in s.assignments
if !tmpDefaultCPUset.Intersection(cset).IsEmpty() {
return fmt.Errorf("pod: %s, container: %s cpuset: \"%s\" overlaps with default cpuset \"%s\"",
pod, container, cset.String(), tmpDefaultCPUset.String())
}
}
}
// 3. It's possible that the set of available CPUs has changed since
// the state was written. This can be due to for example
// offlining a CPU when kubelet is not running. If this happens,
// CPU manager will run into trouble when later it tries to
// assign non-existent CPUs to containers. Validate that the
// topology that was received during CPU manager startup matches with
// the set of CPUs stored in the state.
totalKnownCPUs := tmpDefaultCPUset.Clone()
tmpCPUSets := []cpuset.CPUSet{}
for pod := range tmpAssignments {
for _, cset := range tmpAssignments[pod] {
tmpCPUSets = append(tmpCPUSets, cset)
}
}
totalKnownCPUs = totalKnownCPUs.Union(tmpCPUSets...)
if !totalKnownCPUs.Equals(allCPUs) {
return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
allCPUs.String(), totalKnownCPUs.String())
}
return nil
}
// GetAllocatableCPUs returns the total set of CPUs available for allocation.
func (p *staticPolicy) GetAllocatableCPUs(s state.State) cpuset.CPUSet {
return p.topology.CPUDetails.CPUs().Difference(p.reservedCPUs)
}
// GetAvailableCPUs returns the set of unassigned CPUs minus the reserved set.
func (p *staticPolicy) GetAvailableCPUs(s state.State) cpuset.CPUSet {
return s.GetDefaultCPUSet().Difference(p.reservedCPUs)
}
func (p *staticPolicy) GetAvailablePhysicalCPUs(s state.State) cpuset.CPUSet {
return s.GetDefaultCPUSet().Difference(p.reservedPhysicalCPUs)
}
func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, cset cpuset.CPUSet) {
// If entries in p.cpusToReuse exist for pods other than the current pod, delete them.
for podUID := range p.cpusToReuse {
if podUID != string(pod.UID) {
delete(p.cpusToReuse, podUID)
}
}
// If no cpuset exists for cpusToReuse by this pod yet, create one.
if _, ok := p.cpusToReuse[string(pod.UID)]; !ok {
p.cpusToReuse[string(pod.UID)] = cpuset.New()
}
// Check if the container is an init container.
// If so, add its cpuset to the cpuset of reusable CPUs for any new allocations.
for _, initContainer := range pod.Spec.InitContainers {
if container.Name == initContainer.Name {
if podutil.IsRestartableInitContainer(&initContainer) {
// If the container is a restartable init container, we should not
// reuse its cpuset, as a restartable init container can run with
// regular containers.
break
}
p.cpusToReuse[string(pod.UID)] = p.cpusToReuse[string(pod.UID)].Union(cset)
return
}
}
// Otherwise it is an app container.
// Remove its cpuset from the cpuset of reusable CPUs for any new allocations.
p.cpusToReuse[string(pod.UID)] = p.cpusToReuse[string(pod.UID)].Difference(cset)
}
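// Illustrative walk-through (not part of the original source): for a pod with a
// plain init container pinned to CPUs 0-1, followed by an app container that
// requests 2 CPUs and reuses them, the calls behave as:
//
//   p.updateCPUsToReuse(pod, initContainer, cpuset.New(0, 1)) // cpusToReuse[pod] = {0,1}
//   p.updateCPUsToReuse(pod, appContainer, cpuset.New(0, 1))  // app consumed 0-1; cpusToReuse[pod] = {}
//
// A restartable (sidecar) init container hits the break above instead, so its
// CPUs are never offered for reuse.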
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
numCPUs := p.guaranteedCPUs(pod, container)
if numCPUs == 0 {
// container belongs in the shared pool (nothing to do; use default cpuset)
return nil
}
klog.InfoS("Static policy: Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
// container belongs in an exclusively allocated pool
metrics.CPUManagerPinningRequestsTotal.Inc()
defer func() {
if rerr != nil {
metrics.CPUManagerPinningErrorsTotal.Inc()
return
}
if !p.options.FullPhysicalCPUsOnly {
// increment only if we know we allocate aligned resources
return
}
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
}()
if p.options.FullPhysicalCPUsOnly {
if (numCPUs % p.cpuGroupSize) != 0 {
// Since the CPU Manager was enabled with strict SMT alignment requested, a guaranteed pod can only be admitted
// if the number of CPUs requested is a multiple of the number of virtual cpus per physical core.
// If the CPU request is not such a multiple, the Pod will be put in the Failed state, with SMTAlignmentError
// as the reason. Since the allocation happens in terms of physical cores, and the scheduler is responsible
// for ensuring that the workload goes to a node that has enough CPUs, the pod ends up on a node where enough
// physical cores are available to be allocated.
// Just like the regular static-policy behaviour, takeByTopology will first try to allocate CPUs from the same
// socket, and only when the request cannot be satisfied on a single socket is the allocation spread further;
// a workload always occupies all CPUs of each physical core it uses, so individual threads never have to be allocated.
return SMTAlignmentError{
RequestedCPUs: numCPUs,
CpusPerCore: p.cpuGroupSize,
CausedByPhysicalCPUs: false,
}
}
availablePhysicalCPUs := p.GetAvailablePhysicalCPUs(s).Size()
// It's legal to reserve CPUs which are not core siblings. In this case the CPU allocator can descend to single cores
// when picking CPUs, which would void the FullPhysicalCPUsOnly guarantee. To prevent this, we additionally consider
// all the core siblings of the reserved CPUs as unavailable when computing the free CPUs, before starting the actual
// allocation. This way, by construction, every possible CPU allocation whose size is a multiple of the SMT level is
// correct again.
if numCPUs > availablePhysicalCPUs {
return SMTAlignmentError{
RequestedCPUs: numCPUs,
CpusPerCore: p.cpuGroupSize,
AvailablePhysicalCPUs: availablePhysicalCPUs,
CausedByPhysicalCPUs: true,
}
}
}
if cpuset, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
p.updateCPUsToReuse(pod, container, cpuset)
klog.InfoS("Static policy: container already present in state, skipping", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
// Call Topology Manager to get the aligned socket affinity across all hint providers.
hint := p.affinity.GetAffinity(string(pod.UID), container.Name)
klog.InfoS("Topology Affinity", "pod", klog.KObj(pod), "containerName", container.Name, "affinity", hint)
// Allocate CPUs according to the NUMA affinity contained in the hint.
cpuset, err := p.allocateCPUs(s, numCPUs, hint.NUMANodeAffinity, p.cpusToReuse[string(pod.UID)])
if err != nil {
klog.ErrorS(err, "Unable to allocate CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "numCPUs", numCPUs)
return err
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
p.updateCPUsToReuse(pod, container, cpuset)
p.updateMetricsOnAllocate(cpuset)
return nil
}
// getAssignedCPUsOfSiblings returns the assigned CPUs of the given container's siblings (all containers other than the given container) in the pod identified by podUID.
func getAssignedCPUsOfSiblings(s state.State, podUID string, containerName string) cpuset.CPUSet {
assignments := s.GetCPUAssignments()
cset := cpuset.New()
for name, cpus := range assignments[podUID] {
if containerName == name {
continue
}
cset = cset.Union(cpus)
}
return cset
}
func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerName string) error {
klog.InfoS("Static policy: RemoveContainer", "podUID", podUID, "containerName", containerName)
cpusInUse := getAssignedCPUsOfSiblings(s, podUID, containerName)
if toRelease, ok := s.GetCPUSet(podUID, containerName); ok {
s.Delete(podUID, containerName)
// Mutate the shared pool, adding released cpus.
toRelease = toRelease.Difference(cpusInUse)
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
p.updateMetricsOnRelease(toRelease)
}
return nil
}
func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bitmask.BitMask, reusableCPUs cpuset.CPUSet) (cpuset.CPUSet, error) {
klog.InfoS("AllocateCPUs", "numCPUs", numCPUs, "socket", numaAffinity)
allocatableCPUs := p.GetAvailableCPUs(s).Union(reusableCPUs)
// If there are aligned CPUs in numaAffinity, attempt to take those first.
result := cpuset.New()
if numaAffinity != nil {
alignedCPUs := p.getAlignedCPUs(numaAffinity, allocatableCPUs)
numAlignedToAlloc := alignedCPUs.Size()
if numCPUs < numAlignedToAlloc {
numAlignedToAlloc = numCPUs
}
alignedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
if err != nil {
return cpuset.New(), err
}
result = result.Union(alignedCPUs)
}
// Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result), numCPUs-result.Size())
if err != nil {
return cpuset.New(), err
}
result = result.Union(remainingCPUs)
// Remove allocated CPUs from the shared CPUSet.
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result))
klog.InfoS("AllocateCPUs", "result", result)
return result, nil
}
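// Illustrative example (assumption, not part of the original source): with
// numCPUs=4, a NUMA affinity mask of {0}, and only CPUs 2-3 still free on node
// 0, allocateCPUs first takes the two aligned CPUs from node 0 and then takes
// the remaining two from whatever is left elsewhere, honouring the affinity as
// far as possible without failing the allocation.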
func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int {
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
return 0
}
cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
// In-place pod resize feature makes Container.Resources field mutable for CPU & memory.
// AllocatedResources holds the value of Container.Resources.Requests when the pod was admitted.
// We should return this value because this is what kubelet agreed to allocate for the container
// and the value configured with runtime.
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
cpuQuantity = cs.AllocatedResources[v1.ResourceCPU]
}
}
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
// Safe downcast to do for all systems with < 2.1 billion CPUs.
// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
// https://golang.org/ref/spec#Numeric_types
return int(cpuQuantity.Value())
}
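// Worked example (illustrative): a Guaranteed container requesting "1500m" has
// cpuQuantity.Value() == 2 (rounded up) but MilliValue() == 1500, so
// 2*1000 != 1500 and guaranteedCPUs returns 0: the container stays in the
// shared pool. A request of "2" gives Value() == 2 and MilliValue() == 2000,
// so two exclusive CPUs are requested.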
func (p *staticPolicy) podGuaranteedCPUs(pod *v1.Pod) int {
// The maximum of requested CPUs by init containers.
requestedByInitContainers := 0
requestedByRestartableInitContainers := 0
for _, container := range pod.Spec.InitContainers {
if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
continue
}
requestedCPU := p.guaranteedCPUs(pod, &container)
// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/753-sidecar-containers#resources-calculation-for-scheduling-and-pod-admission
// for the detail.
if podutil.IsRestartableInitContainer(&container) {
requestedByRestartableInitContainers += requestedCPU
} else if requestedByRestartableInitContainers+requestedCPU > requestedByInitContainers {
requestedByInitContainers = requestedByRestartableInitContainers + requestedCPU
}
}
// The sum of requested CPUs by app containers.
requestedByAppContainers := 0
for _, container := range pod.Spec.Containers {
if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
continue
}
requestedByAppContainers += p.guaranteedCPUs(pod, &container)
}
requestedByLongRunningContainers := requestedByAppContainers + requestedByRestartableInitContainers
if requestedByInitContainers > requestedByLongRunningContainers {
return requestedByInitContainers
}
return requestedByLongRunningContainers
}
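// Worked example (illustrative): for a pod declaring a restartable (sidecar)
// init container requesting 1 CPU, then a regular init container requesting
// 4 CPUs, then an app container requesting 2 CPUs:
//
//   requestedByRestartableInitContainers = 1
//   requestedByInitContainers            = 1 + 4 = 5 (the sidecar keeps running alongside the init container)
//   requestedByAppContainers             = 2
//   requestedByLongRunningContainers     = 2 + 1 = 3
//
// so podGuaranteedCPUs returns max(5, 3) = 5.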
func (p *staticPolicy) takeByTopology(availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
cpuSortingStrategy := CPUSortingStrategyPacked
if p.options.DistributeCPUsAcrossCores {
cpuSortingStrategy = CPUSortingStrategySpread
}
if p.options.DistributeCPUsAcrossNUMA {
cpuGroupSize := 1
if p.options.FullPhysicalCPUsOnly {
cpuGroupSize = p.cpuGroupSize
}
return takeByTopologyNUMADistributed(p.topology, availableCPUs, numCPUs, cpuGroupSize, cpuSortingStrategy)
}
return takeByTopologyNUMAPacked(p.topology, availableCPUs, numCPUs, cpuSortingStrategy, p.options.PreferAlignByUncoreCacheOption)
}
func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Get a count of how many guaranteed CPUs have been requested.
requested := p.guaranteedCPUs(pod, container)
// The number of requested CPUs is not an integer, or the container is not part of the Guaranteed QoS class.
// It will be treated by the TopologyManager as having no preference and cause it to ignore this
// resource when considering pod alignment.
// In terms of hints, this is equal to: TopologyHints[NUMANodeAffinity: nil, Preferred: true].
if requested == 0 {
return nil
}
// Short circuit to regenerate the same hints if there are already
// guaranteed CPUs allocated to the Container. This might happen after a
// kubelet restart, for example.
if allocated, exists := s.GetCPUSet(string(pod.UID), container.Name); exists {
if allocated.Size() != requested {
klog.InfoS("CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "requestedSize", requested, "allocatedSize", allocated.Size())
// An empty list of hints will be treated as a preference that cannot be satisfied.
// In terms of hints, this is equal to: TopologyHint[NUMANodeAffinity: nil, Preferred: false].
// For all but the best-effort policy, the Topology Manager will throw a pod-admission error.
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): {},
}
}
klog.InfoS("Regenerating TopologyHints for CPUs already allocated", "pod", klog.KObj(pod), "containerName", container.Name)
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): p.generateCPUTopologyHints(allocated, cpuset.CPUSet{}, requested),
}
}
// Get a list of available CPUs.
available := p.GetAvailableCPUs(s)
// Get a list of reusable CPUs (e.g. CPUs reused from initContainers).
// It should be an empty CPUSet for a newly created pod.
reusable := p.cpusToReuse[string(pod.UID)]
// Generate hints.
cpuHints := p.generateCPUTopologyHints(available, reusable, requested)
klog.InfoS("TopologyHints generated", "pod", klog.KObj(pod), "containerName", container.Name, "cpuHints", cpuHints)
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): cpuHints,
}
}
func (p *staticPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Get a count of how many guaranteed CPUs have been requested by Pod.
requested := p.podGuaranteedCPUs(pod)
// The number of requested CPUs is not an integer, or the pod is not part of the Guaranteed QoS class.
// It will be treated by the TopologyManager as having no preference and cause it to ignore this
// resource when considering pod alignment.
// In terms of hints, this is equal to: TopologyHints[NUMANodeAffinity: nil, Preferred: true].
if requested == 0 {
return nil
}
assignedCPUs := cpuset.New()
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
requestedByContainer := p.guaranteedCPUs(pod, &container)
// Short circuit to regenerate the same hints if there are already
// guaranteed CPUs allocated to the Container. This might happen after a
// kubelet restart, for example.
if allocated, exists := s.GetCPUSet(string(pod.UID), container.Name); exists {
if allocated.Size() != requestedByContainer {
klog.InfoS("CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "allocatedSize", requested, "requestedByContainer", requestedByContainer, "allocatedSize", allocated.Size())
// An empty list of hints will be treated as a preference that cannot be satisfied.
// In terms of hints, this is equal to: TopologyHint[NUMANodeAffinity: nil, Preferred: false].
// For all but the best-effort policy, the Topology Manager will throw a pod-admission error.
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): {},
}
}
// A set of CPUs already assigned to containers in this pod
assignedCPUs = assignedCPUs.Union(allocated)
}
}
if assignedCPUs.Size() == requested {
klog.InfoS("Regenerating TopologyHints for CPUs already allocated", "pod", klog.KObj(pod))
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): p.generateCPUTopologyHints(assignedCPUs, cpuset.CPUSet{}, requested),
}
}
// Get a list of available CPUs.
available := p.GetAvailableCPUs(s)
// Get a list of reusable CPUs (e.g. CPUs reused from initContainers).
// It should be an empty CPUSet for a newly created pod.
reusable := p.cpusToReuse[string(pod.UID)]
// Ensure any CPUs already assigned to containers in this pod are included as part of the hint generation.
reusable = reusable.Union(assignedCPUs)
// Generate hints.
cpuHints := p.generateCPUTopologyHints(available, reusable, requested)
klog.InfoS("TopologyHints generated", "pod", klog.KObj(pod), "cpuHints", cpuHints)
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): cpuHints,
}
}
// generateCPUTopologyHints generates a set of TopologyHints given the set of
// available CPUs and the number of CPUs being requested.
//
// It follows the convention of marking all hints that have the same number of
// bits set as the narrowest matching NUMANodeAffinity with 'Preferred: true', and
// marking all others with 'Preferred: false'.
func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reusableCPUs cpuset.CPUSet, request int) []topologymanager.TopologyHint {
// Initialize minAffinitySize to include all NUMA Nodes.
minAffinitySize := p.topology.CPUDetails.NUMANodes().Size()
// Iterate through all combinations of numa nodes bitmask and build hints from them.
hints := []topologymanager.TopologyHint{}
bitmask.IterateBitMasks(p.topology.CPUDetails.NUMANodes().List(), func(mask bitmask.BitMask) {
// First, update minAffinitySize for the current request size.
cpusInMask := p.topology.CPUDetails.CPUsInNUMANodes(mask.GetBits()...).Size()
if cpusInMask >= request && mask.Count() < minAffinitySize {
minAffinitySize = mask.Count()
}
// Then check to see if we have enough CPUs available on the current
// numa node bitmask to satisfy the CPU request.
numMatching := 0
for _, c := range reusableCPUs.List() {
// Disregard this mask if the NUMA node of this reusable CPU is not part of it.
if !mask.IsSet(p.topology.CPUDetails[c].NUMANodeID) {
return
}
numMatching++
}
// Finally, check to see if enough available CPUs remain on the current
// NUMA node combination to satisfy the CPU request.
for _, c := range availableCPUs.List() {
if mask.IsSet(p.topology.CPUDetails[c].NUMANodeID) {
numMatching++
}
}
// If they don't, then move onto the next combination.
if numMatching < request {
return
}
// Otherwise, create a new hint from the numa node bitmask and add it to the
// list of hints. We set all hint preferences to 'false' on the first
// pass through.
hints = append(hints, topologymanager.TopologyHint{
NUMANodeAffinity: mask,
Preferred: false,
})
})
// Loop back through all hints and update the 'Preferred' field based on
// counting the number of bits sets in the affinity mask and comparing it
// to the minAffinitySize. Only those with an equal number of bits set (and
// with a minimal set of numa nodes) will be considered preferred.
for i := range hints {
if p.options.AlignBySocket && p.isHintSocketAligned(hints[i], minAffinitySize) {
hints[i].Preferred = true
continue
}
if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
hints[i].Preferred = true
}
}
return hints
}
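// Illustrative example (not part of the original source): on a machine with
// two NUMA nodes, four free CPUs on each, and a request for 4 CPUs, the masks
// {0}, {1} and {0,1} all have enough CPUs, so minAffinitySize ends up as 1 and
// the generated hints are:
//
//   {NUMANodeAffinity: {0},   Preferred: true}
//   {NUMANodeAffinity: {1},   Preferred: true}
//   {NUMANodeAffinity: {0,1}, Preferred: false}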
// isHintSocketAligned returns true if the NUMA nodes in the hint are socket aligned.
func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool {
numaNodesBitMask := hint.NUMANodeAffinity.GetBits()
numaNodesPerSocket := p.topology.NumNUMANodes / p.topology.NumSockets
if numaNodesPerSocket == 0 {
return false
}
// minSockets is the minimum number of sockets required to satisfy the allocation.
// A hint is considered socket aligned if the number of sockets its NUMA nodes span equals minSockets.
minSockets := (minAffinitySize + numaNodesPerSocket - 1) / numaNodesPerSocket
return p.topology.CPUDetails.SocketsInNUMANodes(numaNodesBitMask...).Size() == minSockets
}
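// Worked example (illustrative): with 4 NUMA nodes spread over 2 sockets,
// numaNodesPerSocket = 2; for minAffinitySize = 3, minSockets = (3+2-1)/2 = 2,
// so a hint whose NUMA nodes span exactly two sockets counts as socket aligned.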
// getAlignedCPUs returns the set of aligned CPUs based on the NUMA affinity mask and the configured policy options.
func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableCPUs cpuset.CPUSet) cpuset.CPUSet {
alignedCPUs := cpuset.New()
numaBits := numaAffinity.GetBits()
// If the align-by-socket policy option is enabled, the NUMA-based hint is expanded to a
// socket-aligned hint. This ensures that socket-aligned available CPUs are allocated first,
// before we try to find CPUs across sockets to satisfy the allocation request.
if p.options.AlignBySocket {
socketBits := p.topology.CPUDetails.SocketsInNUMANodes(numaBits...).UnsortedList()
for _, socketID := range socketBits {
alignedCPUs = alignedCPUs.Union(allocatableCPUs.Intersection(p.topology.CPUDetails.CPUsInSockets(socketID)))
}
return alignedCPUs
}
for _, numaNodeID := range numaBits {
alignedCPUs = alignedCPUs.Union(allocatableCPUs.Intersection(p.topology.CPUDetails.CPUsInNUMANodes(numaNodeID)))
}
return alignedCPUs
}
func (p *staticPolicy) initializeMetrics(s state.State) {
metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
}
func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
ncpus := cset.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(-ncpus * 1000))
}
func (p *staticPolicy) updateMetricsOnRelease(cset cpuset.CPUSet) {
ncpus := cset.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(-ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(ncpus * 1000))
}
func countExclusiveCPUs(s state.State) int {
exclusiveCPUs := 0
for _, cpuAssign := range s.GetCPUAssignments() {
for _, cset := range cpuAssign {
exclusiveCPUs += cset.Size()
}
}
return exclusiveCPUs
}

View File

@ -0,0 +1,135 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"encoding/json"
"fmt"
"hash/fnv"
"strings"
"k8s.io/apimachinery/pkg/util/dump"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
var _ checkpointmanager.Checkpoint = &CPUManagerCheckpointV1{}
var _ checkpointmanager.Checkpoint = &CPUManagerCheckpointV2{}
var _ checkpointmanager.Checkpoint = &CPUManagerCheckpoint{}
// CPUManagerCheckpoint struct is used to store cpu/pod assignments in a checkpoint in v2 format
type CPUManagerCheckpoint struct {
PolicyName string `json:"policyName"`
DefaultCPUSet string `json:"defaultCpuSet"`
Entries map[string]map[string]string `json:"entries,omitempty"`
Checksum checksum.Checksum `json:"checksum"`
}
// CPUManagerCheckpointV1 struct is used to store cpu/pod assignments in a checkpoint in v1 format
type CPUManagerCheckpointV1 struct {
PolicyName string `json:"policyName"`
DefaultCPUSet string `json:"defaultCpuSet"`
Entries map[string]string `json:"entries,omitempty"`
Checksum checksum.Checksum `json:"checksum"`
}
// CPUManagerCheckpointV2 struct is used to store cpu/pod assignments in a checkpoint in v2 format
type CPUManagerCheckpointV2 = CPUManagerCheckpoint
// NewCPUManagerCheckpoint returns an instance of Checkpoint
func NewCPUManagerCheckpoint() *CPUManagerCheckpoint {
//nolint:staticcheck // unexported-type-in-api user-facing error message
return newCPUManagerCheckpointV2()
}
func newCPUManagerCheckpointV1() *CPUManagerCheckpointV1 {
return &CPUManagerCheckpointV1{
Entries: make(map[string]string),
}
}
func newCPUManagerCheckpointV2() *CPUManagerCheckpointV2 {
return &CPUManagerCheckpointV2{
Entries: make(map[string]map[string]string),
}
}
// MarshalCheckpoint returns marshalled checkpoint in v1 format
func (cp *CPUManagerCheckpointV1) MarshalCheckpoint() ([]byte, error) {
// make sure checksum wasn't set before so it doesn't affect output checksum
cp.Checksum = 0
cp.Checksum = checksum.New(cp)
return json.Marshal(*cp)
}
// MarshalCheckpoint returns marshalled checkpoint in v2 format
func (cp *CPUManagerCheckpointV2) MarshalCheckpoint() ([]byte, error) {
// make sure checksum wasn't set before so it doesn't affect output checksum
cp.Checksum = 0
cp.Checksum = checksum.New(cp)
return json.Marshal(*cp)
}
// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint in v1 format
func (cp *CPUManagerCheckpointV1) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint in v2 format
func (cp *CPUManagerCheckpointV2) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// VerifyChecksum verifies that current checksum of checkpoint is valid in v1 format
func (cp *CPUManagerCheckpointV1) VerifyChecksum() error {
if cp.Checksum == 0 {
// accept empty checksum for compatibility with old file backend
return nil
}
ck := cp.Checksum
cp.Checksum = 0
object := dump.ForHash(cp)
object = strings.Replace(object, "CPUManagerCheckpointV1", "CPUManagerCheckpoint", 1)
cp.Checksum = ck
hash := fnv.New32a()
fmt.Fprintf(hash, "%v", object)
actualCS := checksum.Checksum(hash.Sum32())
if cp.Checksum != actualCS {
return &errors.CorruptCheckpointError{
ActualCS: uint64(actualCS),
ExpectedCS: uint64(cp.Checksum),
}
}
return nil
}
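// Note on the Replace above (an inference, not stated in the original source):
// v1 checkpoints were written while the struct was still named
// "CPUManagerCheckpoint", and the type name is part of dump.ForHash's output,
// so the hash must be computed over a dump carrying the old name or every
// legacy checkpoint would fail verification.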
// VerifyChecksum verifies that current checksum of checkpoint is valid in v2 format
func (cp *CPUManagerCheckpointV2) VerifyChecksum() error {
if cp.Checksum == 0 {
// accept empty checksum for compatibility with old file backend
return nil
}
ck := cp.Checksum
cp.Checksum = 0
err := ck.Verify(cp)
cp.Checksum = ck
return err
}

View File

@ -0,0 +1,58 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"k8s.io/utils/cpuset"
)
// ContainerCPUAssignments type used in cpu manager state
type ContainerCPUAssignments map[string]map[string]cpuset.CPUSet
// Clone returns a copy of ContainerCPUAssignments
func (as ContainerCPUAssignments) Clone() ContainerCPUAssignments {
ret := make(ContainerCPUAssignments, len(as))
for pod := range as {
ret[pod] = make(map[string]cpuset.CPUSet, len(as[pod]))
for container, cset := range as[pod] {
ret[pod][container] = cset
}
}
return ret
}
// Reader interface used to read current cpu/pod assignment state
type Reader interface {
GetCPUSet(podUID string, containerName string) (cpuset.CPUSet, bool)
GetDefaultCPUSet() cpuset.CPUSet
GetCPUSetOrDefault(podUID string, containerName string) cpuset.CPUSet
GetCPUAssignments() ContainerCPUAssignments
}
type writer interface {
SetCPUSet(podUID string, containerName string, cpuset cpuset.CPUSet)
SetDefaultCPUSet(cpuset cpuset.CPUSet)
SetCPUAssignments(ContainerCPUAssignments)
Delete(podUID string, containerName string)
ClearState()
}
// State interface provides methods for tracking and setting cpu/pod assignment
type State interface {
Reader
writer
}

View File

@ -0,0 +1,250 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"fmt"
"path/filepath"
"sync"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/utils/cpuset"
)
var _ State = &stateCheckpoint{}
type stateCheckpoint struct {
mux sync.RWMutex
policyName string
cache State
checkpointManager checkpointmanager.CheckpointManager
checkpointName string
initialContainers containermap.ContainerMap
}
// NewCheckpointState creates new State for keeping track of cpu/pod assignment with checkpoint backend
func NewCheckpointState(stateDir, checkpointName, policyName string, initialContainers containermap.ContainerMap) (State, error) {
checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
if err != nil {
return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
}
stateCheckpoint := &stateCheckpoint{
cache: NewMemoryState(),
policyName: policyName,
checkpointManager: checkpointManager,
checkpointName: checkpointName,
initialContainers: initialContainers,
}
if err := stateCheckpoint.restoreState(); err != nil {
//nolint:staticcheck // ST1005 user-facing error message
return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete the CPU manager checkpoint file %q before restarting Kubelet",
err, filepath.Join(stateDir, checkpointName))
}
return stateCheckpoint, nil
}
// migrateV1CheckpointToV2Checkpoint() converts checkpoints from the v1 format to the v2 format
func (sc *stateCheckpoint) migrateV1CheckpointToV2Checkpoint(src *CPUManagerCheckpointV1, dst *CPUManagerCheckpointV2) error {
if src.PolicyName != "" {
dst.PolicyName = src.PolicyName
}
if src.DefaultCPUSet != "" {
dst.DefaultCPUSet = src.DefaultCPUSet
}
for containerID, cset := range src.Entries {
podUID, containerName, err := sc.initialContainers.GetContainerRef(containerID)
if err != nil {
return fmt.Errorf("containerID '%v' not found in initial containers list", containerID)
}
if dst.Entries == nil {
dst.Entries = make(map[string]map[string]string)
}
if _, exists := dst.Entries[podUID]; !exists {
dst.Entries[podUID] = make(map[string]string)
}
dst.Entries[podUID][containerName] = cset
}
return nil
}
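// Illustrative example (hypothetical IDs, not from the original source): a v1
// entry keyed by container ID,
//
//   src.Entries["0123abcd"] = "0-3"
//
// becomes, once sc.initialContainers.GetContainerRef("0123abcd") resolves to
// podUID "pod-1" and containerName "app", the v2 nested form
//
//   dst.Entries["pod-1"]["app"] = "0-3"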
// restores state from a checkpoint and creates it if it doesn't exist
func (sc *stateCheckpoint) restoreState() error {
sc.mux.Lock()
defer sc.mux.Unlock()
var err error
checkpointV1 := newCPUManagerCheckpointV1()
checkpointV2 := newCPUManagerCheckpointV2()
if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpointV1); err != nil {
checkpointV1 = &CPUManagerCheckpointV1{} // reset it back to 0
if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpointV2); err != nil {
if err == errors.ErrCheckpointNotFound {
return sc.storeState()
}
return err
}
}
if err = sc.migrateV1CheckpointToV2Checkpoint(checkpointV1, checkpointV2); err != nil {
return fmt.Errorf("error migrating v1 checkpoint state to v2 checkpoint state: %s", err)
}
if sc.policyName != checkpointV2.PolicyName {
return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", sc.policyName, checkpointV2.PolicyName)
}
var tmpDefaultCPUSet cpuset.CPUSet
if tmpDefaultCPUSet, err = cpuset.Parse(checkpointV2.DefaultCPUSet); err != nil {
return fmt.Errorf("could not parse default cpu set %q: %v", checkpointV2.DefaultCPUSet, err)
}
var tmpContainerCPUSet cpuset.CPUSet
tmpAssignments := ContainerCPUAssignments{}
for pod := range checkpointV2.Entries {
tmpAssignments[pod] = make(map[string]cpuset.CPUSet, len(checkpointV2.Entries[pod]))
for container, cpuString := range checkpointV2.Entries[pod] {
if tmpContainerCPUSet, err = cpuset.Parse(cpuString); err != nil {
return fmt.Errorf("could not parse cpuset %q for container %q in pod %q: %v", cpuString, container, pod, err)
}
tmpAssignments[pod][container] = tmpContainerCPUSet
}
}
sc.cache.SetDefaultCPUSet(tmpDefaultCPUSet)
sc.cache.SetCPUAssignments(tmpAssignments)
klog.V(2).InfoS("State checkpoint: restored state from checkpoint")
klog.V(2).InfoS("State checkpoint: defaultCPUSet", "defaultCpuSet", tmpDefaultCPUSet.String())
return nil
}
// saves state to a checkpoint, caller is responsible for locking
func (sc *stateCheckpoint) storeState() error {
checkpoint := NewCPUManagerCheckpoint()
checkpoint.PolicyName = sc.policyName
checkpoint.DefaultCPUSet = sc.cache.GetDefaultCPUSet().String()
assignments := sc.cache.GetCPUAssignments()
for pod := range assignments {
checkpoint.Entries[pod] = make(map[string]string, len(assignments[pod]))
for container, cset := range assignments[pod] {
checkpoint.Entries[pod][container] = cset.String()
}
}
err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
if err != nil {
klog.ErrorS(err, "Failed to save checkpoint")
return err
}
return nil
}
// GetCPUSet returns current CPU set
func (sc *stateCheckpoint) GetCPUSet(podUID string, containerName string) (cpuset.CPUSet, bool) {
sc.mux.RLock()
defer sc.mux.RUnlock()
res, ok := sc.cache.GetCPUSet(podUID, containerName)
return res, ok
}
// GetDefaultCPUSet returns default CPU set
func (sc *stateCheckpoint) GetDefaultCPUSet() cpuset.CPUSet {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetDefaultCPUSet()
}
// GetCPUSetOrDefault returns current CPU set, or default one if it wasn't changed
func (sc *stateCheckpoint) GetCPUSetOrDefault(podUID string, containerName string) cpuset.CPUSet {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetCPUSetOrDefault(podUID, containerName)
}
// GetCPUAssignments returns current CPU to pod assignments
func (sc *stateCheckpoint) GetCPUAssignments() ContainerCPUAssignments {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetCPUAssignments()
}
// SetCPUSet sets CPU set
func (sc *stateCheckpoint) SetCPUSet(podUID string, containerName string, cset cpuset.CPUSet) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetCPUSet(podUID, containerName, cset)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// SetDefaultCPUSet sets default CPU set
func (sc *stateCheckpoint) SetDefaultCPUSet(cset cpuset.CPUSet) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetDefaultCPUSet(cset)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// SetCPUAssignments sets CPU to pod assignments
func (sc *stateCheckpoint) SetCPUAssignments(a ContainerCPUAssignments) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetCPUAssignments(a)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// Delete deletes assignment for specified pod
func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// ClearState clears the state and saves it in a checkpoint
func (sc *stateCheckpoint) ClearState() {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}

View File

@ -0,0 +1,117 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"sync"
"k8s.io/klog/v2"
"k8s.io/utils/cpuset"
)
type stateMemory struct {
sync.RWMutex
assignments ContainerCPUAssignments
defaultCPUSet cpuset.CPUSet
}
var _ State = &stateMemory{}
// NewMemoryState creates new State for keeping track of cpu/pod assignment
func NewMemoryState() State {
klog.InfoS("Initialized new in-memory state store")
return &stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.New(),
}
}
func (s *stateMemory) GetCPUSet(podUID string, containerName string) (cpuset.CPUSet, bool) {
s.RLock()
defer s.RUnlock()
res, ok := s.assignments[podUID][containerName]
return res.Clone(), ok
}
func (s *stateMemory) GetDefaultCPUSet() cpuset.CPUSet {
s.RLock()
defer s.RUnlock()
return s.defaultCPUSet.Clone()
}
func (s *stateMemory) GetCPUSetOrDefault(podUID string, containerName string) cpuset.CPUSet {
if res, ok := s.GetCPUSet(podUID, containerName); ok {
return res
}
return s.GetDefaultCPUSet()
}
func (s *stateMemory) GetCPUAssignments() ContainerCPUAssignments {
s.RLock()
defer s.RUnlock()
return s.assignments.Clone()
}
func (s *stateMemory) SetCPUSet(podUID string, containerName string, cset cpuset.CPUSet) {
s.Lock()
defer s.Unlock()
if _, ok := s.assignments[podUID]; !ok {
s.assignments[podUID] = make(map[string]cpuset.CPUSet)
}
s.assignments[podUID][containerName] = cset
klog.InfoS("Updated desired CPUSet", "podUID", podUID, "containerName", containerName, "cpuSet", cset)
}
func (s *stateMemory) SetDefaultCPUSet(cset cpuset.CPUSet) {
s.Lock()
defer s.Unlock()
s.defaultCPUSet = cset
klog.InfoS("Updated default CPUSet", "cpuSet", cset)
}
func (s *stateMemory) SetCPUAssignments(a ContainerCPUAssignments) {
s.Lock()
defer s.Unlock()
s.assignments = a.Clone()
klog.InfoS("Updated CPUSet assignments", "assignments", a)
}
func (s *stateMemory) Delete(podUID string, containerName string) {
s.Lock()
defer s.Unlock()
delete(s.assignments[podUID], containerName)
if len(s.assignments[podUID]) == 0 {
delete(s.assignments, podUID)
}
klog.V(2).InfoS("Deleted CPUSet assignment", "podUID", podUID, "containerName", containerName)
}
func (s *stateMemory) ClearState() {
s.Lock()
defer s.Unlock()
s.defaultCPUSet = cpuset.CPUSet{}
s.assignments = make(ContainerCPUAssignments)
klog.V(2).InfoS("Cleared state")
}

View File

@ -0,0 +1,18 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package topology contains helpers for the CPU manager.
package topology // import "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"

View File

@ -0,0 +1,389 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topology
import (
"fmt"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/klog/v2"
"k8s.io/utils/cpuset"
)
// NUMANodeInfo is a map from NUMANode ID to a list of CPU IDs associated with
// that NUMANode.
type NUMANodeInfo map[int]cpuset.CPUSet
// CPUDetails is a map from CPU ID to Core ID, Socket ID, and NUMA ID.
type CPUDetails map[int]CPUInfo
// CPUTopology contains details of node cpu, where:
// CPU - logical CPU, cadvisor - thread
// Core - physical CPU, cadvisor - Core
// Socket - socket, cadvisor - Socket
// NUMA Node - NUMA cell, cadvisor - Node
// UncoreCache - Split L3 Cache Topology, cadvisor
type CPUTopology struct {
NumCPUs int
NumCores int
NumUncoreCache int
NumSockets int
NumNUMANodes int
CPUDetails CPUDetails
}
// CPUsPerCore returns the number of logical CPUs associated with
// each core.
func (topo *CPUTopology) CPUsPerCore() int {
if topo.NumCores == 0 {
return 0
}
return topo.NumCPUs / topo.NumCores
}
// CPUsPerSocket returns the number of logical CPUs associated with
// each socket.
func (topo *CPUTopology) CPUsPerSocket() int {
if topo.NumSockets == 0 {
return 0
}
return topo.NumCPUs / topo.NumSockets
}
// CPUsPerUncore returns the number of logical CPUs associated with
// each UncoreCache.
func (topo *CPUTopology) CPUsPerUncore() int {
if topo.NumUncoreCache == 0 {
return 0
}
return topo.NumCPUs / topo.NumUncoreCache
}
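// Worked example (illustrative): a 2-socket machine with 16 physical cores,
// SMT enabled (32 logical CPUs) and 4 uncore (L3) caches has
// CPUsPerCore() == 32/16 == 2, CPUsPerSocket() == 32/2 == 16 and
// CPUsPerUncore() == 32/4 == 8.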
// CPUCoreID returns the physical core ID which the given logical CPU
// belongs to.
func (topo *CPUTopology) CPUCoreID(cpu int) (int, error) {
info, ok := topo.CPUDetails[cpu]
if !ok {
return -1, fmt.Errorf("unknown CPU ID: %d", cpu)
}
return info.CoreID, nil
}
// CPUSocketID returns the socket ID which the given logical CPU belongs to.
func (topo *CPUTopology) CPUSocketID(cpu int) (int, error) {
info, ok := topo.CPUDetails[cpu]
if !ok {
return -1, fmt.Errorf("unknown CPU ID: %d", cpu)
}
return info.SocketID, nil
}
// CPUNUMANodeID returns the NUMA node ID which the given logical CPU belongs to.
func (topo *CPUTopology) CPUNUMANodeID(cpu int) (int, error) {
info, ok := topo.CPUDetails[cpu]
if !ok {
return -1, fmt.Errorf("unknown CPU ID: %d", cpu)
}
return info.NUMANodeID, nil
}
// CPUInfo contains the NUMA, socket, UncoreCache and core IDs associated with a CPU.
type CPUInfo struct {
NUMANodeID int
SocketID int
CoreID int
UncoreCacheID int
}
// KeepOnly returns a new CPUDetails object with only the supplied cpus.
func (d CPUDetails) KeepOnly(cpus cpuset.CPUSet) CPUDetails {
result := CPUDetails{}
for cpu, info := range d {
if cpus.Contains(cpu) {
result[cpu] = info
}
}
return result
}
// UncoreCaches returns all of the uncore cache IDs (L3 index) associated with the CPUs in this CPUDetails.
func (d CPUDetails) UncoreCaches() cpuset.CPUSet {
var numUnCoreIDs []int
for _, info := range d {
numUnCoreIDs = append(numUnCoreIDs, info.UncoreCacheID)
}
return cpuset.New(numUnCoreIDs...)
}
// UncoreInNUMANodes returns all of the uncore cache IDs associated with the given
// NUMANode IDs in this CPUDetails.
func (d CPUDetails) UncoreInNUMANodes(ids ...int) cpuset.CPUSet {
var unCoreIDs []int
for _, id := range ids {
for _, info := range d {
if info.NUMANodeID == id {
unCoreIDs = append(unCoreIDs, info.UncoreCacheID)
}
}
}
return cpuset.New(unCoreIDs...)
}
// CoresNeededInUncoreCache returns either the full list of all available unique core IDs associated with the given
// UncoreCache IDs in this CPUDetails, or a subset that matches the requested number of cores.
func (d CPUDetails) CoresNeededInUncoreCache(numCoresNeeded int, ids ...int) cpuset.CPUSet {
coreIDs := d.coresInUncoreCache(ids...)
if coreIDs.Size() <= numCoresNeeded {
return coreIDs
}
tmpCoreIDs := coreIDs.List()
return cpuset.New(tmpCoreIDs[:numCoresNeeded]...)
}
// coresInUncoreCache is a helper that returns the core IDs associated with the given uncore cache IDs.
func (d CPUDetails) coresInUncoreCache(ids ...int) cpuset.CPUSet {
var coreIDs []int
for _, id := range ids {
for _, info := range d {
if info.UncoreCacheID == id {
coreIDs = append(coreIDs, info.CoreID)
}
}
}
return cpuset.New(coreIDs...)
}
// CPUsInUncoreCaches returns all the logical CPU IDs associated with the given
// UnCoreCache IDs in this CPUDetails
func (d CPUDetails) CPUsInUncoreCaches(ids ...int) cpuset.CPUSet {
var cpuIDs []int
for _, id := range ids {
for cpu, info := range d {
if info.UncoreCacheID == id {
cpuIDs = append(cpuIDs, cpu)
}
}
}
return cpuset.New(cpuIDs...)
}
// NUMANodes returns all of the NUMANode IDs associated with the CPUs in this
// CPUDetails.
func (d CPUDetails) NUMANodes() cpuset.CPUSet {
var numaNodeIDs []int
for _, info := range d {
numaNodeIDs = append(numaNodeIDs, info.NUMANodeID)
}
return cpuset.New(numaNodeIDs...)
}
// NUMANodesInSockets returns all of the logical NUMANode IDs associated with
// the given socket IDs in this CPUDetails.
func (d CPUDetails) NUMANodesInSockets(ids ...int) cpuset.CPUSet {
var numaNodeIDs []int
for _, id := range ids {
for _, info := range d {
if info.SocketID == id {
numaNodeIDs = append(numaNodeIDs, info.NUMANodeID)
}
}
}
return cpuset.New(numaNodeIDs...)
}
// Sockets returns all of the socket IDs associated with the CPUs in this
// CPUDetails.
func (d CPUDetails) Sockets() cpuset.CPUSet {
var socketIDs []int
for _, info := range d {
socketIDs = append(socketIDs, info.SocketID)
}
return cpuset.New(socketIDs...)
}
// CPUsInSockets returns all of the logical CPU IDs associated with the given
// socket IDs in this CPUDetails.
func (d CPUDetails) CPUsInSockets(ids ...int) cpuset.CPUSet {
var cpuIDs []int
for _, id := range ids {
for cpu, info := range d {
if info.SocketID == id {
cpuIDs = append(cpuIDs, cpu)
}
}
}
return cpuset.New(cpuIDs...)
}
// SocketsInNUMANodes returns all of the logical Socket IDs associated with the
// given NUMANode IDs in this CPUDetails.
func (d CPUDetails) SocketsInNUMANodes(ids ...int) cpuset.CPUSet {
var socketIDs []int
for _, id := range ids {
for _, info := range d {
if info.NUMANodeID == id {
socketIDs = append(socketIDs, info.SocketID)
}
}
}
return cpuset.New(socketIDs...)
}
// Cores returns all of the core IDs associated with the CPUs in this
// CPUDetails.
func (d CPUDetails) Cores() cpuset.CPUSet {
var coreIDs []int
for _, info := range d {
coreIDs = append(coreIDs, info.CoreID)
}
return cpuset.New(coreIDs...)
}
// CoresInNUMANodes returns all of the core IDs associated with the given
// NUMANode IDs in this CPUDetails.
func (d CPUDetails) CoresInNUMANodes(ids ...int) cpuset.CPUSet {
var coreIDs []int
for _, id := range ids {
for _, info := range d {
if info.NUMANodeID == id {
coreIDs = append(coreIDs, info.CoreID)
}
}
}
return cpuset.New(coreIDs...)
}
// CoresInSockets returns all of the core IDs associated with the given socket
// IDs in this CPUDetails.
func (d CPUDetails) CoresInSockets(ids ...int) cpuset.CPUSet {
var coreIDs []int
for _, id := range ids {
for _, info := range d {
if info.SocketID == id {
coreIDs = append(coreIDs, info.CoreID)
}
}
}
return cpuset.New(coreIDs...)
}
// CPUs returns all of the logical CPU IDs in this CPUDetails.
func (d CPUDetails) CPUs() cpuset.CPUSet {
var cpuIDs []int
for cpuID := range d {
cpuIDs = append(cpuIDs, cpuID)
}
return cpuset.New(cpuIDs...)
}
// CPUsInNUMANodes returns all of the logical CPU IDs associated with the given
// NUMANode IDs in this CPUDetails.
func (d CPUDetails) CPUsInNUMANodes(ids ...int) cpuset.CPUSet {
var cpuIDs []int
for _, id := range ids {
for cpu, info := range d {
if info.NUMANodeID == id {
cpuIDs = append(cpuIDs, cpu)
}
}
}
return cpuset.New(cpuIDs...)
}
// CPUsInCores returns all of the logical CPU IDs associated with the given
// core IDs in this CPUDetails.
func (d CPUDetails) CPUsInCores(ids ...int) cpuset.CPUSet {
var cpuIDs []int
for _, id := range ids {
for cpu, info := range d {
if info.CoreID == id {
cpuIDs = append(cpuIDs, cpu)
}
}
}
return cpuset.New(cpuIDs...)
}
func getUncoreCacheID(core cadvisorapi.Core) int {
if len(core.UncoreCaches) < 1 {
// In case cAdvisor reports no uncore caches, fall back to socket alignment, since the uncore cache is not shared
return core.SocketID
}
// Even though the cadvisor API returns a slice, we only expect either 0 or 1 uncore caches,
// so everything past the first entry is discarded or ignored
return core.UncoreCaches[0].Id
}
// Discover returns CPUTopology based on cadvisor node info
func Discover(machineInfo *cadvisorapi.MachineInfo) (*CPUTopology, error) {
if machineInfo.NumCores == 0 {
return nil, fmt.Errorf("could not detect number of cpus")
}
CPUDetails := CPUDetails{}
numPhysicalCores := 0
for _, node := range machineInfo.Topology {
numPhysicalCores += len(node.Cores)
for _, core := range node.Cores {
if coreID, err := getUniqueCoreID(core.Threads); err == nil {
for _, cpu := range core.Threads {
CPUDetails[cpu] = CPUInfo{
CoreID: coreID,
SocketID: core.SocketID,
NUMANodeID: node.Id,
UncoreCacheID: getUncoreCacheID(core),
}
}
} else {
klog.ErrorS(nil, "Could not get unique coreID for socket", "socket", core.SocketID, "core", core.Id, "threads", core.Threads)
return nil, err
}
}
}
return &CPUTopology{
NumCPUs: machineInfo.NumCores,
NumSockets: machineInfo.NumSockets,
NumCores: numPhysicalCores,
NumNUMANodes: CPUDetails.NUMANodes().Size(),
NumUncoreCache: CPUDetails.UncoreCaches().Size(),
CPUDetails: CPUDetails,
}, nil
}
// getUniqueCoreID computes the coreID as the lowest cpuID
// in the given Threads []int slice. This ensures that core IDs are
// unique across the whole platform (unlike the per-socket core IDs cAdvisor reports).
func getUniqueCoreID(threads []int) (coreID int, err error) {
if len(threads) == 0 {
return 0, fmt.Errorf("no cpus provided")
}
if len(threads) != cpuset.New(threads...).Size() {
return 0, fmt.Errorf("cpus provided are not unique")
}
min := threads[0]
for _, thread := range threads[1:] {
if thread < min {
min = thread
}
}
return min, nil
}
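// Worked example (illustrative): for an SMT sibling pair reported by cAdvisor
// as Threads [4, 12], getUniqueCoreID returns 4, so the core gets the same ID
// no matter which socket-local core ID cAdvisor assigned to it.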

View File

@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers: []
reviewers:
- klueska
emeritus_approvers:
- vishh
- jiayingz

View File

@ -0,0 +1,109 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checkpoint
import (
"encoding/json"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)
// DeviceManagerCheckpoint defines the operations to retrieve pod devices
type DeviceManagerCheckpoint interface {
checkpointmanager.Checkpoint
GetData() ([]PodDevicesEntry, map[string][]string)
}
// DevicesPerNUMA represents device ids obtained from device plugin per NUMA node id
type DevicesPerNUMA map[int64][]string
// PodDevicesEntry connects pod information to devices
type PodDevicesEntry struct {
PodUID string
ContainerName string
ResourceName string
DeviceIDs DevicesPerNUMA
AllocResp []byte
}
// checkpointData struct is used to store pod to device allocation information
// in a checkpoint file.
// TODO: add version control when we need to change checkpoint format.
type checkpointData struct {
PodDeviceEntries []PodDevicesEntry
RegisteredDevices map[string][]string
}
// Data holds checkpoint data and its checksum
type Data struct {
Data checkpointData
Checksum checksum.Checksum
}
// NewDevicesPerNUMA creates a DevicesPerNUMA map
func NewDevicesPerNUMA() DevicesPerNUMA {
return make(DevicesPerNUMA)
}
// Devices returns all device IDs across all NUMA nodes,
// represented as a sets.Set[string]
func (dev DevicesPerNUMA) Devices() sets.Set[string] {
result := sets.New[string]()
for _, devs := range dev {
result.Insert(devs...)
}
return result
}
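// Illustrative usage (not part of the original source):
//
//   perNUMA := DevicesPerNUMA{0: {"dev-a", "dev-b"}, 1: {"dev-c"}}
//   all := perNUMA.Devices() // sets.Set[string] containing dev-a, dev-b, dev-c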
// New returns an instance of Checkpoint - must be an alias for the most recent version
func New(devEntries []PodDevicesEntry, devices map[string][]string) DeviceManagerCheckpoint {
return newV2(devEntries, devices)
}
func newV2(devEntries []PodDevicesEntry, devices map[string][]string) DeviceManagerCheckpoint {
return &Data{
Data: checkpointData{
PodDeviceEntries: devEntries,
RegisteredDevices: devices,
},
}
}
// MarshalCheckpoint returns marshalled data
func (cp *Data) MarshalCheckpoint() ([]byte, error) {
cp.Checksum = checksum.New(cp.Data)
return json.Marshal(*cp)
}
// UnmarshalCheckpoint returns unmarshalled data
func (cp *Data) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// VerifyChecksum verifies that passed checksum is same as calculated checksum
func (cp *Data) VerifyChecksum() error {
return cp.Checksum.Verify(cp.Data)
}
// GetData returns device entries and registered devices in the *most recent*
// checkpoint format, *not* in the original format stored on disk.
func (cp *Data) GetData() ([]PodDevicesEntry, map[string][]string) {
return cp.Data.PodDeviceEntries, cp.Data.RegisteredDevices
}

View File

@ -0,0 +1,123 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"context"
"fmt"
"sync"
"time"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
)
// endpoint maps to a single registered device plugin. It is responsible
// for managing gRPC communications with the device plugin and caching
// device states reported by the device plugin.
type endpoint interface {
getPreferredAllocation(available, mustInclude []string, size int) (*pluginapi.PreferredAllocationResponse, error)
allocate(devs []string) (*pluginapi.AllocateResponse, error)
preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error)
setStopTime(t time.Time)
isStopped() bool
stopGracePeriodExpired() bool
}
type endpointImpl struct {
mutex sync.Mutex
resourceName string
api pluginapi.DevicePluginClient
stopTime time.Time
client plugin.Client // for testing only
}
// newEndpointImpl creates a new endpoint for the given resourceName.
// This is to be used during normal device plugin registration.
func newEndpointImpl(p plugin.DevicePlugin) *endpointImpl {
return &endpointImpl{
api: p.API(),
resourceName: p.Resource(),
}
}
// newStoppedEndpointImpl creates a new endpoint for the given resourceName with stopTime set.
// This is to be used during Kubelet restart, before the actual device plugin re-registers.
func newStoppedEndpointImpl(resourceName string) *endpointImpl {
return &endpointImpl{
resourceName: resourceName,
stopTime: time.Now(),
}
}
func (e *endpointImpl) isStopped() bool {
e.mutex.Lock()
defer e.mutex.Unlock()
return !e.stopTime.IsZero()
}
func (e *endpointImpl) stopGracePeriodExpired() bool {
e.mutex.Lock()
defer e.mutex.Unlock()
return !e.stopTime.IsZero() && time.Since(e.stopTime) > endpointStopGracePeriod
}
func (e *endpointImpl) setStopTime(t time.Time) {
e.mutex.Lock()
defer e.mutex.Unlock()
e.stopTime = t
}
// getPreferredAllocation issues GetPreferredAllocation gRPC call to the device plugin.
func (e *endpointImpl) getPreferredAllocation(available, mustInclude []string, size int) (*pluginapi.PreferredAllocationResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
return e.api.GetPreferredAllocation(context.Background(), &pluginapi.PreferredAllocationRequest{
ContainerRequests: []*pluginapi.ContainerPreferredAllocationRequest{
{
AvailableDeviceIDs: available,
MustIncludeDeviceIDs: mustInclude,
AllocationSize: int32(size),
},
},
})
}
// allocate issues Allocate gRPC call to the device plugin.
func (e *endpointImpl) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
return e.api.Allocate(context.Background(), &pluginapi.AllocateRequest{
ContainerRequests: []*pluginapi.ContainerAllocateRequest{
{DevicesIDs: devs},
},
})
}
// preStartContainer issues PreStartContainer gRPC call to the device plugin.
func (e *endpointImpl) preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
ctx, cancel := context.WithTimeout(context.Background(), pluginapi.KubeletPreStartContainerRPCTimeoutInSecs*time.Second)
defer cancel()
return e.api.PreStartContainer(ctx, &pluginapi.PreStartContainerRequest{
DevicesIDs: devs,
})
}

File diff suppressed because it is too large

View File

@ -0,0 +1,49 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
// RegistrationHandler is an interface for handling device plugin registration
// and plugin directory cleanup.
type RegistrationHandler interface {
CleanupPluginDirectory(string) error
}
// ClientHandler is an interface for handling device plugin connections.
type ClientHandler interface {
PluginConnected(string, DevicePlugin) error
PluginDisconnected(string)
PluginListAndWatchReceiver(string, *api.ListAndWatchResponse)
}
// TODO: evaluate whether we need these error definitions.
const (
// errFailedToDialDevicePlugin is the error raised when the device plugin could not be
// reached on the registered socket
errFailedToDialDevicePlugin = "failed to dial device plugin:"
// errUnsupportedVersion is the error raised when the device plugin uses an API version not
// supported by the Kubelet registry
errUnsupportedVersion = "requested API version %q is not supported by kubelet. Supported version is %q"
// errInvalidResourceName is the error raised when a device plugin is registering
// itself with an invalid ResourceName
errInvalidResourceName = "the ResourceName %q is invalid"
// errBadSocket is the error raised when the registry socket path is not absolute
errBadSocket = "bad socketPath, must be an absolute path:"
)

View File

@ -0,0 +1,143 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"fmt"
"net"
"sync"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
// DevicePlugin interface provides methods for accessing Device Plugin resources, API and unix socket.
type DevicePlugin interface {
API() api.DevicePluginClient
Resource() string
SocketPath() string
}
// Client interface provides methods for establishing/closing gRPC connection and running the device plugin gRPC client.
type Client interface {
Connect() error
Run()
Disconnect() error
}
type client struct {
mutex sync.Mutex
resource string
socket string
grpc *grpc.ClientConn
handler ClientHandler
client api.DevicePluginClient
}
// NewPluginClient returns an initialized device plugin client.
func NewPluginClient(r string, socketPath string, h ClientHandler) Client {
return &client{
resource: r,
socket: socketPath,
handler: h,
}
}
// Connect is for establishing a gRPC connection between device manager and device plugin.
func (c *client) Connect() error {
client, conn, err := dial(c.socket)
if err != nil {
klog.ErrorS(err, "Unable to connect to device plugin client with socket path", "path", c.socket)
return err
}
c.mutex.Lock()
c.grpc = conn
c.client = client
c.mutex.Unlock()
return c.handler.PluginConnected(c.resource, c)
}
// Run is for running the device plugin gRPC client.
func (c *client) Run() {
stream, err := c.client.ListAndWatch(context.Background(), &api.Empty{})
if err != nil {
klog.ErrorS(err, "ListAndWatch ended unexpectedly for device plugin", "resource", c.resource)
return
}
for {
response, err := stream.Recv()
if err != nil {
klog.ErrorS(err, "ListAndWatch ended unexpectedly for device plugin", "resource", c.resource)
return
}
klog.V(2).InfoS("State pushed for device plugin", "resource", c.resource, "resourceCapacity", len(response.Devices))
c.handler.PluginListAndWatchReceiver(c.resource, response)
}
}
// Disconnect is for closing gRPC connection between device manager and device plugin.
func (c *client) Disconnect() error {
c.mutex.Lock()
if c.grpc != nil {
if err := c.grpc.Close(); err != nil {
klog.V(2).ErrorS(err, "Failed to close grcp connection", "resource", c.Resource())
}
c.grpc = nil
}
c.mutex.Unlock()
c.handler.PluginDisconnected(c.resource)
return nil
}
func (c *client) Resource() string {
return c.resource
}
func (c *client) API() api.DevicePluginClient {
return c.client
}
func (c *client) SocketPath() string {
return c.socket
}
// dial establishes the gRPC communication with the registered device plugin. https://godoc.org/google.golang.org/grpc#Dial
func dial(unixSocketPath string) (api.DevicePluginClient, *grpc.ClientConn, error) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
c, err := grpc.DialContext(ctx, unixSocketPath,
grpc.WithAuthority("localhost"),
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, "unix", addr)
}),
)
if err != nil {
return nil, nil, fmt.Errorf(errFailedToDialDevicePlugin+" %v", err)
}
return api.NewDevicePluginClient(c), c, nil
}
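Taken together, the intended lifecycle appears to be: construct the client, Connect, consume the stream via Run in a goroutine, and Disconnect on teardown. A hedged in-package sketch, where the resource name and socket path are placeholder values:
// runPlugin is a hypothetical wiring helper, for illustration only.
func runPlugin(handler ClientHandler) error {
	c := NewPluginClient("example.com/gpu", "/var/lib/kubelet/device-plugins/gpu.sock", handler)
	if err := c.Connect(); err != nil {
		return err
	}
	go c.Run() // blocks on the ListAndWatch stream until it terminates
	// ... later, on teardown:
	return c.Disconnect()
}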

View File

@ -0,0 +1,123 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"fmt"
"os"
"time"
core "k8s.io/api/core/v1"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
func (s *server) GetPluginHandler() cache.PluginHandler {
if f, err := os.Create(s.socketDir + "DEPRECATION"); err != nil {
klog.ErrorS(err, "Failed to create deprecation file at socket dir", "path", s.socketDir)
} else {
f.Close()
klog.V(4).InfoS("Created deprecation file", "path", f.Name())
}
return s
}
func (s *server) RegisterPlugin(pluginName string, endpoint string, versions []string, pluginClientTimeout *time.Duration) error {
klog.V(2).InfoS("Registering plugin at endpoint", "plugin", pluginName, "endpoint", endpoint)
return s.connectClient(pluginName, endpoint)
}
func (s *server) DeRegisterPlugin(pluginName string) {
klog.V(2).InfoS("Deregistering plugin", "plugin", pluginName)
client := s.getClient(pluginName)
if client != nil {
s.disconnectClient(pluginName, client)
}
}
func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []string) error {
klog.V(2).InfoS("Got plugin at endpoint with versions", "plugin", pluginName, "endpoint", endpoint, "versions", versions)
if !s.isVersionCompatibleWithPlugin(versions...) {
return fmt.Errorf("manager version, %s, is not among plugin supported versions %v", api.Version, versions)
}
if !v1helper.IsExtendedResourceName(core.ResourceName(pluginName)) {
return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
}
return nil
}
func (s *server) connectClient(name string, socketPath string) error {
c := NewPluginClient(name, socketPath, s.chandler)
s.registerClient(name, c)
if err := c.Connect(); err != nil {
s.deregisterClient(name)
klog.ErrorS(err, "Failed to connect to new client", "resource", name)
return err
}
go func() {
s.runClient(name, c)
}()
return nil
}
func (s *server) disconnectClient(name string, c Client) error {
s.deregisterClient(name)
return c.Disconnect()
}
func (s *server) registerClient(name string, c Client) {
s.mutex.Lock()
defer s.mutex.Unlock()
s.clients[name] = c
klog.V(2).InfoS("Registered client", "name", name)
}
func (s *server) deregisterClient(name string) {
s.mutex.Lock()
defer s.mutex.Unlock()
delete(s.clients, name)
klog.V(2).InfoS("Deregistered client", "name", name)
}
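// runClient blocks in Run until the ListAndWatch stream ends. If the client
// is still registered afterwards (i.e. it was not already removed via
// DeRegisterPlugin), it is disconnected here.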
func (s *server) runClient(name string, c Client) {
c.Run()
c = s.getClient(name)
if c == nil {
return
}
if err := s.disconnectClient(name, c); err != nil {
klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err)
}
}
func (s *server) getClient(name string) Client {
s.mutex.Lock()
defer s.mutex.Unlock()
return s.clients[name]
}

View File

@ -0,0 +1,224 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"fmt"
"net"
"net/http"
"os"
"path/filepath"
"sync"
"github.com/opencontainers/selinux/go-selinux"
"google.golang.org/grpc"
core "k8s.io/api/core/v1"
"k8s.io/apiserver/pkg/server/healthz"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// Server interface provides methods for Device plugin registration server.
type Server interface {
cache.PluginHandler
healthz.HealthChecker
Start() error
Stop() error
SocketPath() string
}
type server struct {
socketName string
socketDir string
mutex sync.Mutex
wg sync.WaitGroup
grpc *grpc.Server
rhandler RegistrationHandler
chandler ClientHandler
clients map[string]Client
// isStarted indicates whether the service has started successfully.
isStarted bool
}
// NewServer returns an initialized device plugin registration server.
func NewServer(socketPath string, rh RegistrationHandler, ch ClientHandler) (Server, error) {
if socketPath == "" || !filepath.IsAbs(socketPath) {
return nil, fmt.Errorf(errBadSocket+" %s", socketPath)
}
dir, name := filepath.Split(socketPath)
klog.V(2).InfoS("Creating device plugin registration server", "version", api.Version, "socket", socketPath)
s := &server{
socketName: name,
socketDir: dir,
rhandler: rh,
chandler: ch,
clients: make(map[string]Client),
}
return s, nil
}
func (s *server) Start() error {
klog.V(2).InfoS("Starting device plugin registration server")
if err := os.MkdirAll(s.socketDir, 0750); err != nil {
klog.ErrorS(err, "Failed to create the device plugin socket directory", "directory", s.socketDir)
return err
}
if selinux.GetEnabled() {
if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil {
klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err)
}
}
// For now, we leave cleanup of the *entire* directory up to the Handler
// (even though we should in theory be able to just wipe the whole directory)
// because the Handler stores its checkpoint file (amongst others) in here.
if err := s.rhandler.CleanupPluginDirectory(s.socketDir); err != nil {
klog.ErrorS(err, "Failed to cleanup the device plugin directory", "directory", s.socketDir)
return err
}
ln, err := net.Listen("unix", s.SocketPath())
if err != nil {
klog.ErrorS(err, "Failed to listen to socket while starting device plugin registry")
return err
}
s.wg.Add(1)
s.grpc = grpc.NewServer([]grpc.ServerOption{}...)
api.RegisterRegistrationServer(s.grpc, s)
go func() {
defer s.wg.Done()
s.setHealthy()
if err = s.grpc.Serve(ln); err != nil {
s.setUnhealthy()
klog.ErrorS(err, "Error while serving device plugin registration grpc server")
}
}()
return nil
}
func (s *server) Stop() error {
s.visitClients(func(r string, c Client) {
if err := s.disconnectClient(r, c); err != nil {
klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err)
}
})
s.mutex.Lock()
defer s.mutex.Unlock()
if s.grpc == nil {
return nil
}
s.grpc.Stop()
s.wg.Wait()
s.grpc = nil
// During kubelet termination, we do not need the registration server,
// and we consider the kubelet to be healthy even when it is down.
s.setHealthy()
return nil
}
func (s *server) SocketPath() string {
return filepath.Join(s.socketDir, s.socketName)
}
func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Empty, error) {
klog.InfoS("Got registration request from device plugin with resource", "resourceName", r.ResourceName)
metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
if !s.isVersionCompatibleWithPlugin(r.Version) {
err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
return &api.Empty{}, err
}
if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
klog.InfoS("Bad registration request from device plugin", "err", err)
return &api.Empty{}, err
}
if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
klog.InfoS("Error connecting to device plugin client", "err", err)
return &api.Empty{}, err
}
return &api.Empty{}, nil
}
func (s *server) isVersionCompatibleWithPlugin(versions ...string) bool {
// TODO(vikasc): Currently this is fine as we only have a single supported version. When we do need to support
// multiple versions in the future, we may need to extend this function to return a supported version.
// E.g., say kubelet supports v1beta1 and v1beta2, and we get v1alpha1 and v1beta1 from a device plugin,
// this function should return v1beta1
for _, version := range versions {
for _, supportedVersion := range api.SupportedVersions {
if version == supportedVersion {
return true
}
}
}
return false
}
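// visitClients releases the lock while each visit callback runs, so a
// callback may safely call back into methods that take s.mutex (for
// example disconnectClient -> deregisterClient).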
func (s *server) visitClients(visit func(r string, c Client)) {
s.mutex.Lock()
for r, c := range s.clients {
s.mutex.Unlock()
visit(r, c)
s.mutex.Lock()
}
s.mutex.Unlock()
}
func (s *server) Name() string {
return "device-plugin"
}
func (s *server) Check(_ *http.Request) error {
if s.isStarted {
return nil
}
return fmt.Errorf("device plugin registration gRPC server failed and no device plugins can register")
}
// setHealthy sets the health status of the gRPC server.
func (s *server) setHealthy() {
s.isStarted = true
}
// setUnhealthy sets the health status of the gRPC server to unhealthy.
func (s *server) setUnhealthy() {
s.isStarted = false
}
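Wiring it up looks roughly like the following sketch (the startRegistrationServer helper and the socket path are hypothetical; the two handlers come from the device manager):
func startRegistrationServer(rh RegistrationHandler, ch ClientHandler) (Server, error) {
	// Hypothetical socket path, for illustration only.
	s, err := NewServer("/var/lib/kubelet/device-plugins/kubelet.sock", rh, ch)
	if err != nil {
		return nil, err
	}
	if err := s.Start(); err != nil {
		return nil, err
	}
	// The same object doubles as the "device-plugin" healthz checker.
	return s, nil
}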

View File

@ -0,0 +1,388 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"net"
"os"
"path/filepath"
"sync"
"time"
"github.com/fsnotify/fsnotify"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
watcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
)
// Stub implementation for DevicePlugin.
type Stub struct {
devs []*pluginapi.Device
socket string
resourceName string
preStartContainerFlag bool
getPreferredAllocationFlag bool
stop chan interface{}
wg sync.WaitGroup
update chan []*pluginapi.Device
server *grpc.Server
// allocFunc is used for handling allocation request
allocFunc stubAllocFunc
// getPreferredAllocFunc is used for handling getPreferredAllocation request
getPreferredAllocFunc stubGetPreferredAllocFunc
// registerControlFunc is used for controlling auto-registration of requests
registerControlFunc stubRegisterControlFunc
registrationStatus chan watcherapi.RegistrationStatus // for testing
endpoint string // for testing
kubeletRestartWatcher *fsnotify.Watcher
}
// stubGetPreferredAllocFunc is the function called when a getPreferredAllocation request is received from Kubelet
type stubGetPreferredAllocFunc func(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error)
func defaultGetPreferredAllocFunc(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error) {
var response pluginapi.PreferredAllocationResponse
return &response, nil
}
// stubAllocFunc is the function called when an allocation request is received from Kubelet
type stubAllocFunc func(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error)
func defaultAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) {
var response pluginapi.AllocateResponse
return &response, nil
}
// stubRegisterControlFunc is the function called when a registration request is received from Kubelet
type stubRegisterControlFunc func() bool
func defaultRegisterControlFunc() bool {
return true
}
// NewDevicePluginStub returns an initialized DevicePlugin Stub.
func NewDevicePluginStub(devs []*pluginapi.Device, socket string, name string, preStartContainerFlag bool, getPreferredAllocationFlag bool) *Stub {
watcher, err := fsnotify.NewWatcher()
if err != nil {
klog.ErrorS(err, "Watcher creation failed")
panic(err)
}
return &Stub{
devs: devs,
socket: socket,
resourceName: name,
preStartContainerFlag: preStartContainerFlag,
getPreferredAllocationFlag: getPreferredAllocationFlag,
registerControlFunc: defaultRegisterControlFunc,
stop: make(chan interface{}),
update: make(chan []*pluginapi.Device),
allocFunc: defaultAllocFunc,
getPreferredAllocFunc: defaultGetPreferredAllocFunc,
kubeletRestartWatcher: watcher,
}
}
// SetGetPreferredAllocFunc sets allocFunc of the device plugin
func (m *Stub) SetGetPreferredAllocFunc(f stubGetPreferredAllocFunc) {
m.getPreferredAllocFunc = f
}
// SetAllocFunc sets allocFunc of the device plugin
func (m *Stub) SetAllocFunc(f stubAllocFunc) {
m.allocFunc = f
}
// SetRegisterControlFunc sets RegisterControlFunc of the device plugin
func (m *Stub) SetRegisterControlFunc(f stubRegisterControlFunc) {
m.registerControlFunc = f
}
// Start starts the gRPC server of the device plugin. Can only
// be called once.
func (m *Stub) Start() error {
klog.InfoS("Starting device plugin server")
err := m.cleanup()
if err != nil {
return err
}
sock, err := net.Listen("unix", m.socket)
if err != nil {
return err
}
m.wg.Add(1)
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterDevicePluginServer(m.server, m)
watcherapi.RegisterRegistrationServer(m.server, m)
err = m.kubeletRestartWatcher.Add(filepath.Dir(m.socket))
if err != nil {
klog.ErrorS(err, "Failed to add watch", "devicePluginPath", pluginapi.DevicePluginPath)
return err
}
go func() {
defer m.wg.Done()
if err = m.server.Serve(sock); err != nil {
klog.ErrorS(err, "Error while serving device plugin registration grpc server")
}
}()
var lastDialErr error
wait.PollImmediate(1*time.Second, 10*time.Second, func() (bool, error) {
var conn *grpc.ClientConn
_, conn, lastDialErr = dial(m.socket)
if lastDialErr != nil {
return false, nil
}
conn.Close()
return true, nil
})
if lastDialErr != nil {
return lastDialErr
}
klog.InfoS("Starting to serve on socket", "socket", m.socket)
return nil
}
func (m *Stub) Restart() error {
klog.InfoS("Restarting Device Plugin server")
if m.server == nil {
return nil
}
m.server.Stop()
m.server = nil
return m.Start()
}
// Stop stops the gRPC server. Can be called without a prior Start
// and more than once. Not safe to be called concurrently by different
// goroutines!
func (m *Stub) Stop() error {
klog.InfoS("Stopping device plugin server")
if m.server == nil {
return nil
}
m.kubeletRestartWatcher.Close()
m.server.Stop()
m.wg.Wait()
m.server = nil
close(m.stop) // This prevents re-starting the server.
return m.cleanup()
}
func (m *Stub) Watch(kubeletEndpoint, resourceName, pluginSockDir string) {
for {
select {
// Detect a kubelet restart by watching for a newly created
// 'pluginapi.KubeletSocket' file. When this occurs, restart
// the device plugin server
case event := <-m.kubeletRestartWatcher.Events:
if event.Name == kubeletEndpoint && event.Op&fsnotify.Create == fsnotify.Create {
klog.InfoS("inotify: file created, restarting", "kubeletEndpoint", kubeletEndpoint)
var lastErr error
err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, false, func(context.Context) (done bool, err error) {
restartErr := m.Restart()
if restartErr == nil {
return true, nil
}
klog.ErrorS(restartErr, "Retrying after error")
lastErr = restartErr
return false, nil
})
if err != nil {
klog.ErrorS(err, "Unable to restart server: wait timed out", "lastErr", lastErr.Error())
panic(err)
}
if ok := m.registerControlFunc(); ok {
if err := m.Register(kubeletEndpoint, resourceName, pluginSockDir); err != nil {
klog.ErrorS(err, "Unable to register to kubelet")
panic(err)
}
}
}
// Watch for any other fs errors and log them.
case err := <-m.kubeletRestartWatcher.Errors:
klog.ErrorS(err, "inotify error")
}
}
}
// GetInfo is the RPC which returns pluginInfo
func (m *Stub) GetInfo(ctx context.Context, req *watcherapi.InfoRequest) (*watcherapi.PluginInfo, error) {
klog.InfoS("GetInfo")
return &watcherapi.PluginInfo{
Type: watcherapi.DevicePlugin,
Name: m.resourceName,
Endpoint: m.endpoint,
SupportedVersions: []string{pluginapi.Version}}, nil
}
// NotifyRegistrationStatus receives the registration notification from watcher
func (m *Stub) NotifyRegistrationStatus(ctx context.Context, status *watcherapi.RegistrationStatus) (*watcherapi.RegistrationStatusResponse, error) {
if m.registrationStatus != nil {
m.registrationStatus <- *status
}
if !status.PluginRegistered {
klog.InfoS("Registration failed", "err", status.Error)
}
return &watcherapi.RegistrationStatusResponse{}, nil
}
// Register registers the device plugin for the given resourceName with Kubelet.
func (m *Stub) Register(kubeletEndpoint, resourceName string, pluginSockDir string) error {
klog.InfoS("Register", "kubeletEndpoint", kubeletEndpoint, "resourceName", resourceName, "socket", pluginSockDir)
if pluginSockDir != "" {
if _, err := os.Stat(pluginSockDir + "DEPRECATION"); err == nil {
klog.InfoS("Deprecation file found. Skip registration")
return nil
}
}
klog.InfoS("Deprecation file not found. Invoke registration")
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
conn, err := grpc.DialContext(ctx, kubeletEndpoint,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, "unix", addr)
}))
if err != nil {
return err
}
defer conn.Close()
client := pluginapi.NewRegistrationClient(conn)
reqt := &pluginapi.RegisterRequest{
Version: pluginapi.Version,
Endpoint: filepath.Base(m.socket),
ResourceName: resourceName,
Options: &pluginapi.DevicePluginOptions{
PreStartRequired: m.preStartContainerFlag,
GetPreferredAllocationAvailable: m.getPreferredAllocationFlag,
},
}
_, err = client.Register(context.Background(), reqt)
if err != nil {
// Stop server
m.server.Stop()
klog.ErrorS(err, "Client unable to register to kubelet")
return err
}
klog.InfoS("Device Plugin registered with the Kubelet")
return err
}
// GetDevicePluginOptions returns DevicePluginOptions settings for the device plugin.
func (m *Stub) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
options := &pluginapi.DevicePluginOptions{
PreStartRequired: m.preStartContainerFlag,
GetPreferredAllocationAvailable: m.getPreferredAllocationFlag,
}
return options, nil
}
// PreStartContainer resets the devices received
func (m *Stub) PreStartContainer(ctx context.Context, r *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
klog.InfoS("PreStartContainer", "request", r)
return &pluginapi.PreStartContainerResponse{}, nil
}
// ListAndWatch lists devices and updates that list according to the Update call
func (m *Stub) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
klog.InfoS("ListAndWatch")
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
for {
select {
case <-m.stop:
return nil
case updated := <-m.update:
s.Send(&pluginapi.ListAndWatchResponse{Devices: updated})
}
}
}
// Update allows the device plugin to send new devices through ListAndWatch
func (m *Stub) Update(devs []*pluginapi.Device) {
m.update <- devs
}
// GetPreferredAllocation gets the preferred allocation from a set of available devices
func (m *Stub) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
klog.InfoS("GetPreferredAllocation", "request", r)
devs := make(map[string]pluginapi.Device)
for _, dev := range m.devs {
devs[dev.ID] = *dev
}
return m.getPreferredAllocFunc(r, devs)
}
// Allocate does a mock allocation
func (m *Stub) Allocate(ctx context.Context, r *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
klog.InfoS("Allocate", "request", r)
devs := make(map[string]pluginapi.Device)
for _, dev := range m.devs {
devs[dev.ID] = *dev
}
return m.allocFunc(r, devs)
}
func (m *Stub) cleanup() error {
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}
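For e2e-style tests, driving the stub typically looks like the sketch below (the runStub helper, device ID, resource name, and stub socket path are hypothetical; the kubelet paths come from the plugin API constants):
func runStub() error {
	// Hypothetical device list, socket path, and resource name.
	devs := []*pluginapi.Device{{ID: "dev-0", Health: pluginapi.Healthy}}
	stub := NewDevicePluginStub(devs, "/tmp/device-plugin-stub.sock", "example.com/stub", false, false)
	if err := stub.Start(); err != nil {
		return err
	}
	defer stub.Stop()
	// Register with the kubelet's registration socket.
	return stub.Register(pluginapi.KubeletSocket, "example.com/stub", pluginapi.DevicePluginPath)
}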

View File

@ -0,0 +1,456 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"sync"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
type deviceAllocateInfo struct {
// deviceIds contains device Ids allocated to this container for the given resourceName.
deviceIds checkpoint.DevicesPerNUMA
// allocResp contains cached rpc AllocateResponse.
allocResp *pluginapi.ContainerAllocateResponse
}
type resourceAllocateInfo map[string]deviceAllocateInfo // Keyed by resourceName.
type containerDevices map[string]resourceAllocateInfo // Keyed by containerName.
type podDevices struct {
sync.RWMutex
devs map[string]containerDevices // Keyed by podUID.
}
// newPodDevices returns a podDevices object, guarded by its own RWMutex,
// mapping each pod UID to the containerDevices information for its containers.
func newPodDevices() *podDevices {
return &podDevices{devs: make(map[string]containerDevices)}
}
func (pdev *podDevices) pods() sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
ret := sets.New[string]()
for k := range pdev.devs {
ret.Insert(k)
}
return ret
}
func (pdev *podDevices) size() int {
pdev.RLock()
defer pdev.RUnlock()
return len(pdev.devs)
}
func (pdev *podDevices) hasPod(podUID string) bool {
pdev.RLock()
defer pdev.RUnlock()
_, podExists := pdev.devs[podUID]
return podExists
}
func (pdev *podDevices) insert(podUID, contName, resource string, devices checkpoint.DevicesPerNUMA, resp *pluginapi.ContainerAllocateResponse) {
pdev.Lock()
defer pdev.Unlock()
if _, podExists := pdev.devs[podUID]; !podExists {
pdev.devs[podUID] = make(containerDevices)
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
pdev.devs[podUID][contName] = make(resourceAllocateInfo)
}
pdev.devs[podUID][contName][resource] = deviceAllocateInfo{
deviceIds: devices,
allocResp: resp,
}
}
func (pdev *podDevices) delete(pods []string) {
pdev.Lock()
defer pdev.Unlock()
for _, uid := range pods {
delete(pdev.devs, uid)
}
}
// Returns list of device Ids allocated to the given pod for the given resource.
// Returns nil if we don't have cached state for the given <podUID, resource>.
func (pdev *podDevices) podDevices(podUID, resource string) sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
ret := sets.New[string]()
for contName := range pdev.devs[podUID] {
ret = ret.Union(pdev.containerDevices(podUID, contName, resource))
}
return ret
}
// Returns list of device Ids allocated to the given container for the given resource.
// Returns nil if we don't have cached state for the given <podUID, contName, resource>.
func (pdev *podDevices) containerDevices(podUID, contName, resource string) sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
if _, podExists := pdev.devs[podUID]; !podExists {
return nil
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
return nil
}
devs, resourceExists := pdev.devs[podUID][contName][resource]
if !resourceExists {
return nil
}
return devs.deviceIds.Devices()
}
// Populates allocatedResources with the device resources allocated to the specified <podUID, contName>.
func (pdev *podDevices) addContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.Set[string]) {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return
}
resources, exists := containers[contName]
if !exists {
return
}
for resource, devices := range resources {
allocatedResources[resource] = allocatedResources[resource].Union(devices.deviceIds.Devices())
}
}
// Removes the device resources allocated to the specified <podUID, contName> from allocatedResources.
func (pdev *podDevices) removeContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.Set[string]) {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return
}
resources, exists := containers[contName]
if !exists {
return
}
for resource, devices := range resources {
allocatedResources[resource] = allocatedResources[resource].Difference(devices.deviceIds.Devices())
}
}
// Returns all devices allocated to the pods being tracked, keyed by resourceName.
func (pdev *podDevices) devices() map[string]sets.Set[string] {
ret := make(map[string]sets.Set[string])
pdev.RLock()
defer pdev.RUnlock()
for _, containerDevices := range pdev.devs {
for _, resources := range containerDevices {
for resource, devices := range resources {
if _, exists := ret[resource]; !exists {
ret[resource] = sets.New[string]()
}
if devices.allocResp != nil {
ret[resource] = ret[resource].Union(devices.deviceIds.Devices())
}
}
}
}
return ret
}
// Returns podUID and containerName for a device
func (pdev *podDevices) getPodAndContainerForDevice(deviceID string) (string, string) {
pdev.RLock()
defer pdev.RUnlock()
for podUID, containerDevices := range pdev.devs {
for containerName, resources := range containerDevices {
for _, devices := range resources {
if devices.deviceIds.Devices().Has(deviceID) {
return podUID, containerName
}
}
}
}
return "", ""
}
// Turns podDevices to checkpointData.
func (pdev *podDevices) toCheckpointData() []checkpoint.PodDevicesEntry {
var data []checkpoint.PodDevicesEntry
pdev.RLock()
defer pdev.RUnlock()
for podUID, containerDevices := range pdev.devs {
for conName, resources := range containerDevices {
for resource, devices := range resources {
if devices.allocResp == nil {
klog.ErrorS(nil, "Can't marshal allocResp, allocation response is missing", "podUID", podUID, "containerName", conName, "resourceName", resource)
continue
}
allocResp, err := devices.allocResp.Marshal()
if err != nil {
klog.ErrorS(err, "Can't marshal allocResp", "podUID", podUID, "containerName", conName, "resourceName", resource)
continue
}
data = append(data, checkpoint.PodDevicesEntry{
PodUID: podUID,
ContainerName: conName,
ResourceName: resource,
DeviceIDs: devices.deviceIds,
AllocResp: allocResp})
}
}
}
return data
}
// Populates podDevices from the passed in checkpointData.
func (pdev *podDevices) fromCheckpointData(data []checkpoint.PodDevicesEntry) {
for _, entry := range data {
klog.V(2).InfoS("Get checkpoint entry",
"podUID", entry.PodUID, "containerName", entry.ContainerName,
"resourceName", entry.ResourceName, "deviceIDs", entry.DeviceIDs, "allocated", entry.AllocResp)
allocResp := &pluginapi.ContainerAllocateResponse{}
err := allocResp.Unmarshal(entry.AllocResp)
if err != nil {
klog.ErrorS(err, "Can't unmarshal allocResp", "podUID", entry.PodUID, "containerName", entry.ContainerName, "resourceName", entry.ResourceName)
continue
}
pdev.insert(entry.PodUID, entry.ContainerName, entry.ResourceName, entry.DeviceIDs, allocResp)
}
}
// Returns combined container runtime settings to consume the container's allocated devices.
func (pdev *podDevices) deviceRunContainerOptions(podUID, contName string) *DeviceRunContainerOptions {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return nil
}
resources, exists := containers[contName]
if !exists {
return nil
}
opts := &DeviceRunContainerOptions{}
// Maps to detect duplicate settings.
devsMap := make(map[string]string)
mountsMap := make(map[string]string)
envsMap := make(map[string]string)
annotationsMap := make(map[string]string)
// Keep track of all CDI devices requested for the container.
allCDIDevices := sets.New[string]()
// Loops through AllocationResponses of all cached device resources.
for _, devices := range resources {
resp := devices.allocResp
// Each Allocate response has the following artifacts.
// Environment variables
// Mount points
// Device files
// Container annotations
// CDI device IDs
// These artifacts are per resource per container.
// Updates RunContainerOptions.Envs.
for k, v := range resp.Envs {
if e, ok := envsMap[k]; ok {
klog.V(4).InfoS("Skip existing env", "envKey", k, "envValue", v)
if e != v {
klog.ErrorS(nil, "Environment variable has conflicting setting", "envKey", k, "expected", v, "got", e)
}
continue
}
klog.V(4).InfoS("Add env", "envKey", k, "envValue", v)
envsMap[k] = v
opts.Envs = append(opts.Envs, kubecontainer.EnvVar{Name: k, Value: v})
}
// Updates RunContainerOptions.Devices.
for _, dev := range resp.Devices {
if d, ok := devsMap[dev.ContainerPath]; ok {
klog.V(4).InfoS("Skip existing device", "containerPath", dev.ContainerPath, "hostPath", dev.HostPath)
if d != dev.HostPath {
klog.ErrorS(nil, "Container device has conflicting mapping host devices",
"containerPath", dev.ContainerPath, "got", d, "expected", dev.HostPath)
}
continue
}
klog.V(4).InfoS("Add device", "containerPath", dev.ContainerPath, "hostPath", dev.HostPath)
devsMap[dev.ContainerPath] = dev.HostPath
opts.Devices = append(opts.Devices, kubecontainer.DeviceInfo{
PathOnHost: dev.HostPath,
PathInContainer: dev.ContainerPath,
Permissions: dev.Permissions,
})
}
// Updates RunContainerOptions.Mounts.
for _, mount := range resp.Mounts {
if m, ok := mountsMap[mount.ContainerPath]; ok {
klog.V(4).InfoS("Skip existing mount", "containerPath", mount.ContainerPath, "hostPath", mount.HostPath)
if m != mount.HostPath {
klog.ErrorS(nil, "Container mount has conflicting mapping host mounts",
"containerPath", mount.ContainerPath, "conflictingPath", m, "hostPath", mount.HostPath)
}
continue
}
klog.V(4).InfoS("Add mount", "containerPath", mount.ContainerPath, "hostPath", mount.HostPath)
mountsMap[mount.ContainerPath] = mount.HostPath
opts.Mounts = append(opts.Mounts, kubecontainer.Mount{
Name: mount.ContainerPath,
ContainerPath: mount.ContainerPath,
HostPath: mount.HostPath,
ReadOnly: mount.ReadOnly,
// TODO: This may need to be part of Device plugin API.
SELinuxRelabel: false,
})
}
// Updates for Annotations
for k, v := range resp.Annotations {
if e, ok := annotationsMap[k]; ok {
klog.V(4).InfoS("Skip existing annotation", "annotationKey", k, "annotationValue", v)
if e != v {
klog.ErrorS(nil, "Annotation has conflicting setting", "annotationKey", k, "expected", e, "got", v)
}
continue
}
klog.V(4).InfoS("Add annotation", "annotationKey", k, "annotationValue", v)
annotationsMap[k] = v
opts.Annotations = append(opts.Annotations, kubecontainer.Annotation{Name: k, Value: v})
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DevicePluginCDIDevices) {
// Updates for CDI devices.
cdiDevices := getCDIDeviceInfo(resp, allCDIDevices)
opts.CDIDevices = append(opts.CDIDevices, cdiDevices...)
}
}
return opts
}
// getCDIDeviceInfo returns CDI devices from an allocate response
func getCDIDeviceInfo(resp *pluginapi.ContainerAllocateResponse, knownCDIDevices sets.Set[string]) []kubecontainer.CDIDevice {
var cdiDevices []kubecontainer.CDIDevice
for _, cdiDevice := range resp.CDIDevices {
if knownCDIDevices.Has(cdiDevice.Name) {
klog.V(4).InfoS("Skip existing CDI Device", "name", cdiDevice.Name)
continue
}
klog.V(4).InfoS("Add CDI device", "name", cdiDevice.Name)
knownCDIDevices.Insert(cdiDevice.Name)
device := kubecontainer.CDIDevice{
Name: cdiDevice.Name,
}
cdiDevices = append(cdiDevices, device)
}
return cdiDevices
}
// getContainerDevices returns the devices assigned to the provided container for all ResourceNames
func (pdev *podDevices) getContainerDevices(podUID, contName string) ResourceDeviceInstances {
pdev.RLock()
defer pdev.RUnlock()
if _, podExists := pdev.devs[podUID]; !podExists {
return nil
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
return nil
}
resDev := NewResourceDeviceInstances()
for resource, allocateInfo := range pdev.devs[podUID][contName] {
if len(allocateInfo.deviceIds) == 0 {
continue
}
devicePluginMap := make(map[string]pluginapi.Device)
for numaid, devlist := range allocateInfo.deviceIds {
for _, devID := range devlist {
var topology *pluginapi.TopologyInfo
if numaid != nodeWithoutTopology {
NUMANodes := []*pluginapi.NUMANode{{ID: numaid}}
if pDev, ok := devicePluginMap[devID]; ok && pDev.Topology != nil {
if nodes := pDev.Topology.GetNodes(); nodes != nil {
NUMANodes = append(NUMANodes, nodes...)
}
}
// ID and Healthy are not relevant here.
topology = &pluginapi.TopologyInfo{Nodes: NUMANodes}
}
devicePluginMap[devID] = pluginapi.Device{
Topology: topology,
}
}
}
resDev[resource] = devicePluginMap
}
return resDev
}
// DeviceInstances is a mapping device name -> plugin device data
type DeviceInstances map[string]pluginapi.Device
// ResourceDeviceInstances is a mapping resource name -> DeviceInstances
type ResourceDeviceInstances map[string]DeviceInstances
// NewResourceDeviceInstances returns a new ResourceDeviceInstances
func NewResourceDeviceInstances() ResourceDeviceInstances {
return make(ResourceDeviceInstances)
}
// Clone returns a clone of ResourceDeviceInstances
func (rdev ResourceDeviceInstances) Clone() ResourceDeviceInstances {
clone := NewResourceDeviceInstances()
for resourceName, resourceDevs := range rdev {
clone[resourceName] = make(map[string]pluginapi.Device)
for devID, dev := range resourceDevs {
clone[resourceName][devID] = dev
}
}
return clone
}
// Filter takes a condition set expressed as map[string]sets.Set[string] and returns a new
// ResourceDeviceInstances with only the devices matching the condition set.
func (rdev ResourceDeviceInstances) Filter(cond map[string]sets.Set[string]) ResourceDeviceInstances {
filtered := NewResourceDeviceInstances()
for resourceName, filterIDs := range cond {
if _, exists := rdev[resourceName]; !exists {
continue
}
filtered[resourceName] = DeviceInstances{}
for instanceID, instance := range rdev[resourceName] {
if filterIDs.Has(instanceID) {
filtered[resourceName][instanceID] = instance
}
}
}
return filtered
}
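To make Filter concrete, a small in-package sketch (the resource and device names are hypothetical): given two known GPUs and a condition set naming only one of them, the result keeps just the matching instance.
func filterExample() ResourceDeviceInstances {
	// Hypothetical resource and device IDs, for illustration only.
	all := ResourceDeviceInstances{
		"example.com/gpu": DeviceInstances{
			"gpu-0": pluginapi.Device{},
			"gpu-1": pluginapi.Device{},
		},
	}
	cond := map[string]sets.Set[string]{
		"example.com/gpu": sets.New[string]("gpu-1"),
	}
	// Returns only {"example.com/gpu": {"gpu-1": ...}}.
	return all.Filter(cond)
}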

View File

@ -0,0 +1,252 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/resource"
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)
// GetTopologyHints implements the TopologyManager HintProvider Interface which
// ensures the Device Manager is consulted when Topology Aware Hints for each
// container are created.
func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded device resources before providing TopologyHints
m.UpdateAllocatedDevices()
// Loop through all device resources and generate TopologyHints for them.
deviceHints := make(map[string][]topologymanager.TopologyHint)
accumulatedResourceRequests := m.getContainerDeviceRequest(container)
m.mutex.Lock()
defer m.mutex.Unlock()
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
deviceHints[resource] = nil
continue
}
// Short circuit to regenerate the same hints if there are already
// devices allocated to the Container. This might happen after a
// kubelet restart, for example.
allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
// Get the list of available devices, for which TopologyHints should be generated.
available := m.getAvailableDevices(resource)
reusable := m.devicesToReuse[string(pod.UID)][resource]
if available.Union(reusable).Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
// Generate TopologyHints for this resource given the current
// request size and the list of available devices.
deviceHints[resource] = m.generateDeviceTopologyHints(resource, available, reusable, requested)
}
return deviceHints
}
// GetPodTopologyHints implements the topologymanager.HintProvider Interface which
// ensures the Device Manager is consulted when Topology Aware Hints for Pod are created.
func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded device resources before providing TopologyHints
m.UpdateAllocatedDevices()
deviceHints := make(map[string][]topologymanager.TopologyHint)
accumulatedResourceRequests := m.getPodDeviceRequest(pod)
m.mutex.Lock()
defer m.mutex.Unlock()
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
deviceHints[resource] = nil
continue
}
// Short circuit to regenerate the same hints if there are already
// devices allocated to the Pod. This might happen after a
// kubelet restart, for example.
allocated := m.podDevices.podDevices(string(pod.UID), resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
// Get the list of available devices, for which TopologyHints should be generated.
available := m.getAvailableDevices(resource)
if available.Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
// Generate TopologyHints for this resource given the current
// request size and the list of available devices.
deviceHints[resource] = m.generateDeviceTopologyHints(resource, available, sets.Set[string]{}, requested)
}
return deviceHints
}
func (m *ManagerImpl) deviceHasTopologyAlignment(resource string) bool {
// If any device has Topology NUMANodes available, we assume they care about alignment.
for _, device := range m.allDevices[resource] {
if device.Topology != nil && len(device.Topology.Nodes) > 0 {
return true
}
}
return false
}
func (m *ManagerImpl) getAvailableDevices(resource string) sets.Set[string] {
// Strip all devices in use from the list of healthy ones.
return m.healthyDevices[resource].Difference(m.allocatedDevices[resource])
}
func (m *ManagerImpl) generateDeviceTopologyHints(resource string, available sets.Set[string], reusable sets.Set[string], request int) []topologymanager.TopologyHint {
// Initialize minAffinitySize to include all NUMA Nodes
minAffinitySize := len(m.numaNodes)
// Iterate through all combinations of NUMA Nodes and build hints from them.
hints := []topologymanager.TopologyHint{}
bitmask.IterateBitMasks(m.numaNodes, func(mask bitmask.BitMask) {
// First, update minAffinitySize for the current request size.
devicesInMask := 0
for _, device := range m.allDevices[resource] {
if mask.AnySet(m.getNUMANodeIds(device.Topology)) {
devicesInMask++
}
}
if devicesInMask >= request && mask.Count() < minAffinitySize {
minAffinitySize = mask.Count()
}
// Then check to see if all the reusable devices are part of the bitmask.
numMatching := 0
for d := range reusable {
// Skip the device if it doesn't specify any topology info.
if m.allDevices[resource][d].Topology == nil {
continue
}
// Otherwise disregard this mask if its NUMANode isn't part of it.
if !mask.AnySet(m.getNUMANodeIds(m.allDevices[resource][d].Topology)) {
return
}
numMatching++
}
// Finally, check to see if enough available devices remain on the
// current NUMA node combination to satisfy the device request.
for d := range available {
if mask.AnySet(m.getNUMANodeIds(m.allDevices[resource][d].Topology)) {
numMatching++
}
}
// If they don't, then move onto the next combination.
if numMatching < request {
return
}
// Otherwise, create a new hint from the NUMA mask and add it to the
// list of hints. We set all hint preferences to 'false' on the first
// pass through.
hints = append(hints, topologymanager.TopologyHint{
NUMANodeAffinity: mask,
Preferred: false,
})
})
// Loop back through all hints and update the 'Preferred' field based on
// counting the number of bits sets in the affinity mask and comparing it
// to the minAffinity. Only those with an equal number of bits set will be
// considered preferred.
for i := range hints {
if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
hints[i].Preferred = true
}
}
return hints
}
func (m *ManagerImpl) getNUMANodeIds(topology *pluginapi.TopologyInfo) []int {
if topology == nil {
return nil
}
var ids []int
for _, n := range topology.Nodes {
ids = append(ids, int(n.ID))
}
return ids
}
func (m *ManagerImpl) getPodDeviceRequest(pod *v1.Pod) map[string]int {
// for these device plugin resources, requests == limits
limits := resource.PodLimits(pod, resource.PodResourcesOptions{
ExcludeOverhead: true,
})
podRequests := make(map[string]int)
for resourceName, quantity := range limits {
if !m.isDevicePluginResource(string(resourceName)) {
continue
}
podRequests[string(resourceName)] = int(quantity.Value())
}
return podRequests
}
func (m *ManagerImpl) getContainerDeviceRequest(container *v1.Container) map[string]int {
containerRequests := make(map[string]int)
for resourceObj, requestedObj := range container.Resources.Limits {
resource := string(resourceObj)
requested := int(requestedObj.Value())
if !m.isDevicePluginResource(resource) {
continue
}
containerRequests[resource] = requested
}
return containerRequests
}
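The preferred-hint rule above reduces to: among all NUMA masks that can satisfy the request, only those whose bit count equals the minimum are marked preferred. A standalone toy illustration with plain ints (not the real bitmask package):
package main
import "fmt"
// bits counts the set bits of a small mask (Kernighan's method).
func bits(mask uint) int {
	n := 0
	for ; mask != 0; mask &= mask - 1 {
		n++
	}
	return n
}
func main() {
	// Hypothetical candidate masks over NUMA nodes {0,1}: 0b01, 0b10, 0b11.
	masks := []uint{0b01, 0b10, 0b11}
	min := 64
	for _, m := range masks {
		if bits(m) < min {
			min = bits(m)
		}
	}
	for _, m := range masks {
		// Only single-node masks are preferred here, since min == 1.
		fmt.Printf("mask %02b preferred=%v\n", m, bits(m) == min)
	}
}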

View File

@ -0,0 +1,123 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apiserver/pkg/server/healthz"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
// Manager manages all the Device Plugins running on a node.
type Manager interface {
// Start starts device plugin registration service.
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, initialContainers containermap.ContainerMap, initialContainerRunningSet sets.Set[string]) error
// Allocate configures and assigns devices to a container in a pod. From
// the requested device resources, Allocate will communicate with the
// owning device plugin to allow setup procedures to take place, and for
// the device plugin to provide runtime settings to use the device
// (environment variables, mount points and device files).
Allocate(pod *v1.Pod, container *v1.Container) error
// UpdatePluginResources updates node resources based on devices already
// allocated to pods. The node object is provided for the device manager to
// update the node capacity to reflect the currently available devices.
UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error
// Stop stops the manager.
Stop() error
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error)
// GetCapacity returns the amount of available device plugin resource capacity, resource allocatable
// and inactive device plugin resources previously registered on the node.
GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
// GetWatcherHandler returns the plugin handler for the device manager.
GetWatcherHandler() cache.PluginHandler
GetHealthChecker() healthz.HealthChecker
// GetDevices returns information about the devices assigned to pods and containers
GetDevices(podUID, containerName string) ResourceDeviceInstances
// UpdateAllocatedResourcesStatus updates the status of allocated resources for the pod.
UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus)
// GetAllocatableDevices returns information about all the devices known to the manager
GetAllocatableDevices() ResourceDeviceInstances
// ShouldResetExtendedResourceCapacity returns whether the extended resources should be reset or not,
// depending on the checkpoint file availability. Absence of the checkpoint file strongly indicates
// the node has been recreated.
ShouldResetExtendedResourceCapacity() bool
// GetTopologyHints indicates the Device Manager implements the TopologyManager HintProvider interface
// and is consulted to make topology-aware resource alignments.
GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint
// GetPodTopologyHints indicates the Device Manager implements the TopologyManager HintProvider interface
// and is consulted to make topology-aware resource alignments per Pod.
GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint
// UpdateAllocatedDevices frees any Devices that are bound to terminated pods.
UpdateAllocatedDevices()
// Updates returns a channel that receives an Update when the device changed its status.
Updates() <-chan resourceupdates.Update
}
// DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices.
type DeviceRunContainerOptions struct {
// The environment variables list.
Envs []kubecontainer.EnvVar
// The mounts for the container.
Mounts []kubecontainer.Mount
// The host devices mapped into the container.
Devices []kubecontainer.DeviceInfo
// The Annotations for the container
Annotations []kubecontainer.Annotation
// CDI Devices for the container
CDIDevices []kubecontainer.CDIDevice
}
// TODO: evaluate whether we need this error definition.
const (
errEndpointStopped = "endpoint %v has been stopped"
)
// endpointStopGracePeriod indicates the grace period after an endpoint is stopped
// because its device plugin fails. DeviceManager keeps the stopped endpoint in its
// cache during this grace period to cover the time gap for the capacity change to
// take effect.
const endpointStopGracePeriod = time.Duration(5) * time.Minute
// kubeletDeviceManagerCheckpoint is the file name of device plugin checkpoint
const kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint"

e2e/vendor/k8s.io/kubernetes/pkg/kubelet/cm/doc.go generated vendored Normal file
View File

@ -0,0 +1,21 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package cm (abbreviation of "container manager") and its subpackages contain all the kubelet code
// to manage containers. For example, they contain functions to configure containers' cgroups,
// ensure containers run with the desired QoS, and allocate compute resources like cpus, memory,
// devices...
package cm // import "k8s.io/kubernetes/pkg/kubelet/cm"

View File

@ -0,0 +1,2 @@
labels:
- wg/device-management

View File

@ -0,0 +1,222 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dra
import (
"errors"
"fmt"
"slices"
"sync"
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
// ClaimInfo holds information required
// to prepare and unprepare a resource claim.
// +k8s:deepcopy-gen=true
type ClaimInfo struct {
state.ClaimInfoState
prepared bool
}
// claimInfoCache is a cache of processed resource claims keyed by namespace/claimname.
type claimInfoCache struct {
sync.RWMutex
checkpointer state.Checkpointer
claimInfo map[string]*ClaimInfo
}
// newClaimInfoFromClaim creates a new claim info from a resource claim.
// It verifies that the kubelet can handle the claim.
func newClaimInfoFromClaim(claim *resourceapi.ResourceClaim) (*ClaimInfo, error) {
claimInfoState := state.ClaimInfoState{
ClaimUID: claim.UID,
ClaimName: claim.Name,
Namespace: claim.Namespace,
PodUIDs: sets.New[string](),
DriverState: make(map[string]state.DriverState),
}
if claim.Status.Allocation == nil {
return nil, errors.New("not allocated")
}
for _, result := range claim.Status.Allocation.Devices.Results {
claimInfoState.DriverState[result.Driver] = state.DriverState{}
}
info := &ClaimInfo{
ClaimInfoState: claimInfoState,
prepared: false,
}
return info, nil
}
// newClaimInfoFromState creates a new claim info from a checkpointed claim info state object.
func newClaimInfoFromState(state *state.ClaimInfoState) *ClaimInfo {
info := &ClaimInfo{
ClaimInfoState: *state.DeepCopy(),
prepared: false,
}
return info
}
// addDevice adds a device state for the given driver to the claim info.
func (info *ClaimInfo) addDevice(driverName string, deviceState state.Device) {
if info.DriverState == nil {
info.DriverState = make(map[string]state.DriverState)
}
driverState := info.DriverState[driverName]
driverState.Devices = append(driverState.Devices, deviceState)
info.DriverState[driverName] = driverState
}
// addPodReference adds a pod reference to the claim info.
func (info *ClaimInfo) addPodReference(podUID types.UID) {
info.PodUIDs.Insert(string(podUID))
}
// hasPodReference checks if a pod reference exists in the claim info.
func (info *ClaimInfo) hasPodReference(podUID types.UID) bool {
return info.PodUIDs.Has(string(podUID))
}
// deletePodReference deletes a pod reference from the claim info.
func (info *ClaimInfo) deletePodReference(podUID types.UID) {
info.PodUIDs.Delete(string(podUID))
}
// setPrepared marks the claim info as prepared.
func (info *ClaimInfo) setPrepared() {
info.prepared = true
}
// isPrepared checks if claim info is prepared or not.
func (info *ClaimInfo) isPrepared() bool {
return info.prepared
}
// newClaimInfoCache creates a new claim info cache object, pre-populated from a checkpoint (if present).
func newClaimInfoCache(stateDir, checkpointName string) (*claimInfoCache, error) {
checkpointer, err := state.NewCheckpointer(stateDir, checkpointName)
if err != nil {
return nil, fmt.Errorf("could not initialize checkpoint manager, please drain node and remove dra state file, err: %w", err)
}
checkpoint, err := checkpointer.GetOrCreate()
if err != nil {
return nil, fmt.Errorf("error calling GetOrCreate() on checkpoint state: %w", err)
}
cache := &claimInfoCache{
checkpointer: checkpointer,
claimInfo: make(map[string]*ClaimInfo),
}
entries, err := checkpoint.GetClaimInfoStateList()
if err != nil {
return nil, fmt.Errorf("error calling GetEntries() on checkpoint: %w", err)
}
for _, entry := range entries {
info := newClaimInfoFromState(&entry)
cache.claimInfo[info.Namespace+"/"+info.ClaimName] = info
}
return cache, nil
}
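// Usage sketch (the state directory below is an assumed example path):
//
//	cache, err := newClaimInfoCache("/var/lib/kubelet/dra", "dra_manager_state")
//	if err != nil {
//		return err
//	}
//	_ = cache.withRLock(func() error {
//		if cache.contains("my-claim", "default") {
//			// claim is already tracked
//		}
//		return nil
//	})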
// withLock runs a function while holding the claimInfoCache lock.
func (cache *claimInfoCache) withLock(f func() error) error {
cache.Lock()
defer cache.Unlock()
return f()
}
// withRLock runs a function while holding the claimInfoCache rlock.
func (cache *claimInfoCache) withRLock(f func() error) error {
cache.RLock()
defer cache.RUnlock()
return f()
}
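// The locked helpers keep a cache mutation and its checkpoint write atomic.
// A typical write path looks like this sketch:
//
//	err := cache.withLock(func() error {
//		cache.add(info)
//		return cache.syncToCheckpoint()
//	})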
// add adds a new claim info object into the claim info cache.
func (cache *claimInfoCache) add(info *ClaimInfo) *ClaimInfo {
cache.claimInfo[info.Namespace+"/"+info.ClaimName] = info
return info
}
// contains checks to see if a specific claim info object is already in the cache.
func (cache *claimInfoCache) contains(claimName, namespace string) bool {
_, exists := cache.claimInfo[namespace+"/"+claimName]
return exists
}
// get gets a specific claim info object from the cache.
func (cache *claimInfoCache) get(claimName, namespace string) (*ClaimInfo, bool) {
info, exists := cache.claimInfo[namespace+"/"+claimName]
return info, exists
}
// delete deletes a specific claim info object from the cache.
func (cache *claimInfoCache) delete(claimName, namespace string) {
delete(cache.claimInfo, namespace+"/"+claimName)
}
// hasPodReference checks if there is at least one claim
// that is referenced by the pod with the given UID.
// This function is used indirectly by the status manager
// to check if a pod can enter termination status.
func (cache *claimInfoCache) hasPodReference(uid types.UID) bool {
for _, claimInfo := range cache.claimInfo {
if claimInfo.hasPodReference(uid) {
return true
}
}
return false
}
// syncToCheckpoint syncs the full claim info cache state to a checkpoint.
func (cache *claimInfoCache) syncToCheckpoint() error {
claimInfoStateList := make(state.ClaimInfoStateList, 0, len(cache.claimInfo))
for _, infoClaim := range cache.claimInfo {
claimInfoStateList = append(claimInfoStateList, infoClaim.ClaimInfoState)
}
checkpoint, err := state.NewCheckpoint(claimInfoStateList)
if err != nil {
return err
}
return cache.checkpointer.Store(checkpoint)
}
// cdiDevicesAsList returns a list of CDIDevices from the provided claim info.
// When the request name is non-empty, only devices relevant for that request
// are returned.
func (info *ClaimInfo) cdiDevicesAsList(requestName string) []kubecontainer.CDIDevice {
var cdiDevices []kubecontainer.CDIDevice
for _, driverData := range info.DriverState {
for _, device := range driverData.Devices {
if requestName == "" || len(device.RequestNames) == 0 || slices.Contains(device.RequestNames, requestName) {
for _, cdiDeviceID := range device.CDIDeviceIDs {
cdiDevices = append(cdiDevices, kubecontainer.CDIDevice{Name: cdiDeviceID})
}
}
}
}
return cdiDevices
}
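// As a sketch, for a claim whose driver allocated one device for request
// "gpu" with CDI ID "vendor.com/gpu=0" (assumed example values):
//
//	devices := info.cdiDevicesAsList("gpu")
//
// returns a single kubecontainer.CDIDevice named "vendor.com/gpu=0", while
// passing "" returns the devices of all requests in the claim.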

View File

@ -0,0 +1,553 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dra
import (
"context"
"fmt"
"strconv"
"time"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/klog/v2"
drapb "k8s.io/kubelet/pkg/apis/dra/v1beta1"
dra "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// draManagerStateFileName is the file name where the DRA manager stores its state.
const draManagerStateFileName = "dra_manager_state"
// defaultReconcilePeriod is the default reconciliation period to keep all claim info state in sync.
const defaultReconcilePeriod = 60 * time.Second
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
// GetNodeFunc is a function that returns the node object using the kubelet's node lister.
type GetNodeFunc func() (*v1.Node, error)
// ManagerImpl is the structure in charge of managing DRA drivers.
type ManagerImpl struct {
// cache contains cached claim info
cache *claimInfoCache
// reconcilePeriod is the duration between calls to reconcileLoop.
reconcilePeriod time.Duration
// activePods is a method for listing active pods on the node
// so all claim info state can be updated in the reconciliation loop.
activePods ActivePodsFunc
// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
// We use it to determine when we can treat pods as inactive and react appropriately.
sourcesReady config.SourcesReady
// KubeClient reference
kubeClient clientset.Interface
// getNode is a function that returns the node object using the kubelet's node lister.
getNode GetNodeFunc
}
// NewManagerImpl creates a new manager.
func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string, nodeName types.NodeName) (*ManagerImpl, error) {
claimInfoCache, err := newClaimInfoCache(stateFileDirectory, draManagerStateFileName)
if err != nil {
return nil, fmt.Errorf("failed to create claimInfo cache: %w", err)
}
// TODO: for now the reconcile period is not configurable.
// We should consider making it configurable in the future.
reconcilePeriod := defaultReconcilePeriod
manager := &ManagerImpl{
cache: claimInfoCache,
kubeClient: kubeClient,
reconcilePeriod: reconcilePeriod,
activePods: nil,
sourcesReady: nil,
}
return manager, nil
}
func (m *ManagerImpl) GetWatcherHandler() cache.PluginHandler {
return cache.PluginHandler(dra.NewRegistrationHandler(m.kubeClient, m.getNode))
}
// Start starts the reconcile loop of the manager.
func (m *ManagerImpl) Start(ctx context.Context, activePods ActivePodsFunc, getNode GetNodeFunc, sourcesReady config.SourcesReady) error {
m.activePods = activePods
m.getNode = getNode
m.sourcesReady = sourcesReady
go wait.UntilWithContext(ctx, func(ctx context.Context) { m.reconcileLoop(ctx) }, m.reconcilePeriod)
return nil
}
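// Wiring sketch (the callbacks are assumed to be supplied by the kubelet):
//
//	manager, err := NewManagerImpl(kubeClient, "/var/lib/kubelet/dra", nodeName)
//	if err != nil {
//		return err
//	}
//	if err := manager.Start(ctx, activePodsFn, getNodeFn, sourcesReady); err != nil {
//		return err
//	}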
// reconcileLoop ensures that any stale state in the manager's claimInfoCache gets periodically reconciled.
func (m *ManagerImpl) reconcileLoop(ctx context.Context) {
logger := klog.FromContext(ctx)
// Only once all sources are ready do we attempt to reconcile.
// This ensures that the call to m.activePods() below will succeed with
// the actual active pods list.
if m.sourcesReady == nil || !m.sourcesReady.AllReady() {
return
}
// Get the full list of active pods.
activePods := sets.New[string]()
for _, p := range m.activePods() {
activePods.Insert(string(p.UID))
}
// Get the list of inactive pods still referenced by any claimInfos.
type podClaims struct {
uid types.UID
namespace string
claimNames []string
}
inactivePodClaims := make(map[string]*podClaims)
m.cache.RLock()
for _, claimInfo := range m.cache.claimInfo {
for podUID := range claimInfo.PodUIDs {
if activePods.Has(podUID) {
continue
}
if inactivePodClaims[podUID] == nil {
inactivePodClaims[podUID] = &podClaims{
uid: types.UID(podUID),
namespace: claimInfo.Namespace,
claimNames: []string{},
}
}
inactivePodClaims[podUID].claimNames = append(inactivePodClaims[podUID].claimNames, claimInfo.ClaimName)
}
}
m.cache.RUnlock()
// Loop through all inactive pods and call UnprepareResources on them.
for _, podClaims := range inactivePodClaims {
if err := m.unprepareResources(ctx, podClaims.uid, podClaims.namespace, podClaims.claimNames); err != nil {
logger.Info("Unpreparing pod resources in reconcile loop failed, will retry", "podUID", podClaims.uid, "err", err)
}
}
}
// PrepareResources attempts to prepare all of the required resources
// for the input container, issue NodePrepareResources rpc requests
// for each new resource requirement, process their responses and update the cached
// containerResources on success.
func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
startTime := time.Now()
err := m.prepareResources(ctx, pod)
metrics.DRAOperationsDuration.WithLabelValues("PrepareResources", strconv.FormatBool(err == nil)).Observe(time.Since(startTime).Seconds())
return err
}
func (m *ManagerImpl) prepareResources(ctx context.Context, pod *v1.Pod) error {
logger := klog.FromContext(ctx)
batches := make(map[string][]*drapb.Claim)
resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
for i := range pod.Spec.ResourceClaims {
podClaim := &pod.Spec.ResourceClaims[i]
logger.V(3).Info("Processing resource", "pod", klog.KObj(pod), "podClaim", podClaim.Name)
claimName, mustCheckOwner, err := resourceclaim.Name(pod, podClaim)
if err != nil {
return fmt.Errorf("prepare resource claim: %w", err)
}
if claimName == nil {
// Nothing to do.
logger.V(5).Info("No need to prepare resources, no claim generated", "pod", klog.KObj(pod), "podClaim", podClaim.Name)
continue
}
// Query claim object from the API server
resourceClaim, err := m.kubeClient.ResourceV1beta1().ResourceClaims(pod.Namespace).Get(
ctx,
*claimName,
metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %w", *claimName, pod.Name, err)
}
if mustCheckOwner {
if err = resourceclaim.IsForPod(pod, resourceClaim); err != nil {
return err
}
}
// Check if pod is in the ReservedFor for the claim
if !resourceclaim.IsReservedForPod(pod, resourceClaim) {
return fmt.Errorf("pod %s(%s) is not allowed to use resource claim %s(%s)",
pod.Name, pod.UID, *claimName, resourceClaim.UID)
}
// Atomically perform some operations on the claimInfo cache.
err = m.cache.withLock(func() error {
// Get a reference to the claim info for this claim from the cache.
// If there isn't one yet, then add it to the cache.
claimInfo, exists := m.cache.get(resourceClaim.Name, resourceClaim.Namespace)
if !exists {
ci, err := newClaimInfoFromClaim(resourceClaim)
if err != nil {
return fmt.Errorf("claim %s: %w", klog.KObj(resourceClaim), err)
}
claimInfo = m.cache.add(ci)
logger.V(6).Info("Created new claim info cache entry", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim), "claimInfoEntry", claimInfo)
} else {
logger.V(6).Info("Found existing claim info cache entry", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim), "claimInfoEntry", claimInfo)
}
// Add a reference to the current pod in the claim info.
claimInfo.addPodReference(pod.UID)
// Checkpoint to ensure all claims we plan to prepare are tracked.
// If something goes wrong and the newly referenced pod gets
// deleted without a successful prepare call, we will catch
// that in the reconcile loop and take the appropriate action.
if err := m.cache.syncToCheckpoint(); err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
}
// If this claim is already prepared, there is no need to prepare it again.
if claimInfo.isPrepared() {
logger.V(5).Info("Resources already prepared", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim))
return nil
}
// This saved claim will be used to update ClaimInfo cache
// after NodePrepareResources GRPC succeeds
resourceClaims[claimInfo.ClaimUID] = resourceClaim
// Loop through all drivers and prepare for calling NodePrepareResources.
claim := &drapb.Claim{
Namespace: claimInfo.Namespace,
UID: string(claimInfo.ClaimUID),
Name: claimInfo.ClaimName,
}
for driverName := range claimInfo.DriverState {
batches[driverName] = append(batches[driverName], claim)
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
}
// Call NodePrepareResources for all claims in each batch.
// If there is any error, processing gets aborted.
// We could try to continue, but that would make the code more complex.
for driverName, claims := range batches {
// Call NodePrepareResources RPC for all resource handles.
client, err := dra.NewDRAPluginClient(driverName)
if err != nil {
return fmt.Errorf("failed to get gRPC client for driver %s: %w", driverName, err)
}
response, err := client.NodePrepareResources(ctx, &drapb.NodePrepareResourcesRequest{Claims: claims})
if err != nil {
// General error unrelated to any particular claim.
return fmt.Errorf("NodePrepareResources failed: %w", err)
}
for claimUID, result := range response.Claims {
reqClaim := lookupClaimRequest(claims, claimUID)
if reqClaim == nil {
return fmt.Errorf("NodePrepareResources returned result for unknown claim UID %s", claimUID)
}
if result.GetError() != "" {
return fmt.Errorf("NodePrepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
}
claim := resourceClaims[types.UID(claimUID)]
// Add the prepared CDI devices to the claim info
err := m.cache.withLock(func() error {
info, exists := m.cache.get(claim.Name, claim.Namespace)
if !exists {
return fmt.Errorf("unable to get claim info for claim %s in namespace %s", claim.Name, claim.Namespace)
}
for _, device := range result.GetDevices() {
info.addDevice(driverName, state.Device{PoolName: device.PoolName, DeviceName: device.DeviceName, RequestNames: device.RequestNames, CDIDeviceIDs: device.CDIDeviceIDs})
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
}
unfinished := len(claims) - len(response.Claims)
if unfinished != 0 {
return fmt.Errorf("NodePrepareResources left out %d claims", unfinished)
}
}
// Atomically perform some operations on the claimInfo cache.
err := m.cache.withLock(func() error {
// Mark all pod claims as prepared.
for _, claim := range resourceClaims {
info, exists := m.cache.get(claim.Name, claim.Namespace)
if !exists {
return fmt.Errorf("unable to get claim info for claim %s in namespace %s", claim.Name, claim.Namespace)
}
info.setPrepared()
}
// Checkpoint to ensure all prepared claims are tracked with their list
// of CDI devices attached.
if err := m.cache.syncToCheckpoint(); err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
return nil
}
func lookupClaimRequest(claims []*drapb.Claim, claimUID string) *drapb.Claim {
for _, claim := range claims {
if claim.UID == claimUID {
return claim
}
}
return nil
}
// GetResources gets a ContainerInfo object from the claimInfo cache.
// This information is used by the caller to update a container config.
func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*ContainerInfo, error) {
cdiDevices := []kubecontainer.CDIDevice{}
for i := range pod.Spec.ResourceClaims {
podClaim := &pod.Spec.ResourceClaims[i]
claimName, _, err := resourceclaim.Name(pod, podClaim)
if err != nil {
return nil, fmt.Errorf("list resource claims: %w", err)
}
// The claim name might be nil if no underlying resource claim
// was generated for the referenced claim. There are valid use
// cases when this might happen, so we simply skip it.
if claimName == nil {
continue
}
for _, claim := range container.Resources.Claims {
if podClaim.Name != claim.Name {
continue
}
err := m.cache.withRLock(func() error {
claimInfo, exists := m.cache.get(*claimName, pod.Namespace)
if !exists {
return fmt.Errorf("unable to get claim info for claim %s in namespace %s", *claimName, pod.Namespace)
}
// As of Kubernetes 1.31, CDI device IDs are not passed via annotations anymore.
cdiDevices = append(cdiDevices, claimInfo.cdiDevicesAsList(claim.Request)...)
return nil
})
if err != nil {
return nil, fmt.Errorf("locked cache operation: %w", err)
}
}
}
return &ContainerInfo{CDIDevices: cdiDevices}, nil
}
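// Consumption sketch: the runtime-facing caller typically copies the CDI
// device names into the container config (containerConfig is an assumed name):
//
//	info, err := m.GetResources(pod, container)
//	if err != nil {
//		return err
//	}
//	for _, device := range info.CDIDevices {
//		containerConfig.CDIDevices = append(containerConfig.CDIDevices, device.Name)
//	}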
// UnprepareResources calls a driver's NodeUnprepareResource API for each resource claim owned by a pod.
// This function is idempotent and may be called multiple times against the same pod.
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
// already been successfully unprepared.
func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
var err error
defer func(startTime time.Time) {
metrics.DRAOperationsDuration.WithLabelValues("UnprepareResources", strconv.FormatBool(err == nil)).Observe(time.Since(startTime).Seconds())
}(time.Now())
var claimNames []string
for i := range pod.Spec.ResourceClaims {
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
if err != nil {
return fmt.Errorf("unprepare resource claim: %w", err)
}
// The claim name might be nil if no underlying resource claim
// was generated for the referenced claim. There are valid use
// cases when this might happen, so we simply skip it.
if claimName == nil {
continue
}
claimNames = append(claimNames, *claimName)
}
err = m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
return err
}
func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {
logger := klog.FromContext(ctx)
batches := make(map[string][]*drapb.Claim)
claimNamesMap := make(map[types.UID]string)
for _, claimName := range claimNames {
// Atomically perform some operations on the claimInfo cache.
err := m.cache.withLock(func() error {
// Get the claim info from the cache
claimInfo, exists := m.cache.get(claimName, namespace)
// Skip calling NodeUnprepareResources if claim info is not cached
if !exists {
return nil
}
// Skip calling NodeUnprepareResources if other pods are still referencing it
if len(claimInfo.PodUIDs) > 1 {
// We delay checkpointing of this change until
// UnprepareResources returns successfully. It is OK to do
// this because we will only return successfully from this call
// if the checkpoint has succeeded. That means if the kubelet
// is ever restarted before this checkpoint succeeds, we will
// simply call into this (idempotent) function again.
claimInfo.deletePodReference(podUID)
return nil
}
// This claimInfo name will be used to update ClaimInfo cache
// after NodeUnprepareResources GRPC succeeds
claimNamesMap[claimInfo.ClaimUID] = claimInfo.ClaimName
// Loop through all drivers and prepare for calling NodeUnprepareResources.
claim := &drapb.Claim{
Namespace: claimInfo.Namespace,
UID: string(claimInfo.ClaimUID),
Name: claimInfo.ClaimName,
}
for driverName := range claimInfo.DriverState {
batches[driverName] = append(batches[driverName], claim)
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
}
// Call NodeUnprepareResources for all claims in each batch.
// If there is any error, processing gets aborted.
// We could try to continue, but that would make the code more complex.
for driverName, claims := range batches {
// Call NodeUnprepareResources RPC for all resource handles.
client, err := dra.NewDRAPluginClient(driverName)
if err != nil {
return fmt.Errorf("get gRPC client for DRA driver %s: %w", driverName, err)
}
response, err := client.NodeUnprepareResources(ctx, &drapb.NodeUnprepareResourcesRequest{Claims: claims})
if err != nil {
// General error unrelated to any particular claim.
return fmt.Errorf("NodeUnprepareResources failed: %w", err)
}
for claimUID, result := range response.Claims {
reqClaim := lookupClaimRequest(claims, claimUID)
if reqClaim == nil {
return fmt.Errorf("NodeUnprepareResources returned result for unknown claim UID %s", claimUID)
}
if result.GetError() != "" {
return fmt.Errorf("NodeUnprepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
}
}
unfinished := len(claims) - len(response.Claims)
if unfinished != 0 {
return fmt.Errorf("NodeUnprepareResources left out %d claims", unfinished)
}
}
// Atomically perform some operations on the claimInfo cache.
err := m.cache.withLock(func() error {
// Delete all claimInfos from the cache that have just been unprepared.
for _, claimName := range claimNamesMap {
claimInfo, _ := m.cache.get(claimName, namespace)
m.cache.delete(claimName, namespace)
logger.V(6).Info("Deleted claim info cache entry", "claim", klog.KRef(namespace, claimName), "claimInfoEntry", claimInfo)
}
// Atomically sync the cache back to the checkpoint.
if err := m.cache.syncToCheckpoint(); err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
return nil
}
// PodMightNeedToUnprepareResources returns true if the pod might need to
// unprepare resources
func (m *ManagerImpl) PodMightNeedToUnprepareResources(uid types.UID) bool {
m.cache.Lock()
defer m.cache.Unlock()
return m.cache.hasPodReference(uid)
}
// GetContainerClaimInfos gets the ClaimInfo objects for a container.
func (m *ManagerImpl) GetContainerClaimInfos(pod *v1.Pod, container *v1.Container) ([]*ClaimInfo, error) {
claimInfos := make([]*ClaimInfo, 0, len(pod.Spec.ResourceClaims))
for i, podResourceClaim := range pod.Spec.ResourceClaims {
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
if err != nil {
return nil, fmt.Errorf("determine resource claim information: %w", err)
}
// The claim name might be nil if no underlying resource claim
// was generated for the referenced claim; skip it to avoid
// dereferencing a nil pointer below.
if claimName == nil {
continue
}
for _, claim := range container.Resources.Claims {
if podResourceClaim.Name != claim.Name {
continue
}
err := m.cache.withRLock(func() error {
claimInfo, exists := m.cache.get(*claimName, pod.Namespace)
if !exists {
return fmt.Errorf("unable to get claim info for claim %s in namespace %s", *claimName, pod.Namespace)
}
claimInfos = append(claimInfos, claimInfo.DeepCopy())
return nil
})
if err != nil {
return nil, fmt.Errorf("locked cache operation: %w", err)
}
}
}
return claimInfos, nil
}

View File

@ -0,0 +1,181 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package plugin
import (
"context"
"errors"
"fmt"
"net"
"sync"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/status"
"k8s.io/klog/v2"
drapbv1alpha4 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
drapbv1beta1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
// NewDRAPluginClient returns a wrapper around those gRPC methods of a DRA
// driver kubelet plugin which need to be called by kubelet. The wrapper
// handles gRPC connection management and logging. Connections are reused
// across different NewDRAPluginClient calls.
func NewDRAPluginClient(pluginName string) (*Plugin, error) {
if pluginName == "" {
return nil, fmt.Errorf("plugin name is empty")
}
existingPlugin := draPlugins.get(pluginName)
if existingPlugin == nil {
return nil, fmt.Errorf("plugin name %s not found in the list of registered DRA plugins", pluginName)
}
return existingPlugin, nil
}
type Plugin struct {
name string
backgroundCtx context.Context
cancel func(cause error)
mutex sync.Mutex
conn *grpc.ClientConn
endpoint string
chosenService string // e.g. drapbv1beta1.DRAPluginService
clientCallTimeout time.Duration
}
func (p *Plugin) getOrCreateGRPCConn() (*grpc.ClientConn, error) {
p.mutex.Lock()
defer p.mutex.Unlock()
if p.conn != nil {
return p.conn, nil
}
ctx := p.backgroundCtx
logger := klog.FromContext(ctx)
network := "unix"
logger.V(4).Info("Creating new gRPC connection", "protocol", network, "endpoint", p.endpoint)
// grpc.Dial is deprecated. grpc.NewClient should be used instead.
// For now this gets ignored because this function is meant to establish
// the connection, with the one second timeout below. Perhaps that
// approach should be reconsidered?
//nolint:staticcheck
conn, err := grpc.Dial(
p.endpoint,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithContextDialer(func(ctx context.Context, target string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, network, target)
}),
grpc.WithChainUnaryInterceptor(newMetricsInterceptor(p.name)),
)
if err != nil {
return nil, err
}
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
if ok := conn.WaitForStateChange(ctx, connectivity.Connecting); !ok {
return nil, errors.New("timed out waiting for gRPC connection to be ready")
}
p.conn = conn
return p.conn, nil
}
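// Call sketch: the wrapper is looked up once per driver and the cached
// connection is reused across calls ("vendor.com/driver" is an assumed name):
//
//	plugin, err := NewDRAPluginClient("vendor.com/driver")
//	if err != nil {
//		return err
//	}
//	resp, err := plugin.NodePrepareResources(ctx, req)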
func (p *Plugin) NodePrepareResources(
ctx context.Context,
req *drapbv1beta1.NodePrepareResourcesRequest,
opts ...grpc.CallOption,
) (*drapbv1beta1.NodePrepareResourcesResponse, error) {
logger := klog.FromContext(ctx)
logger.V(4).Info("Calling NodePrepareResources rpc", "request", req)
conn, err := p.getOrCreateGRPCConn()
if err != nil {
return nil, err
}
ctx, cancel := context.WithTimeout(ctx, p.clientCallTimeout)
defer cancel()
var response *drapbv1beta1.NodePrepareResourcesResponse
switch p.chosenService {
case drapbv1beta1.DRAPluginService:
nodeClient := drapbv1beta1.NewDRAPluginClient(conn)
response, err = nodeClient.NodePrepareResources(ctx, req)
case drapbv1alpha4.NodeService:
nodeClient := drapbv1alpha4.NewNodeClient(conn)
response, err = drapbv1alpha4.V1Alpha4ClientWrapper{NodeClient: nodeClient}.NodePrepareResources(ctx, req)
default:
// Shouldn't happen, validateSupportedServices should only
// return services we support here.
return nil, fmt.Errorf("internal error: unsupported chosen service: %q", p.chosenService)
}
logger.V(4).Info("Done calling NodePrepareResources rpc", "response", response, "err", err)
return response, err
}
func (p *Plugin) NodeUnprepareResources(
ctx context.Context,
req *drapbv1beta1.NodeUnprepareResourcesRequest,
opts ...grpc.CallOption,
) (*drapbv1beta1.NodeUnprepareResourcesResponse, error) {
logger := klog.FromContext(ctx)
logger.V(4).Info("Calling NodeUnprepareResource rpc", "request", req)
conn, err := p.getOrCreateGRPCConn()
if err != nil {
return nil, err
}
ctx, cancel := context.WithTimeout(ctx, p.clientCallTimeout)
defer cancel()
var response *drapbv1beta1.NodeUnprepareResourcesResponse
switch p.chosenService {
case drapbv1beta1.DRAPluginService:
nodeClient := drapbv1beta1.NewDRAPluginClient(conn)
response, err = nodeClient.NodeUnprepareResources(ctx, req)
case drapbv1alpha4.NodeService:
nodeClient := drapbv1alpha4.NewNodeClient(conn)
response, err = drapbv1alpha4.V1Alpha4ClientWrapper{NodeClient: nodeClient}.NodeUnprepareResources(ctx, req)
default:
// Shouldn't happen, validateSupportedServices should only
// return services we support here.
return nil, fmt.Errorf("internal error: unsupported chosen service: %q", p.chosenService)
}
logger.V(4).Info("Done calling NodeUnprepareResources rpc", "response", response, "err", err)
return response, err
}
func newMetricsInterceptor(pluginName string) grpc.UnaryClientInterceptor {
return func(ctx context.Context, method string, req, reply any, conn *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
start := time.Now()
err := invoker(ctx, method, req, reply, conn, opts...)
metrics.DRAGRPCOperationsDuration.WithLabelValues(pluginName, method, status.Code(err).String()).Observe(time.Since(start).Seconds())
return err
}
}

View File

@ -0,0 +1,79 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package plugin
import (
"errors"
"sync"
)
// pluginsStore holds a list of DRA Plugins.
type pluginsStore struct {
sync.RWMutex
store map[string]*Plugin
}
// draPlugins map keeps track of all registered DRA plugins on the node
// and their corresponding sockets.
var draPlugins = &pluginsStore{}
// get lets you retrieve a DRA Plugin by name.
// This method is protected by a mutex.
func (s *pluginsStore) get(pluginName string) *Plugin {
s.RLock()
defer s.RUnlock()
return s.store[pluginName]
}
// add lets you save a DRA Plugin to the list under its name.
// This method is protected by a mutex.
func (s *pluginsStore) add(p *Plugin) (replacedPlugin *Plugin, replaced bool) {
s.Lock()
defer s.Unlock()
if s.store == nil {
s.store = make(map[string]*Plugin)
}
replacedPlugin, exists := s.store[p.name]
s.store[p.name] = p
if replacedPlugin != nil && replacedPlugin.cancel != nil {
replacedPlugin.cancel(errors.New("plugin got replaced"))
}
return replacedPlugin, exists
}
// delete lets you remove a DRA Plugin by name.
// This method is protected by a mutex.
func (s *pluginsStore) delete(pluginName string) *Plugin {
s.Lock()
defer s.Unlock()
p, exists := s.store[pluginName]
if !exists {
return nil
}
if p.cancel != nil {
p.cancel(errors.New("plugin got removed"))
}
delete(s.store, pluginName)
return p
}
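// Registry semantics sketch: adding a plugin under an existing name replaces
// (and cancels) the previous instance:
//
//	old, replaced := draPlugins.add(&Plugin{name: "vendor.com/driver"})
//	p := draPlugins.get("vendor.com/driver")   // returns the new instance
//	_ = draPlugins.delete("vendor.com/driver") // cancels and removes it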

View File

@ -0,0 +1,249 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package plugin
import (
"context"
"errors"
"fmt"
"slices"
"time"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
drapbv1alpha4 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
drapbv1beta1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// defaultClientCallTimeout is the default amount of time that a DRA driver has
// to respond to any of the gRPC calls. kubelet uses this value by passing nil
// to RegisterPlugin. Some tests use a different, usually shorter timeout to
// speed up testing.
//
// This is half of the kubelet retry period (according to
// https://github.com/kubernetes/kubernetes/commit/0449cef8fd5217d394c5cd331d852bd50983e6b3).
const defaultClientCallTimeout = 45 * time.Second
// RegistrationHandler is the handler which is fed to the pluginwatcher API.
type RegistrationHandler struct {
// backgroundCtx is used for all future activities of the handler.
// This is necessary because it implements APIs which don't
// provide a context.
backgroundCtx context.Context
kubeClient kubernetes.Interface
getNode func() (*v1.Node, error)
}
var _ cache.PluginHandler = &RegistrationHandler{}
// NewRegistrationHandler returns a new registration handler.
//
// Must only be called once per process because it manages global state.
// If a kubeClient is provided, then it synchronizes ResourceSlices
// with the resource information provided by plugins.
func NewRegistrationHandler(kubeClient kubernetes.Interface, getNode func() (*v1.Node, error)) *RegistrationHandler {
handler := &RegistrationHandler{
// The context and thus logger should come from the caller.
backgroundCtx: klog.NewContext(context.TODO(), klog.LoggerWithName(klog.TODO(), "DRA registration handler")),
kubeClient: kubeClient,
getNode: getNode,
}
// When kubelet starts up, no DRA driver has registered yet. None of
// the drivers are usable until they come back, which might not happen
// at all. Therefore it is better to not advertise any local resources
// because pods could get stuck on the node waiting for the driver
// to start up.
//
// This has to run in the background.
go handler.wipeResourceSlices("")
return handler
}
// wipeResourceSlices deletes ResourceSlices of the node, optionally just for a specific driver.
func (h *RegistrationHandler) wipeResourceSlices(driver string) {
if h.kubeClient == nil {
return
}
ctx := h.backgroundCtx
logger := klog.FromContext(ctx)
backoff := wait.Backoff{
Duration: time.Second,
Factor: 2,
Jitter: 0.2,
Cap: 5 * time.Minute,
Steps: 100,
}
// Error logging is done inside the loop. Context cancellation doesn't get logged.
_ = wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
node, err := h.getNode()
if apierrors.IsNotFound(err) {
return false, nil
}
if err != nil {
logger.Error(err, "Unexpected error checking for node")
return false, nil
}
fieldSelector := fields.Set{resourceapi.ResourceSliceSelectorNodeName: node.Name}
if driver != "" {
fieldSelector[resourceapi.ResourceSliceSelectorDriver] = driver
}
err = h.kubeClient.ResourceV1beta1().ResourceSlices().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{FieldSelector: fieldSelector.String()})
switch {
case err == nil:
logger.V(3).Info("Deleted ResourceSlices", "fieldSelector", fieldSelector)
return true, nil
case apierrors.IsUnauthorized(err):
// This can happen while kubelet is still figuring out
// its credentials.
logger.V(5).Info("Deleting ResourceSlice failed, retrying", "fieldSelector", fieldSelector, "err", err)
return false, nil
default:
// Log and retry for other errors.
logger.V(3).Info("Deleting ResourceSlice failed, retrying", "fieldSelector", fieldSelector, "err", err)
return false, nil
}
})
}
// RegisterPlugin is called when a plugin can be registered.
//
// DRA uses the version array in the registration API to enumerate all gRPC
// services that the plugin provides, using the "<gRPC package name>.<service
// name>" format (e.g. "v1beta1.DRAPlugin"). This allows kubelet to determine
// in advance which version to use and which optional services the plugin
// supports.
func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string, supportedServices []string, pluginClientTimeout *time.Duration) error {
// Prepare a context with its own logger for the plugin.
//
// The lifecycle of the plugin's background activities is tied to our
// root context, so canceling that will also cancel the plugin.
//
// The logger injects the plugin name as additional value
// into all log output related to the plugin.
ctx := h.backgroundCtx
logger := klog.FromContext(ctx)
logger = klog.LoggerWithValues(logger, "pluginName", pluginName)
ctx = klog.NewContext(ctx, logger)
logger.V(3).Info("Register new DRA plugin", "endpoint", endpoint)
chosenService, err := h.validateSupportedServices(pluginName, supportedServices)
if err != nil {
return fmt.Errorf("version check of plugin %s failed: %w", pluginName, err)
}
var timeout time.Duration
if pluginClientTimeout == nil {
timeout = defaultClientCallTimeout
} else {
timeout = *pluginClientTimeout
}
ctx, cancel := context.WithCancelCause(ctx)
pluginInstance := &Plugin{
name: pluginName,
backgroundCtx: ctx,
cancel: cancel,
conn: nil,
endpoint: endpoint,
chosenService: chosenService,
clientCallTimeout: timeout,
}
// Store the endpoint of the newly registered DRA plugin in the map, keyed by
// plugin name, so that all other DRA components can look up the actual socket
// of a DRA plugin by its name.
if oldPlugin, replaced := draPlugins.add(pluginInstance); replaced {
logger.V(1).Info("DRA plugin already registered, the old plugin was replaced and will be forgotten by the kubelet till the next kubelet restart", "oldEndpoint", oldPlugin.endpoint)
}
return nil
}
// validateSupportedServices identifies the highest supported gRPC service for
// NodePrepareResources and NodeUnprepareResources and returns its name
// (e.g. [drapbv1beta1.DRAPluginService]). An error is returned if the plugin
// is unusable.
func (h *RegistrationHandler) validateSupportedServices(pluginName string, supportedServices []string) (string, error) {
if len(supportedServices) == 0 {
return "", errors.New("empty list of supported gRPC services (aka supported versions)")
}
// Pick most recent version if available.
chosenService := ""
for _, service := range []string{
// Sorted by most recent first, oldest last.
drapbv1beta1.DRAPluginService,
drapbv1alpha4.NodeService,
} {
if slices.Contains(supportedServices, service) {
chosenService = service
break
}
}
// Fall back to alpha if necessary because
// plugins at that time didn't advertise gRPC services.
if chosenService == "" {
chosenService = drapbv1alpha4.NodeService
}
return chosenService, nil
}
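// Selection sketch: a plugin advertising both services gets the newer one:
//
//	chosen, err := h.validateSupportedServices("vendor.com/driver",
//		[]string{drapbv1alpha4.NodeService, drapbv1beta1.DRAPluginService})
//	// chosen == drapbv1beta1.DRAPluginService, err == nil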
// DeRegisterPlugin is called when a plugin has removed its socket,
// signaling it is no longer available.
func (h *RegistrationHandler) DeRegisterPlugin(pluginName string) {
if p := draPlugins.delete(pluginName); p != nil {
logger := klog.FromContext(p.backgroundCtx)
logger.V(3).Info("Deregister DRA plugin", "endpoint", p.endpoint)
// Clean up the ResourceSlices for the deleted Plugin since it
// may have died without doing so itself and might never come
// back.
go h.wipeResourceSlices(pluginName)
return
}
logger := klog.FromContext(h.backgroundCtx)
logger.V(3).Info("Deregister DRA plugin not necessary, was already removed")
}
// ValidatePlugin is called by kubelet's plugin watcher upon detection
// of a new registration socket opened by a DRA plugin.
func (h *RegistrationHandler) ValidatePlugin(pluginName string, endpoint string, supportedServices []string) error {
_, err := h.validateSupportedServices(pluginName, supportedServices)
if err != nil {
return fmt.Errorf("invalid versions of plugin %s: %w", pluginName, err)
}
return nil
}

View File

@ -0,0 +1,107 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"encoding/json"
"hash/crc32"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
const (
CheckpointAPIGroup = "checkpoint.dra.kubelet.k8s.io"
CheckpointKind = "DRACheckpoint"
CheckpointAPIVersion = CheckpointAPIGroup + "/v1"
)
// Checkpoint represents a structure to store DRA checkpoint data
type Checkpoint struct {
// Data is a JSON serialized checkpoint data
Data string
// Checksum is a checksum of Data
Checksum uint32
}
type CheckpointData struct {
metav1.TypeMeta
ClaimInfoStateList ClaimInfoStateList
}
// NewCheckpoint creates a new checkpoint from a list of claim info states
func NewCheckpoint(data ClaimInfoStateList) (*Checkpoint, error) {
cpData := &CheckpointData{
TypeMeta: metav1.TypeMeta{
Kind: CheckpointKind,
APIVersion: CheckpointAPIVersion,
},
ClaimInfoStateList: data,
}
cpDataBytes, err := json.Marshal(cpData)
if err != nil {
return nil, err
}
cp := &Checkpoint{
Data: string(cpDataBytes),
Checksum: crc32.ChecksumIEEE(cpDataBytes),
}
return cp, nil
}
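// Round-trip sketch:
//
//	cp, _ := NewCheckpoint(ClaimInfoStateList{})
//	blob, _ := cp.MarshalCheckpoint()
//	restored := &Checkpoint{}
//	_ = restored.UnmarshalCheckpoint(blob) // also verifies the CRC32 checksum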
// MarshalCheckpoint marshals checkpoint to JSON
func (cp *Checkpoint) MarshalCheckpoint() ([]byte, error) {
return json.Marshal(cp)
}
// UnmarshalCheckpoint unmarshals checkpoint from JSON
// and verifies its data checksum
func (cp *Checkpoint) UnmarshalCheckpoint(blob []byte) error {
if err := json.Unmarshal(blob, cp); err != nil {
return err
}
// verify checksum
if err := cp.VerifyChecksum(); err != nil {
return err
}
return nil
}
// VerifyChecksum verifies that current checksum
// of checkpointed Data is valid
func (cp *Checkpoint) VerifyChecksum() error {
expectedCS := crc32.ChecksumIEEE([]byte(cp.Data))
if expectedCS != cp.Checksum {
return &errors.CorruptCheckpointError{ActualCS: uint64(cp.Checksum), ExpectedCS: uint64(expectedCS)}
}
return nil
}
// GetClaimInfoStateList returns list of claim info states from checkpoint
func (cp *Checkpoint) GetClaimInfoStateList() (ClaimInfoStateList, error) {
var data CheckpointData
if err := json.Unmarshal([]byte(cp.Data), &data); err != nil {
return nil, err
}
return data.ClaimInfoStateList, nil
}

View File

@ -0,0 +1,98 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"errors"
"fmt"
"sync"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
checkpointerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
type Checkpointer interface {
GetOrCreate() (*Checkpoint, error)
Store(*Checkpoint) error
}
type checkpointer struct {
sync.RWMutex
checkpointManager checkpointmanager.CheckpointManager
checkpointName string
}
// NewCheckpointer creates a new checkpointer for keeping track of claim info with a checkpoint backend.
func NewCheckpointer(stateDir, checkpointName string) (Checkpointer, error) {
if len(checkpointName) == 0 {
return nil, fmt.Errorf("received empty string instead of checkpointName")
}
checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
if err != nil {
return nil, fmt.Errorf("failed to initialize checkpoint manager: %w", err)
}
checkpointer := &checkpointer{
checkpointManager: checkpointManager,
checkpointName: checkpointName,
}
return checkpointer, nil
}
// GetOrCreate gets the checkpoint with the list of claim info states,
// or creates an empty checkpoint if one doesn't exist yet.
func (sc *checkpointer) GetOrCreate() (*Checkpoint, error) {
sc.Lock()
defer sc.Unlock()
checkpoint, err := NewCheckpoint(nil)
if err != nil {
return nil, fmt.Errorf("failed to create new checkpoint: %w", err)
}
err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint)
if errors.Is(err, checkpointerrors.ErrCheckpointNotFound) {
err = sc.store(checkpoint)
if err != nil {
return nil, fmt.Errorf("failed to store checkpoint %v: %w", sc.checkpointName, err)
}
return checkpoint, nil
}
if err != nil {
return nil, fmt.Errorf("failed to get checkpoint %v: %w", sc.checkpointName, err)
}
return checkpoint, nil
}
// Store stores the checkpoint to a file.
func (sc *checkpointer) Store(checkpoint *Checkpoint) error {
sc.Lock()
defer sc.Unlock()
return sc.store(checkpoint)
}
// store saves state to a checkpoint; the caller is responsible for locking.
func (sc *checkpointer) store(checkpoint *Checkpoint) error {
if err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint); err != nil {
return fmt.Errorf("could not save checkpoint %s: %w", sc.checkpointName, err)
}
return nil
}
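// Typical flow sketch: load (or create) the checkpoint at startup, then
// persist after every state change:
//
//	cp, err := checkpointer.GetOrCreate()
//	if err != nil {
//		return err
//	}
//	// ... mutate the in-memory state and build an updated checkpoint ...
//	err = checkpointer.Store(updated)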

View File

@ -0,0 +1,59 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
)
type ClaimInfoStateList []ClaimInfoState
// +k8s:deepcopy-gen=true
type ClaimInfoState struct {
// ClaimUID is the UID of a resource claim
ClaimUID types.UID
// ClaimName is the name of a resource claim
ClaimName string
// Namespace is a claim namespace
Namespace string
// PodUIDs is a set of pod UIDs that reference a resource
PodUIDs sets.Set[string]
// DriverState contains information about all drivers which have allocation
// results in the claim, even if they don't provide devices for their results.
DriverState map[string]DriverState
}
// DriverState is used to store per-device claim info state in a checkpoint
// +k8s:deepcopy-gen=true
type DriverState struct {
Devices []Device
}
// Device is how a DRA driver describes an allocated device in a claim
// to kubelet. RequestNames and CDI device IDs are optional.
// +k8s:deepcopy-gen=true
type Device struct {
PoolName string
DeviceName string
RequestNames []string
CDIDeviceIDs []string
}
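// A populated checkpoint entry might look like this sketch (all values are
// assumed examples):
//
//	ClaimInfoState{
//		ClaimUID:  types.UID("b0d45a9c-0000-0000-0000-000000000000"),
//		ClaimName: "my-claim",
//		Namespace: "default",
//		PodUIDs:   sets.New[string]("f10d2d44-0000-0000-0000-000000000000"),
//		DriverState: map[string]DriverState{
//			"vendor.com/driver": {Devices: []Device{{
//				PoolName:     "pool-0",
//				DeviceName:   "gpu-0",
//				CDIDeviceIDs: []string{"vendor.com/gpu=0"},
//			}}},
//		},
//	}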

View File

@ -0,0 +1,105 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by deepcopy-gen. DO NOT EDIT.
package state
import (
sets "k8s.io/apimachinery/pkg/util/sets"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ClaimInfoState) DeepCopyInto(out *ClaimInfoState) {
*out = *in
if in.PodUIDs != nil {
in, out := &in.PodUIDs, &out.PodUIDs
*out = make(sets.Set[string], len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.DriverState != nil {
in, out := &in.DriverState, &out.DriverState
*out = make(map[string]DriverState, len(*in))
for key, val := range *in {
(*out)[key] = *val.DeepCopy()
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClaimInfoState.
func (in *ClaimInfoState) DeepCopy() *ClaimInfoState {
if in == nil {
return nil
}
out := new(ClaimInfoState)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Device) DeepCopyInto(out *Device) {
*out = *in
if in.RequestNames != nil {
in, out := &in.RequestNames, &out.RequestNames
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.CDIDeviceIDs != nil {
in, out := &in.CDIDeviceIDs, &out.CDIDeviceIDs
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Device.
func (in *Device) DeepCopy() *Device {
if in == nil {
return nil
}
out := new(Device)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DriverState) DeepCopyInto(out *DriverState) {
*out = *in
if in.Devices != nil {
in, out := &in.Devices, &out.Devices
*out = make([]Device, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DriverState.
func (in *DriverState) DeepCopy() *DriverState {
if in == nil {
return nil
}
out := new(DriverState)
in.DeepCopyInto(out)
return out
}

View File

@ -0,0 +1,61 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dra
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// Manager manages all the DRA resource plugins running on a node.
type Manager interface {
// GetWatcherHandler returns the plugin handler for the DRA.
GetWatcherHandler() cache.PluginHandler
// Start starts the reconcile loop of the manager.
// This will ensure that all claims are unprepared even if pods get deleted unexpectedly.
Start(ctx context.Context, activePods ActivePodsFunc, getNode GetNodeFunc, sourcesReady config.SourcesReady) error
// PrepareResources prepares resources for a pod.
// It communicates with the DRA resource plugin to prepare resources.
PrepareResources(ctx context.Context, pod *v1.Pod) error
// UnprepareResources calls the DRA plugin's NodeUnprepareResources gRPC API to unprepare pod resources
UnprepareResources(ctx context.Context, pod *v1.Pod) error
// GetResources gets a ContainerInfo object from the claimInfo cache.
// This information is used by the caller to update a container config.
GetResources(pod *v1.Pod, container *v1.Container) (*ContainerInfo, error)
// PodMightNeedToUnprepareResources returns true if the pod with the given UID
// might need to unprepare resources.
PodMightNeedToUnprepareResources(UID types.UID) bool
// GetContainerClaimInfos gets Container ClaimInfo objects
GetContainerClaimInfos(pod *v1.Pod, container *v1.Container) ([]*ClaimInfo, error)
}
// ContainerInfo contains information required by the runtime to consume prepared resources.
type ContainerInfo struct {
// CDI Devices for the container
CDIDevices []kubecontainer.CDIDevice
}

View File

@ -0,0 +1,39 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by deepcopy-gen. DO NOT EDIT.
package dra
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ClaimInfo) DeepCopyInto(out *ClaimInfo) {
*out = *in
in.ClaimInfoState.DeepCopyInto(&out.ClaimInfoState)
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClaimInfo.
func (in *ClaimInfo) DeepCopy() *ClaimInfo {
if in == nil {
return nil
}
out := new(ClaimInfo)
in.DeepCopyInto(out)
return out
}

View File

@ -0,0 +1,270 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"context"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/server/healthz"
internalapi "k8s.io/cri-api/pkg/apis"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/status"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
type FakeContainerManager struct {
sync.Mutex
CalledFunctions []string
PodContainerManager *FakePodContainerManager
shouldResetExtendedResourceCapacity bool
}
var _ ContainerManager = &FakeContainerManager{}
func NewFakeContainerManager() *FakeContainerManager {
return &FakeContainerManager{
PodContainerManager: NewFakePodContainerManager(),
}
}
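// Test usage sketch: the fake records which methods were called so tests can
// assert on the interaction:
//
//	fake := NewFakeContainerManager()
//	_ = fake.UpdateQOSCgroups()
//	// fake.CalledFunctions now contains "UpdateQOSCgroups"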
func (cm *FakeContainerManager) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ GetNodeFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "Start")
return nil
}
func (cm *FakeContainerManager) SystemCgroupsLimit() v1.ResourceList {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "SystemCgroupsLimit")
return v1.ResourceList{}
}
func (cm *FakeContainerManager) GetNodeConfig() NodeConfig {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetNodeConfig")
return NodeConfig{}
}
func (cm *FakeContainerManager) GetMountedSubsystems() *CgroupSubsystems {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetMountedSubsystems")
return &CgroupSubsystems{}
}
func (cm *FakeContainerManager) GetQOSContainersInfo() QOSContainersInfo {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "QOSContainersInfo")
return QOSContainersInfo{}
}
func (cm *FakeContainerManager) UpdateQOSCgroups() error {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "UpdateQOSCgroups")
return nil
}
func (cm *FakeContainerManager) Status() Status {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "Status")
return Status{}
}
func (cm *FakeContainerManager) GetNodeAllocatableReservation() v1.ResourceList {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetNodeAllocatableReservation")
return nil
}
func (cm *FakeContainerManager) GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetCapacity")
if !localStorageCapacityIsolation {
return v1.ResourceList{}
}
c := v1.ResourceList{
v1.ResourceEphemeralStorage: *resource.NewQuantity(
int64(0),
resource.BinarySI),
}
return c
}
func (cm *FakeContainerManager) GetPluginRegistrationHandlers() map[string]cache.PluginHandler {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPluginRegistrationHandlers")
return nil
}
func (cm *FakeContainerManager) GetHealthCheckers() []healthz.HealthChecker {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPluginRegistrationServerChecker")
return []healthz.HealthChecker{}
}
func (cm *FakeContainerManager) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetDevicePluginResourceCapacity")
return nil, nil, []string{}
}
func (cm *FakeContainerManager) NewPodContainerManager() PodContainerManager {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "PodContainerManager")
return cm.PodContainerManager
}
func (cm *FakeContainerManager) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetResources")
return &kubecontainer.RunContainerOptions{}, nil
}
func (cm *FakeContainerManager) UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "UpdatePluginResources")
return nil
}
func (cm *FakeContainerManager) InternalContainerLifecycle() InternalContainerLifecycle {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "InternalContainerLifecycle")
return &internalContainerLifecycleImpl{cpumanager.NewFakeManager(), memorymanager.NewFakeManager(), topologymanager.NewFakeManager()}
}
func (cm *FakeContainerManager) GetPodCgroupRoot() string {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupRoot")
return ""
}
func (cm *FakeContainerManager) GetDevices(_, _ string) []*podresourcesapi.ContainerDevices {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetDevices")
return nil
}
func (cm *FakeContainerManager) GetAllocatableDevices() []*podresourcesapi.ContainerDevices {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetAllocatableDevices")
return nil
}
func (cm *FakeContainerManager) ShouldResetExtendedResourceCapacity() bool {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "ShouldResetExtendedResourceCapacity")
return cm.shouldResetExtendedResourceCapacity
}
func (cm *FakeContainerManager) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetAllocateResourcesPodAdmitHandler")
return topologymanager.NewFakeManager()
}
func (cm *FakeContainerManager) UpdateAllocatedDevices() {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "UpdateAllocatedDevices")
}
func (cm *FakeContainerManager) GetCPUs(_, _ string) []int64 {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetCPUs")
return nil
}
func (cm *FakeContainerManager) GetAllocatableCPUs() []int64 {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetAllocatableCPUs")
return nil
}
func (cm *FakeContainerManager) GetMemory(_, _ string) []*podresourcesapi.ContainerMemory {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetMemory")
return nil
}
func (cm *FakeContainerManager) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetAllocatableMemory")
return nil
}
func (cm *FakeContainerManager) GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource {
return nil
}
func (cm *FakeContainerManager) GetNodeAllocatableAbsolute() v1.ResourceList {
cm.Lock()
defer cm.Unlock()
return nil
}
func (cm *FakeContainerManager) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *FakeContainerManager) UnprepareDynamicResources(context.Context, *v1.Pod) error {
return nil
}
func (cm *FakeContainerManager) PodMightNeedToUnprepareResources(UID types.UID) bool {
return false
}
func (cm *FakeContainerManager) UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus) {
}
func (cm *FakeContainerManager) Updates() <-chan resourceupdates.Update {
return nil
}
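
The fake records every invocation by name in CalledFunctions, so tests can assert on which methods were exercised. A minimal, hypothetical test sketch (assuming a `testing` import within this package; the test name is illustrative):

func TestFakeContainerManagerRecordsCalls(t *testing.T) {
	fake := NewFakeContainerManager()
	if err := fake.UpdateQOSCgroups(); err != nil {
		t.Fatal(err)
	}
	fake.Lock()
	defer fake.Unlock()
	if len(fake.CalledFunctions) != 1 || fake.CalledFunctions[0] != "UpdateQOSCgroups" {
		t.Errorf("unexpected recorded calls: %v", fake.CalledFunctions)
	}
}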

View File

@ -0,0 +1,40 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func NewFakeInternalContainerLifecycle() *fakeInternalContainerLifecycle {
return &fakeInternalContainerLifecycle{}
}
type fakeInternalContainerLifecycle struct{}
func (f *fakeInternalContainerLifecycle) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
return nil
}
func (f *fakeInternalContainerLifecycle) PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
return nil
}
func (f *fakeInternalContainerLifecycle) PostStopContainer(containerID string) error {
return nil
}

View File

@ -0,0 +1,127 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"reflect"
"sync"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
type FakePodContainerManager struct {
sync.Mutex
CalledFunctions []string
Cgroups map[types.UID]CgroupName
}
var _ PodContainerManager = &FakePodContainerManager{}
func NewFakePodContainerManager() *FakePodContainerManager {
return &FakePodContainerManager{
Cgroups: make(map[types.UID]CgroupName),
}
}
func (m *FakePodContainerManager) AddPodFromCgroups(pod *kubecontainer.Pod) {
m.Lock()
defer m.Unlock()
m.Cgroups[pod.ID] = []string{pod.Name}
}
func (m *FakePodContainerManager) Exists(_ *v1.Pod) bool {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "Exists")
return true
}
func (m *FakePodContainerManager) EnsureExists(_ *v1.Pod) error {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "EnsureExists")
return nil
}
func (m *FakePodContainerManager) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "GetPodContainerName")
return nil, ""
}
func (m *FakePodContainerManager) Destroy(name CgroupName) error {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "Destroy")
for key, cgname := range m.Cgroups {
if reflect.DeepEqual(cgname, name) {
delete(m.Cgroups, key)
return nil
}
}
return nil
}
func (m *FakePodContainerManager) ReduceCPULimits(_ CgroupName) error {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "ReduceCPULimits")
return nil
}
func (m *FakePodContainerManager) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "GetAllPodsFromCgroups")
// return a copy for the race detector
grp := make(map[types.UID]CgroupName)
for key, value := range m.Cgroups {
grp[key] = value
}
return grp, nil
}
func (m *FakePodContainerManager) IsPodCgroup(cgroupfs string) (bool, types.UID) {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "IsPodCgroup")
return false, types.UID("")
}
func (cm *FakePodContainerManager) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupMemoryUsage")
return 0, nil
}
func (cm *FakePodContainerManager) GetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName) (*ResourceConfig, error) {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupConfig")
return nil, nil
}
func (cm *FakePodContainerManager) SetPodCgroupConfig(pod *v1.Pod, resourceConfig *ResourceConfig) error {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "SetPodCgroupConfig")
return nil
}
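
FakePodContainerManager tracks pod cgroups in its Cgroups map and hands back a copy from GetAllPodsFromCgroups to stay race-detector friendly. A hypothetical usage sketch (names and values are illustrative):

func TestFakePodContainerManagerTracksCgroups(t *testing.T) {
	m := NewFakePodContainerManager()
	m.AddPodFromCgroups(&kubecontainer.Pod{ID: "uid-1", Name: "pod-1"})
	pods, err := m.GetAllPodsFromCgroups()
	if err != nil {
		t.Fatal(err)
	}
	if _, ok := pods["uid-1"]; !ok {
		t.Errorf("expected a cgroup entry for uid-1, got %v", pods)
	}
	// Destroy compares CgroupName values (a []string) and deletes the match.
	if err := m.Destroy(CgroupName{"pod-1"}); err != nil {
		t.Fatal(err)
	}
}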

e2e/vendor/k8s.io/kubernetes/pkg/kubelet/cm/helpers.go generated vendored Normal file
View File

@ -0,0 +1,89 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"context"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
internalapi "k8s.io/cri-api/pkg/apis"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
// for typecheck across platforms
var _ func(int64, int64) int64 = MilliCPUToQuota
var _ func(int64) uint64 = MilliCPUToShares
var _ func(*v1.Pod, bool, uint64, bool) *ResourceConfig = ResourceConfigForPod
var _ func() (*CgroupSubsystems, error) = GetCgroupSubsystems
var _ func(string) ([]int, error) = getCgroupProcs
var _ func(types.UID) string = GetPodCgroupNameSuffix
var _ func(string, bool, string) string = NodeAllocatableRoot
var _ func(string) (string, error) = GetKubeletContainer
// hardEvictionReservation returns a resourcelist that includes reservation of resources based on hard eviction thresholds.
func hardEvictionReservation(thresholds []evictionapi.Threshold, capacity v1.ResourceList) v1.ResourceList {
if len(thresholds) == 0 {
return nil
}
ret := v1.ResourceList{}
for _, threshold := range thresholds {
if threshold.Operator != evictionapi.OpLessThan {
continue
}
switch threshold.Signal {
case evictionapi.SignalMemoryAvailable:
memoryCapacity := capacity[v1.ResourceMemory]
value := evictionapi.GetThresholdQuantity(threshold.Value, &memoryCapacity)
ret[v1.ResourceMemory] = *value
case evictionapi.SignalNodeFsAvailable:
storageCapacity := capacity[v1.ResourceEphemeralStorage]
value := evictionapi.GetThresholdQuantity(threshold.Value, &storageCapacity)
ret[v1.ResourceEphemeralStorage] = *value
}
}
return ret
}
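// For example (hypothetical thresholds): memory.available<500Mi reserves 500Mi
// of memory, and nodefs.available<10% on a 100Gi capacity reserves 10Gi of
// ephemeral storage, since GetThresholdQuantity resolves percentage thresholds
// against the supplied capacity.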
func buildContainerMapAndRunningSetFromRuntime(ctx context.Context, runtimeService internalapi.RuntimeService) (containermap.ContainerMap, sets.Set[string]) {
podSandboxMap := make(map[string]string)
podSandboxList, _ := runtimeService.ListPodSandbox(ctx, nil)
for _, p := range podSandboxList {
podSandboxMap[p.Id] = p.Metadata.Uid
}
runningSet := sets.New[string]()
containerMap := containermap.NewContainerMap()
containerList, _ := runtimeService.ListContainers(ctx, nil)
for _, c := range containerList {
if _, exists := podSandboxMap[c.PodSandboxId]; !exists {
klog.InfoS("No PodSandBox found for the container", "podSandboxId", c.PodSandboxId, "containerName", c.Metadata.Name, "containerId", c.Id)
continue
}
podUID := podSandboxMap[c.PodSandboxId]
containerMap.Add(podUID, c.Metadata.Name, c.Id)
if c.State == runtimeapi.ContainerState_CONTAINER_RUNNING {
klog.V(4).InfoS("Container reported running", "podSandboxId", c.PodSandboxId, "podUID", podUID, "containerName", c.Metadata.Name, "containerId", c.Id)
runningSet.Insert(c.Id)
}
}
return containerMap, runningSet
}

View File

@ -0,0 +1,343 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/resource"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/util"
)
const (
// These limits are defined in the kernel:
// https://github.com/torvalds/linux/blob/0bddd227f3dc55975e2b8dfa7fc6f959b062a2c7/kernel/sched/sched.h#L427-L428
MinShares = 2
MaxShares = 262144
SharesPerCPU = 1024
MilliCPUToCPU = 1000
// 100000 microseconds is equivalent to 100ms
QuotaPeriod = 100000
// 1000 microseconds is equivalent to 1ms
// defined here:
// https://github.com/torvalds/linux/blob/cac03ac368fabff0122853de2422d4e17a32de08/kernel/sched/core.c#L10546
MinQuotaPeriod = 1000
// From the inverse of the conversion in MilliCPUToQuota:
// MinQuotaPeriod * MilliCPUToCPU / QuotaPeriod
MinMilliCPULimit = 10
)
// MilliCPUToQuota converts milliCPU to CFS quota and period values.
// Input parameters and the resulting value are in microseconds.
func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) {
// CFS quota is measured in two values:
// - cfs_period_us=100ms (the window over which usage is measured, given by period)
// - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
// so in the above example, you are limited to 20% of a single CPU
// for multi-cpu environments, you just scale equivalent amounts
// see https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details
if milliCPU == 0 {
return
}
if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
period = QuotaPeriod
}
// we then convert your milliCPU to a value normalized over a period
quota = (milliCPU * period) / MilliCPUToCPU
// quota needs to be a minimum of 1ms.
if quota < MinQuotaPeriod {
quota = MinQuotaPeriod
}
return
}
// MilliCPUToShares converts the milliCPU to CFS shares.
func MilliCPUToShares(milliCPU int64) uint64 {
if milliCPU == 0 {
// Docker converts zero milliCPU to unset, which maps to kernel default
// for unset: 1024. Return 2 here to really match kernel default for
// zero milliCPU.
return MinShares
}
// Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored to improve rounding.
shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU
if shares < MinShares {
return MinShares
}
if shares > MaxShares {
return MaxShares
}
return uint64(shares)
}
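// Worked examples, following directly from the constants above:
//   250m CPU request -> shares = 250*1024/1000 = 256
//   500m CPU limit   -> quota = 500*100000/1000 = 50000us over a 100000us period,
//                       i.e. 50% of a single CPU
//   1m CPU limit     -> 100us, raised to the MinQuotaPeriod floor of 1000us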
// HugePageLimits converts the API representation to a map
// from huge page size (in bytes) to huge page limit (in bytes).
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
hugePageLimits := map[int64]int64{}
for k, v := range resourceList {
if v1helper.IsHugePageResourceName(k) {
pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
if value, exists := hugePageLimits[pageSize.Value()]; exists {
hugePageLimits[pageSize.Value()] = value + v.Value()
} else {
hugePageLimits[pageSize.Value()] = v.Value()
}
}
}
return hugePageLimits
}
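// For example, a pod requesting hugepages-2Mi: 512Mi yields
// map[2097152]536870912, with both the page size key and the limit in bytes.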
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(allocatedPod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
podLevelResourcesEnabled := utilfeature.DefaultFeatureGate.Enabled(kubefeatures.PodLevelResources)
// sum requests and limits.
reqs := resource.PodRequests(allocatedPod, resource.PodResourcesOptions{
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !podLevelResourcesEnabled,
UseStatusResources: false,
})
// track if limits were applied for each resource.
memoryLimitsDeclared := true
cpuLimitsDeclared := true
limits := resource.PodLimits(allocatedPod, resource.PodResourcesOptions{
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !podLevelResourcesEnabled,
ContainerFn: func(res v1.ResourceList, containerType resource.ContainerType) {
if res.Cpu().IsZero() {
cpuLimitsDeclared = false
}
if res.Memory().IsZero() {
memoryLimitsDeclared = false
}
},
})
if podLevelResourcesEnabled && resource.IsPodLevelResourcesSet(allocatedPod) {
if !allocatedPod.Spec.Resources.Limits.Cpu().IsZero() {
cpuLimitsDeclared = true
}
if !allocatedPod.Spec.Resources.Limits.Memory().IsZero() {
memoryLimitsDeclared = true
}
}
// map hugepage pagesize (bytes) to limits (bytes)
hugePageLimits := HugePageLimits(reqs)
cpuRequests := int64(0)
cpuLimits := int64(0)
memoryLimits := int64(0)
if request, found := reqs[v1.ResourceCPU]; found {
cpuRequests = request.MilliValue()
}
if limit, found := limits[v1.ResourceCPU]; found {
cpuLimits = limit.MilliValue()
}
if limit, found := limits[v1.ResourceMemory]; found {
memoryLimits = limit.Value()
}
// convert to CFS values
cpuShares := MilliCPUToShares(cpuRequests)
cpuQuota := MilliCPUToQuota(cpuLimits, int64(cpuPeriod))
// quota is not capped when cfs quota is disabled
if !enforceCPULimits {
cpuQuota = int64(-1)
}
// determine the qos class
qosClass := v1qos.GetPodQOS(allocatedPod)
// build the result
result := &ResourceConfig{}
if qosClass == v1.PodQOSGuaranteed {
result.CPUShares = &cpuShares
result.CPUQuota = &cpuQuota
result.CPUPeriod = &cpuPeriod
result.Memory = &memoryLimits
} else if qosClass == v1.PodQOSBurstable {
result.CPUShares = &cpuShares
if cpuLimitsDeclared {
result.CPUQuota = &cpuQuota
result.CPUPeriod = &cpuPeriod
}
if memoryLimitsDeclared {
result.Memory = &memoryLimits
}
} else {
shares := uint64(MinShares)
result.CPUShares = &shares
}
result.HugePageLimit = hugePageLimits
if enforceMemoryQoS {
memoryMin := int64(0)
if request, found := reqs[v1.ResourceMemory]; found {
memoryMin = request.Value()
}
if memoryMin > 0 {
result.Unified = map[string]string{
Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
}
}
}
return result
}
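// To illustrate with a hypothetical pod: a Burstable pod with a 250m CPU
// request, no CPU limit, and a 512Mi memory limit on its only container gets
// CPUShares=256 and Memory=536870912, while CPUQuota and CPUPeriod stay unset
// because cpuLimitsDeclared is false; a BestEffort pod gets only
// CPUShares=MinShares.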
// getCgroupSubsystemsV1 returns information about the mounted cgroup v1 subsystems
func getCgroupSubsystemsV1() (*CgroupSubsystems, error) {
// get all cgroup mounts.
allCgroups, err := libcontainercgroups.GetCgroupMounts(true)
if err != nil {
return &CgroupSubsystems{}, err
}
if len(allCgroups) == 0 {
return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts")
}
mountPoints := make(map[string]string, len(allCgroups))
for _, mount := range allCgroups {
// Previously the kubelet used an arbitrary mount point per cgroup subsystem;
// now it deterministically uses the mount point with the shortest path;
// a clearer long-term expectation is being worked out in the referenced issue.
// ref. issue: https://github.com/kubernetes/kubernetes/issues/95488
for _, subsystem := range mount.Subsystems {
previous := mountPoints[subsystem]
if previous == "" || len(mount.Mountpoint) < len(previous) {
mountPoints[subsystem] = mount.Mountpoint
}
}
}
return &CgroupSubsystems{
Mounts: allCgroups,
MountPoints: mountPoints,
}, nil
}
// getCgroupSubsystemsV2 returns information about the enabled cgroup v2 subsystems
func getCgroupSubsystemsV2() (*CgroupSubsystems, error) {
controllers, err := libcontainercgroups.GetAllSubsystems()
if err != nil {
return nil, err
}
mounts := []libcontainercgroups.Mount{}
mountPoints := make(map[string]string, len(controllers))
for _, controller := range controllers {
mountPoints[controller] = util.CgroupRoot
m := libcontainercgroups.Mount{
Mountpoint: util.CgroupRoot,
Root: util.CgroupRoot,
Subsystems: []string{controller},
}
mounts = append(mounts, m)
}
return &CgroupSubsystems{
Mounts: mounts,
MountPoints: mountPoints,
}, nil
}
// GetCgroupSubsystems returns information about the mounted cgroup subsystems
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
if libcontainercgroups.IsCgroup2UnifiedMode() {
return getCgroupSubsystemsV2()
}
return getCgroupSubsystemsV1()
}
// getCgroupProcs takes a cgroup directory name as an argument
// reads through the cgroup's procs file and returns a list of tgids.
// It returns an empty list if the procs file does not exist.
func getCgroupProcs(dir string) ([]int, error) {
procsFile := filepath.Join(dir, "cgroup.procs")
f, err := os.Open(procsFile)
if err != nil {
if os.IsNotExist(err) {
// The procsFile does not exist, so no pids are attached to this directory
return []int{}, nil
}
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
out := []int{}
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
if err != nil {
return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err)
}
out = append(out, pid)
}
}
return out, nil
}
// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
func GetPodCgroupNameSuffix(podUID types.UID) string {
return podCgroupNamePrefix + string(podUID)
}
// NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup
func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string {
nodeAllocatableRoot := ParseCgroupfsToCgroupName(cgroupRoot)
if cgroupsPerQOS {
nodeAllocatableRoot = NewCgroupName(nodeAllocatableRoot, defaultNodeAllocatableCgroupName)
}
if cgroupDriver == "systemd" {
return nodeAllocatableRoot.ToSystemd()
}
return nodeAllocatableRoot.ToCgroupfs()
}
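// For example, with cgroupRoot "/" and cgroupsPerQOS enabled, the node
// allocatable cgroup ("kubepods" by default) renders as "/kubepods" for the
// cgroupfs driver and "/kubepods.slice" for the systemd driver.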
// GetKubeletContainer returns the cgroup the kubelet will use
func GetKubeletContainer(kubeletCgroups string) (string, error) {
if kubeletCgroups == "" {
cont, err := getContainer(os.Getpid())
if err != nil {
return "", err
}
return cont, nil
}
return kubeletCgroups, nil
}

View File

@ -0,0 +1,76 @@
//go:build !linux
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
const (
MinShares = 0
MaxShares = 0
SharesPerCPU = 0
MilliCPUToCPU = 0
QuotaPeriod = 0
MinQuotaPeriod = 0
MinMilliCPULimit = 0
)
// MilliCPUToQuota converts milliCPU and period to CFS quota values.
func MilliCPUToQuota(milliCPU, period int64) int64 {
return 0
}
// MilliCPUToShares converts the milliCPU to CFS shares.
func MilliCPUToShares(milliCPU int64) uint64 {
return 0
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
return nil
}
// GetCgroupSubsystems returns information about the mounted cgroup subsystems
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
return nil, nil
}
func getCgroupProcs(dir string) ([]int, error) {
return nil, nil
}
// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
func GetPodCgroupNameSuffix(podUID types.UID) string {
return ""
}
// NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup
func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string {
return ""
}
// GetKubeletContainer returns the cgroup the kubelet will use
func GetKubeletContainer(kubeletCgroups string) (string, error) {
return "", nil
}

View File

@ -0,0 +1,56 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
type InternalContainerLifecycle interface {
PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error
PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error
PostStopContainer(containerID string) error
}
// Implements InternalContainerLifecycle interface.
type internalContainerLifecycleImpl struct {
cpuManager cpumanager.Manager
memoryManager memorymanager.Manager
topologyManager topologymanager.Manager
}
func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
if i.cpuManager != nil {
i.cpuManager.AddContainer(pod, container, containerID)
}
if i.memoryManager != nil {
i.memoryManager.AddContainer(pod, container, containerID)
}
i.topologyManager.AddContainer(pod, container, containerID)
return nil
}
func (i *internalContainerLifecycleImpl) PostStopContainer(containerID string) error {
return i.topologyManager.RemoveContainer(containerID)
}

View File

@ -0,0 +1,51 @@
//go:build linux
// +build linux
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"strconv"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
if i.cpuManager != nil {
allocatedCPUs := i.cpuManager.GetCPUAffinity(string(pod.UID), container.Name)
if !allocatedCPUs.IsEmpty() {
containerConfig.Linux.Resources.CpusetCpus = allocatedCPUs.String()
}
}
if i.memoryManager != nil {
numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container)
if numaNodes.Len() > 0 {
var affinity []string
for _, numaNode := range sets.List(numaNodes) {
affinity = append(affinity, strconv.Itoa(numaNode))
}
containerConfig.Linux.Resources.CpusetMems = strings.Join(affinity, ",")
}
}
return nil
}
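// For example (hypothetical allocations): CPUs 0-3 from the CPU manager
// produce CpusetCpus "0-3", and NUMA nodes {0,1} from the memory manager
// produce CpusetMems "0,1".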

View File

@ -0,0 +1,29 @@
//go:build !linux && !windows
// +build !linux,!windows
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
return nil
}

View File

@ -0,0 +1,141 @@
//go:build windows
// +build windows
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/winstats"
"k8s.io/utils/cpuset"
)
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
return nil
}
klog.V(4).Info("PreCreateContainer for Windows")
// retrieve CPU and NUMA affinity from CPU Manager and Memory Manager (if enabled)
var allocatedCPUs cpuset.CPUSet
if i.cpuManager != nil {
allocatedCPUs = i.cpuManager.GetCPUAffinity(string(pod.UID), container.Name)
}
var numaNodes sets.Set[int]
if i.memoryManager != nil {
numaNodes = i.memoryManager.GetMemoryNUMANodes(pod, container)
}
// Gather all CPUs associated with the selected NUMA nodes
var allNumaNodeCPUs []winstats.GroupAffinity
for _, numaNode := range sets.List(numaNodes) {
affinity, err := winstats.GetCPUsforNUMANode(uint16(numaNode))
if err != nil {
return fmt.Errorf("failed to get CPUs for NUMA node %d: %v", numaNode, err)
}
allNumaNodeCPUs = append(allNumaNodeCPUs, *affinity)
}
var finalCPUSet = computeFinalCpuSet(allocatedCPUs, allNumaNodeCPUs)
klog.V(4).InfoS("Setting CPU affinity", "affinity", finalCPUSet, "container", container.Name, "pod", pod.UID)
// Set CPU group affinities in the container config
if finalCPUSet != nil {
var cpusToGroupAffinities []*runtimeapi.WindowsCpuGroupAffinity
for group, mask := range groupMasks(finalCPUSet) {
cpusToGroupAffinities = append(cpusToGroupAffinities, &runtimeapi.WindowsCpuGroupAffinity{
CpuGroup: uint32(group),
CpuMask: uint64(mask),
})
}
containerConfig.Windows.Resources.AffinityCpus = cpusToGroupAffinities
}
// return nil if no CPUs were selected
return nil
}
// computeFinalCpuSet determines the final set of CPUs to use based on the CPU and memory managers
// and is extracted so that it can be tested
func computeFinalCpuSet(allocatedCPUs cpuset.CPUSet, allNumaNodeCPUs []winstats.GroupAffinity) sets.Set[int] {
if !allocatedCPUs.IsEmpty() && len(allNumaNodeCPUs) > 0 {
// Both CPU and memory managers are enabled
numaNodeAffinityCPUSet := computeCPUSet(allNumaNodeCPUs)
cpuManagerAffinityCPUSet := sets.New[int](allocatedCPUs.List()...)
// Determine which set of CPUs to use using the following logic outlined in the KEP:
// Case 1: CPU manager selects more CPUs than those available in the NUMA nodes selected by the memory manager
// Case 2: CPU manager selects fewer CPUs, and they all fall within the CPUs available in the NUMA nodes selected by the memory manager
// Case 3: CPU manager selects fewer CPUs, but some are outside of the CPUs available in the NUMA nodes selected by the memory manager
if cpuManagerAffinityCPUSet.Len() > numaNodeAffinityCPUSet.Len() {
// Case 1, use CPU manager selected CPUs
return cpuManagerAffinityCPUSet
} else if numaNodeAffinityCPUSet.IsSuperset(cpuManagerAffinityCPUSet) {
// Case 2, use CPU manager selected CPUs
return cpuManagerAffinityCPUSet
} else {
// Case 3, merge CPU manager and memory manager selected CPUs
return cpuManagerAffinityCPUSet.Union(numaNodeAffinityCPUSet)
}
} else if !allocatedCPUs.IsEmpty() {
// Only CPU manager is enabled, use CPU manager selected CPUs
return sets.New[int](allocatedCPUs.List()...)
} else if len(allNumaNodeCPUs) > 0 {
// Only memory manager is enabled, use CPUs associated with selected NUMA nodes
return computeCPUSet(allNumaNodeCPUs)
}
return nil
}
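// Worked example with hypothetical IDs: if the CPU manager picked {0,1} and
// the memory manager's NUMA nodes cover {0,1,2,3}, case 2 applies and {0,1}
// wins; if the CPU manager picked {0,1,4,5} instead, case 3 merges the sets
// into {0,1,2,3,4,5}.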
// computeCPUSet converts a list of GroupAffinity to a set of CPU IDs
func computeCPUSet(affinities []winstats.GroupAffinity) sets.Set[int] {
cpuSet := sets.New[int]()
for _, affinity := range affinities {
for i := 0; i < 64; i++ {
if (affinity.Mask>>i)&1 == 1 {
cpuID := int(affinity.Group)*64 + i
cpuSet.Insert(cpuID)
}
}
}
return cpuSet
}
// groupMasks converts a set of CPU IDs into group and mask representations
func groupMasks(cpuSet sets.Set[int]) map[int]uint64 {
groupMasks := make(map[int]uint64)
for cpu := range cpuSet {
group := cpu / 64
mask := uint64(1) << (cpu % 64)
groupMasks[group] |= mask
}
return groupMasks
}
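
Windows processor groups hold at most 64 logical CPUs, so a CPU ID maps to group cpu/64 and bit cpu%64 within that group's mask. A self-contained sketch of the same arithmetic, with assumed CPU IDs and independent of this package:

package main

import "fmt"

func main() {
	cpus := []int{1, 3, 70} // hypothetical logical CPU IDs
	masks := map[int]uint64{}
	for _, cpu := range cpus {
		masks[cpu/64] |= uint64(1) << (cpu % 64)
	}
	// CPUs 1 and 3 set bits 1 and 3 of group 0; CPU 70 sets bit 6 of group 1.
	fmt.Printf("group 0: %#x, group 1: %#x\n", masks[0], masks[1]) // 0xa, 0x40
}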

View File

@ -0,0 +1,94 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
)
type fakeManager struct {
state state.State
}
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
klog.InfoS("Start()")
return nil
}
func (m *fakeManager) Policy() Policy {
klog.InfoS("Policy()")
return NewPolicyNone()
}
func (m *fakeManager) Allocate(pod *v1.Pod, container *v1.Container) error {
klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
klog.InfoS("Add container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID)
}
func (m *fakeManager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Set[int] {
klog.InfoS("Get MemoryNUMANodes", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
func (m *fakeManager) RemoveContainer(containerID string) error {
klog.InfoS("RemoveContainer", "containerID", containerID)
return nil
}
func (m *fakeManager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
klog.InfoS("Get Topology Hints", "pod", klog.KObj(pod), "containerName", container.Name)
return map[string][]topologymanager.TopologyHint{}
}
func (m *fakeManager) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
klog.InfoS("Get Pod Topology Hints", "pod", klog.KObj(pod))
return map[string][]topologymanager.TopologyHint{}
}
func (m *fakeManager) State() state.Reader {
return m.state
}
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
func (m *fakeManager) GetAllocatableMemory() []state.Block {
klog.InfoS("Get Allocatable Memory")
return []state.Block{}
}
// GetMemory returns the memory allocated by a container from NUMA nodes
func (m *fakeManager) GetMemory(podUID, containerName string) []state.Block {
klog.InfoS("Get Memory", "podUID", podUID, "containerName", containerName)
return []state.Block{}
}
// NewFakeManager creates an empty/fake memory manager
func NewFakeManager() Manager {
return &fakeManager{
state: state.NewMemoryState(),
}
}

View File

@ -0,0 +1,467 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
"context"
"fmt"
"runtime"
"sync"
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
corev1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
)
// memoryManagerStateFileName is the file name where memory manager stores its state
const memoryManagerStateFileName = "memory_manager_state"
// ActivePodsFunc is a function that returns a list of active pods
type ActivePodsFunc func() []*v1.Pod
type runtimeService interface {
UpdateContainerResources(ctx context.Context, id string, resources *runtimeapi.ContainerResources) error
}
type sourcesReadyStub struct{}
func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// Manager interface provides methods for Kubelet to manage pod memory.
type Manager interface {
// Start is called during Kubelet initialization.
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error
// AddContainer adds the mapping between container ID to pod UID and the container name
// The mapping used to remove the memory allocation during the container removal
AddContainer(p *v1.Pod, c *v1.Container, containerID string)
// Allocate is called to pre-allocate memory resources during Pod admission.
// This must be called at some point prior to the AddContainer() call for a container, e.g. at pod admission time.
Allocate(pod *v1.Pod, container *v1.Container) error
// RemoveContainer is called after Kubelet decides to kill or delete a
// container. After this call, any memory allocated to the container is freed.
RemoveContainer(containerID string) error
// State returns a read-only interface to the internal memory manager state.
State() state.Reader
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetTopologyHints(*v1.Pod, *v1.Container) map[string][]topologymanager.TopologyHint
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetPodTopologyHints(*v1.Pod) map[string][]topologymanager.TopologyHint
// GetMemoryNUMANodes provides NUMA nodes that are used to allocate the container memory
GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Set[int]
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
GetAllocatableMemory() []state.Block
// GetMemory returns the memory allocated by a container from NUMA nodes
GetMemory(podUID, containerName string) []state.Block
}
type manager struct {
sync.Mutex
policy Policy
// state allows restoring information about memory allocation for guaranteed pods
// in the case of a kubelet restart
state state.State
// containerRuntime is the container runtime service interface needed
// to make UpdateContainerResources() calls against the containers.
containerRuntime runtimeService
// activePods is a method for listing active pods on the node
// so all the containers can be updated during call to the removeStaleState.
activePods ActivePodsFunc
// podStatusProvider provides a method for obtaining pod statuses
// and the containerID of their containers
podStatusProvider status.PodStatusProvider
// containerMap provides a mapping from (pod, container) -> containerID
// for all containers in a pod
containerMap containermap.ContainerMap
// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
// We use it to determine when we can purge inactive pods from checkpointed state.
sourcesReady config.SourcesReady
// stateFileDirectory holds the directory where the state file for checkpoints is held.
stateFileDirectory string
// allocatableMemory holds the allocatable memory for each NUMA node
allocatableMemory []state.Block
}
var _ Manager = &manager{}
// NewManager returns new instance of the memory manager
func NewManager(policyName string, machineInfo *cadvisorapi.MachineInfo, nodeAllocatableReservation v1.ResourceList, reservedMemory []kubeletconfig.MemoryReservation, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) {
var policy Policy
switch policyType(policyName) {
case policyTypeNone:
policy = NewPolicyNone()
case policyTypeStatic:
if runtime.GOOS == "windows" {
return nil, fmt.Errorf("policy %q is not available on Windows", policyTypeStatic)
}
systemReserved, err := getSystemReservedMemory(machineInfo, nodeAllocatableReservation, reservedMemory)
if err != nil {
return nil, err
}
policy, err = NewPolicyStatic(machineInfo, systemReserved, affinity)
if err != nil {
return nil, err
}
case policyTypeBestEffort:
if runtime.GOOS == "windows" {
systemReserved, err := getSystemReservedMemory(machineInfo, nodeAllocatableReservation, reservedMemory)
if err != nil {
return nil, err
}
policy, err = NewPolicyBestEffort(machineInfo, systemReserved, affinity)
if err != nil {
return nil, err
}
} else {
return nil, fmt.Errorf("policy %q is not available for platform %q", policyTypeBestEffort, runtime.GOOS)
}
default:
return nil, fmt.Errorf("unknown policy: %q", policyName)
}
manager := &manager{
policy: policy,
stateFileDirectory: stateFileDirectory,
}
manager.sourcesReady = &sourcesReadyStub{}
return manager, nil
}
// Start starts the memory manager under the kubelet and calls policy start
func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
klog.InfoS("Starting memorymanager", "policy", m.policy.Name())
m.sourcesReady = sourcesReady
m.activePods = activePods
m.podStatusProvider = podStatusProvider
m.containerRuntime = containerRuntime
m.containerMap = initialContainers
stateImpl, err := state.NewCheckpointState(m.stateFileDirectory, memoryManagerStateFileName, m.policy.Name())
if err != nil {
klog.ErrorS(err, "Could not initialize checkpoint manager, please drain node and remove policy state file")
return err
}
m.state = stateImpl
err = m.policy.Start(m.state)
if err != nil {
klog.ErrorS(err, "Policy start error")
return err
}
m.allocatableMemory = m.policy.GetAllocatableMemory(m.state)
return nil
}
// AddContainer saves the value of requested memory for the guaranteed pod under the state and sets memory affinity according to the topology manager
func (m *manager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
m.Lock()
defer m.Unlock()
m.containerMap.Add(string(pod.UID), container.Name, containerID)
// Since we know that each init container always runs to completion before
// the next container starts, we can safely remove references to any previously
// started init containers. This will free up the memory from these init containers
// for use in other pods. If the current container happens to be an init container,
// we skip deletion of it until the next container is added, and this is called again.
for _, initContainer := range pod.Spec.InitContainers {
if initContainer.Name == container.Name {
break
}
// Since a restartable init container remains running for the full
// duration of the pod's lifecycle, we should not remove it from the
// memory manager state.
if podutil.IsRestartableInitContainer(&initContainer) {
continue
}
m.policyRemoveContainerByRef(string(pod.UID), initContainer.Name)
}
}
// GetMemoryNUMANodes provides the NUMA nodes that are used to allocate the container memory
func (m *manager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Set[int] {
// Get NUMA node affinity of blocks assigned to the container during Allocate()
numaNodes := sets.New[int]()
for _, block := range m.state.GetMemoryBlocks(string(pod.UID), container.Name) {
for _, nodeID := range block.NUMAAffinity {
// avoid node duplication when hugepages and memory blocks are pinned to the same NUMA node
numaNodes.Insert(nodeID)
}
}
if numaNodes.Len() == 0 {
klog.V(5).InfoS("No allocation is available", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
klog.InfoS("Memory affinity", "pod", klog.KObj(pod), "containerName", container.Name, "numaNodes", numaNodes)
return numaNodes
}
// Allocate is called to pre-allocate memory resources during Pod admission.
func (m *manager) Allocate(pod *v1.Pod, container *v1.Container) error {
// Garbage collect any stranded resources before allocation
m.removeStaleState()
m.Lock()
defer m.Unlock()
// Call down into the policy to assign this container memory if required.
if err := m.policy.Allocate(m.state, pod, container); err != nil {
klog.ErrorS(err, "Allocate error")
return err
}
return nil
}
// RemoveContainer removes the container from the state
func (m *manager) RemoveContainer(containerID string) error {
m.Lock()
defer m.Unlock()
// if an error occurs, the container entry already does not exist in the container map
podUID, containerName, err := m.containerMap.GetContainerRef(containerID)
if err != nil {
klog.InfoS("Failed to get container from container map", "containerID", containerID, "err", err)
return nil
}
m.policyRemoveContainerByRef(podUID, containerName)
return nil
}
// State returns the state of the manager
func (m *manager) State() state.Reader {
return m.state
}
// GetPodTopologyHints returns the topology hints for the topology manager
func (m *manager) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
// Delegate to active policy
return m.policy.GetPodTopologyHints(m.state, pod)
}
// GetTopologyHints returns the topology hints for the topology manager
func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
// Delegate to active policy
return m.policy.GetTopologyHints(m.state, pod, container)
}
// TODO: move the method to the upper level, to re-use it under the CPU and memory managers
func (m *manager) removeStaleState() {
// Only once all sources are ready do we attempt to remove any stale state.
// This ensures that the call to `m.activePods()` below will succeed with
// the actual active pods list.
if !m.sourcesReady.AllReady() {
return
}
// We grab the lock to ensure that no new containers will grab memory blocks while
// executing the code below. Without this lock, it's possible that we end up
// removing state that is newly added by an asynchronous call to
// AddContainer() during the execution of this code.
m.Lock()
defer m.Unlock()
// Get the list of active pods.
activePods := m.activePods()
// Build a list of (podUID, containerName) pairs for all containers in all active Pods.
activeContainers := make(map[string]map[string]struct{})
for _, pod := range activePods {
activeContainers[string(pod.UID)] = make(map[string]struct{})
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
activeContainers[string(pod.UID)][container.Name] = struct{}{}
}
}
// Loop through the MemoryManager state. Remove any state for containers not
// in the `activeContainers` list built above.
assignments := m.state.GetMemoryAssignments()
for podUID := range assignments {
for containerName := range assignments[podUID] {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
}
}
m.containerMap.Visit(func(podUID, containerName, containerID string) {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
})
}
func (m *manager) policyRemoveContainerByRef(podUID string, containerName string) {
m.policy.RemoveContainer(m.state, podUID, containerName)
m.containerMap.RemoveByContainerRef(podUID, containerName)
}
func getTotalMemoryTypeReserved(machineInfo *cadvisorapi.MachineInfo, reservedMemory []kubeletconfig.MemoryReservation) (map[v1.ResourceName]resource.Quantity, error) {
totalMemoryType := map[v1.ResourceName]resource.Quantity{}
numaNodes := map[int]bool{}
for _, numaNode := range machineInfo.Topology {
numaNodes[numaNode.Id] = true
}
for _, reservation := range reservedMemory {
if !numaNodes[int(reservation.NumaNode)] {
return nil, fmt.Errorf("the reserved memory configuration references a NUMA node %d that does not exist on this machine", reservation.NumaNode)
}
for resourceName, q := range reservation.Limits {
if value, ok := totalMemoryType[resourceName]; ok {
q.Add(value)
}
totalMemoryType[resourceName] = q
}
}
return totalMemoryType, nil
}
func validateReservedMemory(machineInfo *cadvisorapi.MachineInfo, nodeAllocatableReservation v1.ResourceList, reservedMemory []kubeletconfig.MemoryReservation) error {
totalMemoryType, err := getTotalMemoryTypeReserved(machineInfo, reservedMemory)
if err != nil {
return err
}
commonMemoryTypeSet := make(map[v1.ResourceName]bool)
for resourceType := range totalMemoryType {
commonMemoryTypeSet[resourceType] = true
}
for resourceType := range nodeAllocatableReservation {
if !(corev1helper.IsHugePageResourceName(resourceType) || resourceType == v1.ResourceMemory) {
continue
}
commonMemoryTypeSet[resourceType] = true
}
for resourceType := range commonMemoryTypeSet {
nodeAllocatableMemory := resource.NewQuantity(0, resource.DecimalSI)
if memValue, set := nodeAllocatableReservation[resourceType]; set {
nodeAllocatableMemory.Add(memValue)
}
reservedMemory := resource.NewQuantity(0, resource.DecimalSI)
if memValue, set := totalMemoryType[resourceType]; set {
reservedMemory.Add(memValue)
}
if !(*nodeAllocatableMemory).Equal(*reservedMemory) {
return fmt.Errorf("the total amount %q of type %q is not equal to the value %q determined by Node Allocatable feature", reservedMemory.String(), resourceType, nodeAllocatableMemory.String())
}
}
return nil
}
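// For example (hypothetical configuration): --reserved-memory totaling 1Gi of
// memory must match a Node Allocatable memory reservation (kube-reserved +
// system-reserved + hard eviction threshold) that also totals 1Gi; any
// mismatch between the two sums fails validation.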
func convertReserved(machineInfo *cadvisorapi.MachineInfo, reservedMemory []kubeletconfig.MemoryReservation) (systemReservedMemory, error) {
reservedMemoryConverted := make(map[int]map[v1.ResourceName]uint64)
for _, node := range machineInfo.Topology {
reservedMemoryConverted[node.Id] = make(map[v1.ResourceName]uint64)
}
for _, reservation := range reservedMemory {
for resourceName, q := range reservation.Limits {
val, success := q.AsInt64()
if !success {
return nil, fmt.Errorf("could not covert a variable of type Quantity to int64")
}
reservedMemoryConverted[int(reservation.NumaNode)][resourceName] = uint64(val)
}
}
return reservedMemoryConverted, nil
}
func getSystemReservedMemory(machineInfo *cadvisorapi.MachineInfo, nodeAllocatableReservation v1.ResourceList, reservedMemory []kubeletconfig.MemoryReservation) (systemReservedMemory, error) {
if err := validateReservedMemory(machineInfo, nodeAllocatableReservation, reservedMemory); err != nil {
return nil, err
}
reservedMemoryConverted, err := convertReserved(machineInfo, reservedMemory)
if err != nil {
return nil, err
}
return reservedMemoryConverted, nil
}
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
func (m *manager) GetAllocatableMemory() []state.Block {
return m.allocatableMemory
}
// GetMemory returns the memory allocated by a container from NUMA nodes
func (m *manager) GetMemory(podUID, containerName string) []state.Block {
return m.state.GetMemoryBlocks(podUID, containerName)
}

View File

@ -0,0 +1,46 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
// Type defines the policy type
type policyType string
// Policy implements the logic for assigning memory to pod containers.
type Policy interface {
Name() string
Start(s state.State) error
// Allocate call is idempotent
Allocate(s state.State, pod *v1.Pod, container *v1.Container) error
// RemoveContainer call is idempotent
RemoveContainer(s state.State, podUID string, containerName string)
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
GetAllocatableMemory(s state.State) []state.Block
}

View File

@ -0,0 +1,80 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
// On Windows we want to use the same logic as the StaticPolicy to compute the memory topology hints,
// but unlike Linux-based systems, NUMA nodes cannot be directly assigned or guaranteed via Windows APIs
// (the Windows scheduler uses the NUMA node closest to the assigned CPU, thereby respecting the NUMA node assignment as a best effort).
// Because of this we don't want users to specify "StaticPolicy" for the memory manager policy via the kubelet
// configuration. Instead we use the "BestEffort" policy, which reuses the StaticPolicy logic and so reduces code duplication.
const policyTypeBestEffort policyType = "BestEffort"
// bestEffortPolicy is an implementation of the Policy interface for the BestEffort policy
type bestEffortPolicy struct {
static *staticPolicy
}
var _ Policy = &bestEffortPolicy{}
func NewPolicyBestEffort(machineInfo *cadvisorapi.MachineInfo, reserved systemReservedMemory, affinity topologymanager.Store) (Policy, error) {
p, err := NewPolicyStatic(machineInfo, reserved, affinity)
if err != nil {
return nil, err
}
return &bestEffortPolicy{
static: p.(*staticPolicy),
}, nil
}
func (p *bestEffortPolicy) Name() string {
return string(policyTypeBestEffort)
}
func (p *bestEffortPolicy) Start(s state.State) error {
return p.static.Start(s)
}
func (p *bestEffortPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
return p.static.Allocate(s, pod, container)
}
func (p *bestEffortPolicy) RemoveContainer(s state.State, podUID string, containerName string) {
p.static.RemoveContainer(s, podUID, containerName)
}
func (p *bestEffortPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
return p.static.GetPodTopologyHints(s, pod)
}
func (p *bestEffortPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
return p.static.GetTopologyHints(s, pod, container)
}
func (p *bestEffortPolicy) GetAllocatableMemory(s state.State) []state.Block {
return p.static.GetAllocatableMemory(s)
}

View File

@ -0,0 +1,72 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
const policyTypeNone policyType = "None"
// none is an implementation of the Policy interface for the None policy; using the None
// policy is the same as disabling memory management
type none struct{}
var _ Policy = &none{}
// NewPolicyNone returns new none policy instance
func NewPolicyNone() Policy {
return &none{}
}
func (p *none) Name() string {
return string(policyTypeNone)
}
func (p *none) Start(s state.State) error {
return nil
}
// Allocate call is idempotent
func (p *none) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
return nil
}
// RemoveContainer call is idempotent
func (p *none) RemoveContainer(s state.State, podUID string, containerName string) {
}
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
func (p *none) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
return nil
}
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
func (p *none) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
return nil
}
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
func (p *none) GetAllocatableMemory(s state.State) []state.Block {
return []state.Block{}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,65 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"encoding/json"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)
var _ checkpointmanager.Checkpoint = &MemoryManagerCheckpoint{}
// MemoryManagerCheckpoint struct is used to store memory/pod assignments in a checkpoint
type MemoryManagerCheckpoint struct {
PolicyName string `json:"policyName"`
MachineState NUMANodeMap `json:"machineState"`
Entries ContainerMemoryAssignments `json:"entries,omitempty"`
Checksum checksum.Checksum `json:"checksum"`
}
// NewMemoryManagerCheckpoint returns an instance of Checkpoint
func NewMemoryManagerCheckpoint() *MemoryManagerCheckpoint {
//nolint:staticcheck // unexported-type-in-api user-facing error message
return &MemoryManagerCheckpoint{
Entries: ContainerMemoryAssignments{},
MachineState: NUMANodeMap{},
}
}
// MarshalCheckpoint returns marshalled checkpoint
func (mp *MemoryManagerCheckpoint) MarshalCheckpoint() ([]byte, error) {
// make sure checksum wasn't set before so it doesn't affect output checksum
mp.Checksum = 0
mp.Checksum = checksum.New(mp)
return json.Marshal(*mp)
}
// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint
func (mp *MemoryManagerCheckpoint) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, mp)
}
// VerifyChecksum verifies that current checksum of checkpoint is valid
func (mp *MemoryManagerCheckpoint) VerifyChecksum() error {
ck := mp.Checksum
mp.Checksum = 0
err := ck.Verify(mp)
mp.Checksum = ck
return err
}
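// Illustrative sketch, not part of the vendored file: MarshalCheckpoint
// zeroes the checksum before recomputing it, so a marshalled blob can be
// restored into a fresh struct and verified afterwards. The policy name is
// an assumption for illustration only.
func checkpointRoundTripExample() error {
	cp := NewMemoryManagerCheckpoint()
	cp.PolicyName = "Static" // assumed policy name
	blob, err := cp.MarshalCheckpoint()
	if err != nil {
		return err
	}
	restored := NewMemoryManagerCheckpoint()
	if err := restored.UnmarshalCheckpoint(blob); err != nil {
		return err
	}
	// VerifyChecksum recomputes the checksum with the field zeroed and
	// compares it to the stored value; it returns nil for an intact blob.
	return restored.VerifyChecksum()
}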

View File

@ -0,0 +1,130 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
v1 "k8s.io/api/core/v1"
)
// MemoryTable contains memory information
type MemoryTable struct {
TotalMemSize uint64 `json:"total"`
SystemReserved uint64 `json:"systemReserved"`
Allocatable uint64 `json:"allocatable"`
Reserved uint64 `json:"reserved"`
Free uint64 `json:"free"`
}
// NUMANodeState contains NUMA node related information
type NUMANodeState struct {
// NumberOfAssignments contains the number of memory assignments from this node
// When a container requires both memory and hugepages it will increase the number of assignments by two
NumberOfAssignments int `json:"numberOfAssignments"`
// MemoryTable contains NUMA node memory related information
MemoryMap map[v1.ResourceName]*MemoryTable `json:"memoryMap"`
// Cells contains the current NUMA node and all other nodes that are in a group with the current NUMA node
// This parameter indicates whether the current node is used for a multi NUMA node memory allocation
// For example, if some container is pinned to NUMA nodes 0,1,2, each of those nodes will have
// this parameter equal to [0, 1, 2] under the state
Cells []int `json:"cells"`
}
// NUMANodeMap contains memory information for each NUMA node.
type NUMANodeMap map[int]*NUMANodeState
// Clone returns a copy of NUMANodeMap
func (nm NUMANodeMap) Clone() NUMANodeMap {
clone := make(NUMANodeMap)
for node, s := range nm {
if s == nil {
clone[node] = nil
continue
}
clone[node] = &NUMANodeState{}
clone[node].NumberOfAssignments = s.NumberOfAssignments
clone[node].Cells = append([]int{}, s.Cells...)
if s.MemoryMap == nil {
continue
}
clone[node].MemoryMap = map[v1.ResourceName]*MemoryTable{}
for memoryType, memoryTable := range s.MemoryMap {
clone[node].MemoryMap[memoryType] = &MemoryTable{
Allocatable: memoryTable.Allocatable,
Free: memoryTable.Free,
Reserved: memoryTable.Reserved,
SystemReserved: memoryTable.SystemReserved,
TotalMemSize: memoryTable.TotalMemSize,
}
}
}
return clone
}
// Block is a data structure used to represent a certain amount of memory
type Block struct {
// NUMAAffinity contains the list of NUMA node IDs that represents the NUMA affinity bitmask
NUMAAffinity []int `json:"numaAffinity"`
Type v1.ResourceName `json:"type"`
Size uint64 `json:"size"`
}
// ContainerMemoryAssignments stores memory assignments of containers
type ContainerMemoryAssignments map[string]map[string][]Block
// Clone returns a copy of ContainerMemoryAssignments
func (as ContainerMemoryAssignments) Clone() ContainerMemoryAssignments {
clone := make(ContainerMemoryAssignments)
for pod := range as {
clone[pod] = make(map[string][]Block)
for container, blocks := range as[pod] {
clone[pod][container] = append([]Block{}, blocks...)
}
}
return clone
}
// Reader interface used to read current memory/pod assignment state
type Reader interface {
// GetMachineState returns Memory Map stored in the State
GetMachineState() NUMANodeMap
// GetMemoryBlocks returns memory assignments of a container
GetMemoryBlocks(podUID string, containerName string) []Block
// GetMemoryAssignments returns ContainerMemoryAssignments
GetMemoryAssignments() ContainerMemoryAssignments
}
type writer interface {
// SetMachineState stores NUMANodeMap in State
SetMachineState(memoryMap NUMANodeMap)
// SetMemoryBlocks stores memory assignments of a container
SetMemoryBlocks(podUID string, containerName string, blocks []Block)
// SetMemoryAssignments sets ContainerMemoryAssignments by using the passed parameter
SetMemoryAssignments(assignments ContainerMemoryAssignments)
// Delete deletes corresponding Blocks from ContainerMemoryAssignments
Delete(podUID string, containerName string)
// ClearState clears machineState and ContainerMemoryAssignments
ClearState()
}
// State interface provides methods for tracking and setting memory/pod assignment
type State interface {
Reader
writer
}
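// Illustrative sketch, not part of the vendored file: Clone deep-copies the
// per-node MemoryTable values, so mutations on the clone never leak back
// into the original map.
func cloneIsolationExample() {
	orig := NUMANodeMap{
		0: {
			NumberOfAssignments: 1,
			Cells:               []int{0},
			MemoryMap: map[v1.ResourceName]*MemoryTable{
				v1.ResourceMemory: {TotalMemSize: 1 << 30, Free: 1 << 29},
			},
		},
	}
	clone := orig.Clone()
	clone[0].MemoryMap[v1.ResourceMemory].Free = 0
	// orig[0].MemoryMap[v1.ResourceMemory].Free is still 1 << 29.
}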

View File

@ -0,0 +1,184 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"fmt"
"path/filepath"
"sync"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
var _ State = &stateCheckpoint{}
type stateCheckpoint struct {
sync.RWMutex
cache State
policyName string
checkpointManager checkpointmanager.CheckpointManager
checkpointName string
}
// NewCheckpointState creates a new State for keeping track of memory/pod assignments with a checkpoint backend
func NewCheckpointState(stateDir, checkpointName, policyName string) (State, error) {
checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
if err != nil {
return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
}
stateCheckpoint := &stateCheckpoint{
cache: NewMemoryState(),
policyName: policyName,
checkpointManager: checkpointManager,
checkpointName: checkpointName,
}
if err := stateCheckpoint.restoreState(); err != nil {
//nolint:staticcheck // ST1005 user-facing error message
return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete the memory manager checkpoint file %q before restarting Kubelet",
err, filepath.Join(stateDir, checkpointName))
}
return stateCheckpoint, nil
}
// restoreState restores the state from a checkpoint, creating the checkpoint if it doesn't exist
func (sc *stateCheckpoint) restoreState() error {
sc.Lock()
defer sc.Unlock()
var err error
checkpoint := NewMemoryManagerCheckpoint()
if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil {
if err == errors.ErrCheckpointNotFound {
return sc.storeState()
}
return err
}
if sc.policyName != checkpoint.PolicyName {
return fmt.Errorf("[memorymanager] configured policy %q differs from state checkpoint policy %q", sc.policyName, checkpoint.PolicyName)
}
sc.cache.SetMachineState(checkpoint.MachineState)
sc.cache.SetMemoryAssignments(checkpoint.Entries)
klog.V(2).InfoS("State checkpoint: restored state from checkpoint")
return nil
}
// storeState saves the state to a checkpoint; the caller is responsible for locking
func (sc *stateCheckpoint) storeState() error {
checkpoint := NewMemoryManagerCheckpoint()
checkpoint.PolicyName = sc.policyName
checkpoint.MachineState = sc.cache.GetMachineState()
checkpoint.Entries = sc.cache.GetMemoryAssignments()
err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
if err != nil {
klog.ErrorS(err, "Could not save checkpoint")
return err
}
return nil
}
// GetMachineState returns the machine (NUMA) state stored in the State
func (sc *stateCheckpoint) GetMachineState() NUMANodeMap {
sc.RLock()
defer sc.RUnlock()
return sc.cache.GetMachineState()
}
// GetMemoryBlocks returns memory assignments of a container
func (sc *stateCheckpoint) GetMemoryBlocks(podUID string, containerName string) []Block {
sc.RLock()
defer sc.RUnlock()
return sc.cache.GetMemoryBlocks(podUID, containerName)
}
// GetMemoryAssignments returns ContainerMemoryAssignments
func (sc *stateCheckpoint) GetMemoryAssignments() ContainerMemoryAssignments {
sc.RLock()
defer sc.RUnlock()
return sc.cache.GetMemoryAssignments()
}
// SetMachineState stores NUMANodeMap in State
func (sc *stateCheckpoint) SetMachineState(memoryMap NUMANodeMap) {
sc.Lock()
defer sc.Unlock()
sc.cache.SetMachineState(memoryMap)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// SetMemoryBlocks stores the memory assignments of a container
func (sc *stateCheckpoint) SetMemoryBlocks(podUID string, containerName string, blocks []Block) {
sc.Lock()
defer sc.Unlock()
sc.cache.SetMemoryBlocks(podUID, containerName, blocks)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// SetMemoryAssignments sets ContainerMemoryAssignments by using the passed parameter
func (sc *stateCheckpoint) SetMemoryAssignments(assignments ContainerMemoryAssignments) {
sc.Lock()
defer sc.Unlock()
sc.cache.SetMemoryAssignments(assignments)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// Delete deletes corresponding Blocks from ContainerMemoryAssignments
func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.Lock()
defer sc.Unlock()
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// ClearState clears machineState and ContainerMemoryAssignments
func (sc *stateCheckpoint) ClearState() {
sc.Lock()
defer sc.Unlock()
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
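// Illustrative usage sketch, not part of the vendored file; the state
// directory, checkpoint name, policy name and UIDs below are assumptions,
// and imports are elided. Every mutating call persists the full state via
// storeState, so callers construct the checkpointed state once and then use
// it like the in-memory variant.
func checkpointStateUsageExample() error {
	s, err := NewCheckpointState("/var/lib/kubelet", "memory_manager_state", "Static")
	if err != nil {
		return err
	}
	// Record one 1Gi assignment pinned to NUMA node 0; it is written
	// through to the checkpoint file immediately.
	s.SetMemoryBlocks("pod-uid-1", "container-1", []Block{
		{NUMAAffinity: []int{0}, Type: v1.ResourceMemory, Size: 1 << 30},
	})
	return nil
}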

View File

@ -0,0 +1,123 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"sync"
"k8s.io/klog/v2"
)
type stateMemory struct {
sync.RWMutex
assignments ContainerMemoryAssignments
machineState NUMANodeMap
}
var _ State = &stateMemory{}
// NewMemoryState creates a new State for keeping track of memory/pod assignments
func NewMemoryState() State {
klog.InfoS("Initializing new in-memory state store")
return &stateMemory{
assignments: ContainerMemoryAssignments{},
machineState: NUMANodeMap{},
}
}
// GetMachineState returns the machine (NUMA) state stored in the State
func (s *stateMemory) GetMachineState() NUMANodeMap {
s.RLock()
defer s.RUnlock()
return s.machineState.Clone()
}
// GetMemoryBlocks returns memory assignments of a container
func (s *stateMemory) GetMemoryBlocks(podUID string, containerName string) []Block {
s.RLock()
defer s.RUnlock()
if res, ok := s.assignments[podUID][containerName]; ok {
return append([]Block{}, res...)
}
return nil
}
// GetMemoryAssignments returns ContainerMemoryAssignments
func (s *stateMemory) GetMemoryAssignments() ContainerMemoryAssignments {
s.RLock()
defer s.RUnlock()
return s.assignments.Clone()
}
// SetMachineState stores NUMANodeMap in State
func (s *stateMemory) SetMachineState(nodeMap NUMANodeMap) {
s.Lock()
defer s.Unlock()
s.machineState = nodeMap.Clone()
klog.InfoS("Updated machine memory state")
}
// SetMemoryBlocks stores the memory assignments of a container
func (s *stateMemory) SetMemoryBlocks(podUID string, containerName string, blocks []Block) {
s.Lock()
defer s.Unlock()
if _, ok := s.assignments[podUID]; !ok {
s.assignments[podUID] = map[string][]Block{}
}
s.assignments[podUID][containerName] = append([]Block{}, blocks...)
klog.InfoS("Updated memory state", "podUID", podUID, "containerName", containerName)
}
// SetMemoryAssignments sets ContainerMemoryAssignments by using the passed parameter
func (s *stateMemory) SetMemoryAssignments(assignments ContainerMemoryAssignments) {
s.Lock()
defer s.Unlock()
s.assignments = assignments.Clone()
}
// Delete deletes corresponding Blocks from ContainerMemoryAssignments
func (s *stateMemory) Delete(podUID string, containerName string) {
s.Lock()
defer s.Unlock()
if _, ok := s.assignments[podUID]; !ok {
return
}
delete(s.assignments[podUID], containerName)
if len(s.assignments[podUID]) == 0 {
delete(s.assignments, podUID)
}
klog.V(2).InfoS("Deleted memory assignment", "podUID", podUID, "containerName", containerName)
}
// ClearState clears machineState and ContainerMemoryAssignments
func (s *stateMemory) ClearState() {
s.Lock()
defer s.Unlock()
s.machineState = NUMANodeMap{}
s.assignments = make(ContainerMemoryAssignments)
klog.V(2).InfoS("Cleared state")
}

View File

@ -0,0 +1,328 @@
//go:build linux
// +build linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"errors"
"fmt"
"strconv"
"strings"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)
const (
defaultNodeAllocatableCgroupName = "kubepods"
)
// createNodeAllocatableCgroups creates Node Allocatable Cgroup when CgroupsPerQOS flag is specified as true
func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
nodeAllocatable := cm.internalCapacity
// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
nc := cm.NodeConfig.NodeAllocatableConfig
if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) {
nodeAllocatable = cm.getNodeAllocatableInternalAbsolute()
}
cgroupConfig := &CgroupConfig{
Name: cm.cgroupRoot,
// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
ResourceParameters: cm.getCgroupConfig(nodeAllocatable, false),
}
if cm.cgroupManager.Exists(cgroupConfig.Name) {
return nil
}
if err := cm.cgroupManager.Create(cgroupConfig); err != nil {
klog.ErrorS(err, "Failed to create cgroup", "cgroupName", cm.cgroupRoot)
return err
}
return nil
}
// enforceNodeAllocatableCgroups enforces the Node Allocatable Cgroup settings.
func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
nc := cm.NodeConfig.NodeAllocatableConfig
// We need to update limits on node allocatable cgroup no matter what because
// default cpu shares on cgroups are low and can cause cpu starvation.
nodeAllocatable := cm.internalCapacity
// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) {
nodeAllocatable = cm.getNodeAllocatableInternalAbsolute()
}
klog.V(4).InfoS("Attempting to enforce Node Allocatable", "config", nc)
cgroupConfig := &CgroupConfig{
Name: cm.cgroupRoot,
ResourceParameters: cm.getCgroupConfig(nodeAllocatable, false),
}
// Use an ObjectReference for events as the node may not be cached; refer to #42701 for detail.
nodeRef := nodeRefFromNode(cm.nodeInfo.Name)
// If Node Allocatable is enforced on a node that has not been drained or is updated on an existing node to a lower value,
// existing memory usage across pods might be higher than current Node Allocatable Memory Limits.
// Pod Evictions are expected to bring down memory usage to below Node Allocatable limits.
// Until evictions happen retry cgroup updates.
// Update limits on non root cgroup-root to be safe since the default limits for CPU can be too low.
// Check if cgroupRoot is set to a non-empty value (empty would be the root container)
if len(cm.cgroupRoot) > 0 {
go func() {
for {
err := cm.cgroupManager.Update(cgroupConfig)
if err == nil {
cm.recorder.Event(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated Node Allocatable limit across pods")
return
}
message := fmt.Sprintf("Failed to update Node Allocatable Limits %q: %v", cm.cgroupRoot, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
time.Sleep(time.Minute)
}
}()
}
// Now apply kube reserved and system reserved limits if required.
if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
klog.V(2).InfoS("Enforcing system reserved on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved, false); err != nil {
message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return errors.New(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName)
}
if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
klog.V(2).InfoS("Enforcing kube reserved on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved, false); err != nil {
message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return errors.New(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName)
}
if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedCompressibleEnforcementKey) {
klog.V(2).InfoS("Enforcing system reserved compressible on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved, true); err != nil {
message := fmt.Sprintf("Failed to enforce System Reserved Compressible Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return errors.New(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName)
}
if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedCompressibleEnforcementKey) {
klog.V(2).InfoS("Enforcing kube reserved compressible on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved, true); err != nil {
message := fmt.Sprintf("Failed to enforce Kube Reserved Compressible Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return errors.New(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName)
}
return nil
}
// enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
func (cm *containerManagerImpl) enforceExistingCgroup(cNameStr string, rl v1.ResourceList, compressibleResources bool) error {
cName := cm.cgroupManager.CgroupName(cNameStr)
rp := cm.getCgroupConfig(rl, compressibleResources)
if rp == nil {
return fmt.Errorf("%q cgroup is not configured properly", cName)
}
// Enforce MemoryQoS for cgroups of kube-reserved/system-reserved. For more information,
// see https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) {
if rp.Memory != nil {
if rp.Unified == nil {
rp.Unified = make(map[string]string)
}
rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
}
}
cgroupConfig := &CgroupConfig{
Name: cName,
ResourceParameters: rp,
}
klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CPUShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit)
if err := cm.cgroupManager.Validate(cgroupConfig.Name); err != nil {
return err
}
if err := cm.cgroupManager.Update(cgroupConfig); err != nil {
return err
}
return nil
}
// getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
func (cm *containerManagerImpl) getCgroupConfig(rl v1.ResourceList, compressibleResourcesOnly bool) *ResourceConfig {
rc := getCgroupConfigInternal(rl, compressibleResourcesOnly)
if rc == nil {
return nil
}
// In the case of a None policy, cgroupv2 and systemd cgroup manager, we must make sure systemd is aware of the cpuset cgroup.
// By default, systemd will not create it, as we've not chosen to delegate it, and we haven't included it in the Apply() request.
// However, this causes a bug where kubelet restarts unnecessarily (cpuset cgroup is created in the cgroupfs, but systemd
// doesn't know about it and deletes it, and then kubelet doesn't continue because the cgroup isn't configured as expected).
// An alternative is to delegate the `cpuset` cgroup to the kubelet, but that would require some plumbing in libcontainer,
// and this is sufficient.
// Only do so on None policy, as Static policy will do its own updating of the cpuset.
// Please see the comment on policy none's GetAllocatableCPUs
if cm.cpuManager.GetAllocatableCPUs().IsEmpty() {
rc.CPUSet = cm.cpuManager.GetAllCPUs()
}
return rc
}
// getCgroupConfigInternal contains the pieces of getCgroupConfig that don't require the cm object.
// It exists so the logic can be unit tested without creating a full containerManager.
func getCgroupConfigInternal(rl v1.ResourceList, compressibleResourcesOnly bool) *ResourceConfig {
// TODO(vishh): Set CPU Quota if necessary.
if rl == nil {
return nil
}
var rc ResourceConfig
setCompressibleResources := func() {
if q, exists := rl[v1.ResourceCPU]; exists {
// CPU is defined in milli-cores.
val := MilliCPUToShares(q.MilliValue())
rc.CPUShares = &val
}
}
// Only return compressible resources
if compressibleResourcesOnly {
setCompressibleResources()
} else {
if q, exists := rl[v1.ResourceMemory]; exists {
// Memory is defined in bytes.
val := q.Value()
rc.Memory = &val
}
setCompressibleResources()
if q, exists := rl[pidlimit.PIDs]; exists {
val := q.Value()
rc.PidsLimit = &val
}
rc.HugePageLimit = HugePageLimits(rl)
}
return &rc
}
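// Illustrative sketch, not part of the vendored file: the same ResourceList
// yields differently shaped ResourceConfig values depending on the
// compressibleResourcesOnly flag, which is what the *Compressible
// enforcement keys rely on.
func cgroupConfigShapesExample() {
	rl := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("500m"),
		v1.ResourceMemory: resource.MustParse("1Gi"),
	}
	// full has CPUShares, Memory and HugePageLimit populated.
	full := getCgroupConfigInternal(rl, false)
	// compressible has only CPUShares; memory and pids limits stay nil.
	compressible := getCgroupConfigInternal(rl, true)
	_, _ = full, compressible
}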
// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
// Note that not all resources that are available on the node are included in the returned list of resources.
// Returns a ResourceList.
func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList {
return cm.getNodeAllocatableAbsoluteImpl(cm.capacity)
}
func (cm *containerManagerImpl) getNodeAllocatableAbsoluteImpl(capacity v1.ResourceList) v1.ResourceList {
result := make(v1.ResourceList)
for k, v := range capacity {
value := v.DeepCopy()
if cm.NodeConfig.SystemReserved != nil {
value.Sub(cm.NodeConfig.SystemReserved[k])
}
if cm.NodeConfig.KubeReserved != nil {
value.Sub(cm.NodeConfig.KubeReserved[k])
}
if value.Sign() < 0 {
// Negative Allocatable resources don't make sense.
value.Set(0)
}
result[k] = value
}
return result
}
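// Illustrative sketch with assumed numbers, not part of the vendored file:
// allocatable is capacity minus the reservations, clamped at zero so that
// over-reservation can never produce a negative value. Here,
// 8Gi capacity - 1Gi system-reserved - 500Mi kube-reserved = 6.5Gi allocatable.
func allocatableMemoryExample() resource.Quantity {
	value := resource.MustParse("8Gi")
	value.Sub(resource.MustParse("1Gi"))
	value.Sub(resource.MustParse("500Mi"))
	if value.Sign() < 0 {
		value.Set(0) // mirrors the clamping in getNodeAllocatableAbsoluteImpl
	}
	return value
}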
// getNodeAllocatableInternalAbsolute is similar to getNodeAllocatableAbsolute except that
// it also includes internal resources (currently process IDs). It is intended for setting
// up top level cgroups only.
func (cm *containerManagerImpl) getNodeAllocatableInternalAbsolute() v1.ResourceList {
return cm.getNodeAllocatableAbsoluteImpl(cm.internalCapacity)
}
// GetNodeAllocatableReservation returns the amount of compute and storage resources that have to be reserved on this node from scheduling.
func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
result := make(v1.ResourceList)
for k := range cm.capacity {
value := resource.NewQuantity(0, resource.DecimalSI)
if cm.NodeConfig.SystemReserved != nil {
value.Add(cm.NodeConfig.SystemReserved[k])
}
if cm.NodeConfig.KubeReserved != nil {
value.Add(cm.NodeConfig.KubeReserved[k])
}
if evictionReservation != nil {
value.Add(evictionReservation[k])
}
if !value.IsZero() {
result[k] = *value
}
}
return result
}
// validateNodeAllocatable ensures that the user specified Node Allocatable Configuration doesn't reserve more than the node capacity.
// Returns error if the configuration is invalid, nil otherwise.
func (cm *containerManagerImpl) validateNodeAllocatable() error {
var errors []string
nar := cm.GetNodeAllocatableReservation()
for k, v := range nar {
value := cm.capacity[k].DeepCopy()
value.Sub(v)
if value.Sign() < 0 {
errors = append(errors, fmt.Sprintf("Resource %q has a reservation of %v but capacity of %v. Expected capacity >= reservation.", k, v, cm.capacity[k]))
}
}
if len(errors) > 0 {
return fmt.Errorf("invalid Node Allocatable configuration. %s", strings.Join(errors, " "))
}
return nil
}
// Use an ObjectReference for events as the node may not be cached; refer to #42701 for detail.
func nodeRefFromNode(nodeName string) *v1.ObjectReference {
return &v1.ObjectReference{
Kind: "Node",
Name: nodeName,
UID: types.UID(nodeName),
Namespace: "",
}
}

View File

@ -0,0 +1,355 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"errors"
"fmt"
"os"
"path"
"strings"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
)
const (
podCgroupNamePrefix = "pod"
)
// podContainerManagerImpl implements podContainerManager interface.
// It is the general implementation which allows pod level container
// management if qos Cgroup is enabled.
type podContainerManagerImpl struct {
// qosContainersInfo hold absolute paths of the top level qos containers
qosContainersInfo QOSContainersInfo
// Stores the mounted cgroup subsystems
subsystems *CgroupSubsystems
// cgroupManager is the cgroup Manager Object responsible for managing all
// pod cgroups.
cgroupManager CgroupManager
// Maximum number of pids in a pod
podPidsLimit int64
// enforceCPULimits controls whether cfs quota is enforced or not
enforceCPULimits bool
// cpuCFSQuotaPeriod is the cfs period value, cfs_period_us, setting per
// node for all containers in usec
cpuCFSQuotaPeriod uint64
}
// Make sure that podContainerManagerImpl implements the PodContainerManager interface
var _ PodContainerManager = &podContainerManagerImpl{}
// Exists checks if the pod's cgroup already exists
func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
podContainerName, _ := m.GetPodContainerName(pod)
return m.cgroupManager.Exists(podContainerName)
}
// EnsureExists takes a pod as argument and makes sure that
// the pod cgroup exists if the qos cgroup hierarchy flag is enabled.
// If the pod level container doesn't already exist it is created.
func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
// check if container already exist
alreadyExists := m.Exists(pod)
if !alreadyExists {
enforceMemoryQoS := false
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
// Create the pod container
podContainerName, _ := m.GetPodContainerName(pod)
containerConfig := &CgroupConfig{
Name: podContainerName,
ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
}
if m.podPidsLimit > 0 {
containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
}
if enforceMemoryQoS {
klog.V(4).InfoS("MemoryQoS config for pod", "pod", klog.KObj(pod), "unified", containerConfig.ResourceParameters.Unified)
}
if err := m.cgroupManager.Create(containerConfig); err != nil {
return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
}
}
return nil
}
// GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
podQOS := v1qos.GetPodQOS(pod)
// Get the parent QOS container name
var parentContainer CgroupName
switch podQOS {
case v1.PodQOSGuaranteed:
parentContainer = m.qosContainersInfo.Guaranteed
case v1.PodQOSBurstable:
parentContainer = m.qosContainersInfo.Burstable
case v1.PodQOSBestEffort:
parentContainer = m.qosContainersInfo.BestEffort
}
podContainer := GetPodCgroupNameSuffix(pod.UID)
// Get the absolute path of the cgroup
cgroupName := NewCgroupName(parentContainer, podContainer)
// Get the literal cgroupfs name
cgroupfsName := m.cgroupManager.Name(cgroupName)
return cgroupName, cgroupfsName
}
func (m *podContainerManagerImpl) GetPodCgroupMemoryUsage(pod *v1.Pod) (uint64, error) {
podCgroupName, _ := m.GetPodContainerName(pod)
memUsage, err := m.cgroupManager.MemoryUsage(podCgroupName)
if err != nil {
return 0, err
}
return uint64(memUsage), nil
}
func (m *podContainerManagerImpl) GetPodCgroupConfig(pod *v1.Pod, resource v1.ResourceName) (*ResourceConfig, error) {
podCgroupName, _ := m.GetPodContainerName(pod)
return m.cgroupManager.GetCgroupConfig(podCgroupName, resource)
}
func (m *podContainerManagerImpl) SetPodCgroupConfig(pod *v1.Pod, resourceConfig *ResourceConfig) error {
podCgroupName, _ := m.GetPodContainerName(pod)
return m.cgroupManager.SetCgroupConfig(podCgroupName, resourceConfig)
}
// Kill one process ID
func (m *podContainerManagerImpl) killOnePid(pid int) error {
// os.FindProcess never returns an error on POSIX
// https://go-review.googlesource.com/c/go/+/19093
p, _ := os.FindProcess(pid)
if err := p.Kill(); err != nil {
// If the process already exited, that's fine.
if errors.Is(err, os.ErrProcessDone) {
klog.V(3).InfoS("Process no longer exists", "pid", pid)
return nil
}
return err
}
return nil
}
// Scan through the whole cgroup directory and kill all processes either
// attached to the pod cgroup or to a container cgroup under the pod cgroup
func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error {
pidsToKill := m.cgroupManager.Pids(podCgroup)
// If no pids are charged to the terminated pod cgroup, there is nothing to kill
if len(pidsToKill) == 0 {
return nil
}
var errlist []error
// os.Kill often errors out, so we try
// killing all the pids multiple times
removed := map[int]bool{}
for i := 0; i < 5; i++ {
if i != 0 {
klog.V(3).InfoS("Attempt failed to kill all unwanted process from cgroup, retrying", "attempt", i, "cgroupName", podCgroup)
}
errlist = []error{}
for _, pid := range pidsToKill {
if _, ok := removed[pid]; ok {
continue
}
klog.V(3).InfoS("Attempting to kill process from cgroup", "pid", pid, "cgroupName", podCgroup)
if err := m.killOnePid(pid); err != nil {
klog.V(3).InfoS("Failed to kill process from cgroup", "pid", pid, "cgroupName", podCgroup, "err", err)
errlist = append(errlist, err)
} else {
removed[pid] = true
}
}
if len(errlist) == 0 {
klog.V(3).InfoS("Successfully killed all unwanted processes from cgroup", "cgroupName", podCgroup)
return nil
}
}
return utilerrors.NewAggregate(errlist)
}
// Destroy destroys the pod container cgroup paths
func (m *podContainerManagerImpl) Destroy(podCgroup CgroupName) error {
// Try killing all the processes attached to the pod cgroup
if err := m.tryKillingCgroupProcesses(podCgroup); err != nil {
klog.InfoS("Failed to kill all the processes attached to cgroup", "cgroupName", podCgroup, "err", err)
return fmt.Errorf("failed to kill all the processes attached to the %v cgroups : %v", podCgroup, err)
}
// Now it's safe to remove the pod's cgroup
containerConfig := &CgroupConfig{
Name: podCgroup,
ResourceParameters: &ResourceConfig{},
}
if err := m.cgroupManager.Destroy(containerConfig); err != nil {
klog.InfoS("Failed to delete cgroup paths", "cgroupName", podCgroup, "err", err)
return fmt.Errorf("failed to delete cgroup paths for %v : %v", podCgroup, err)
}
return nil
}
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error {
return m.cgroupManager.ReduceCPULimits(podCgroup)
}
// IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod
func (m *podContainerManagerImpl) IsPodCgroup(cgroupfs string) (bool, types.UID) {
// convert the literal cgroupfs form to the driver specific value
cgroupName := m.cgroupManager.CgroupName(cgroupfs)
qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
basePath := ""
for _, qosContainerName := range qosContainersList {
// a pod cgroup is a direct child of a qos node, so check if it's a match
if len(cgroupName) == len(qosContainerName)+1 {
basePath = cgroupName[len(qosContainerName)]
}
}
if basePath == "" {
return false, types.UID("")
}
if !strings.HasPrefix(basePath, podCgroupNamePrefix) {
return false, types.UID("")
}
parts := strings.Split(basePath, podCgroupNamePrefix)
if len(parts) != 2 {
return false, types.UID("")
}
return true, types.UID(parts[1])
}
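// Illustrative sketch, not part of the vendored file; the path segment is an
// assumption. The pod UID is recovered purely from the last cgroup path
// segment, which must be a direct child of a QoS cgroup and carry the "pod"
// prefix.
func podCgroupParseExample() types.UID {
	// e.g. the last segment of /kubepods/burstable/pod1234-abcd
	base := "pod1234-abcd"
	if !strings.HasPrefix(base, podCgroupNamePrefix) {
		return types.UID("")
	}
	parts := strings.Split(base, podCgroupNamePrefix)
	if len(parts) != 2 {
		return types.UID("")
	}
	return types.UID(parts[1]) // "1234-abcd"
}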
// GetAllPodsFromCgroups scans through all the subsystems of pod cgroups
// to get the list of pods whose cgroups still exist on the cgroup mounts
func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
// Map for storing all the found pods on the disk
foundPods := make(map[types.UID]CgroupName)
qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
// Scan through all the subsystem mounts
// and through each QoS cgroup directory for each subsystem mount
// If a pod cgroup exists in even a single subsystem mount
// we will attempt to delete it
for _, val := range m.subsystems.MountPoints {
for _, qosContainerName := range qosContainersList {
// get the subsystems QoS cgroup absolute name
qcConversion := m.cgroupManager.Name(qosContainerName)
qc := path.Join(val, qcConversion)
dirInfo, err := os.ReadDir(qc)
if err != nil {
if os.IsNotExist(err) {
continue
}
return nil, fmt.Errorf("failed to read the cgroup directory %v : %v", qc, err)
}
for i := range dirInfo {
// it's not a directory, so continue on...
if !dirInfo[i].IsDir() {
continue
}
// convert the concrete cgroupfs name back to an internal identifier
// this is needed to handle path conversion for systemd environments.
// we pass the fully qualified path so decoding can work as expected
// since systemd encodes the path in each segment.
cgroupfsPath := path.Join(qcConversion, dirInfo[i].Name())
internalPath := m.cgroupManager.CgroupName(cgroupfsPath)
// we only care about base segment of the converted path since that
// is what we are reading currently to know if it is a pod or not.
basePath := internalPath[len(internalPath)-1]
if !strings.Contains(basePath, podCgroupNamePrefix) {
continue
}
// we then split the name on the pod prefix to determine the uid
parts := strings.Split(basePath, podCgroupNamePrefix)
// the uid is missing, so log the unexpected cgroup that is not of the form pod<uid>
if len(parts) != 2 {
klog.InfoS("Pod cgroup manager ignored unexpected cgroup because it is not a pod", "path", cgroupfsPath)
continue
}
podUID := parts[1]
foundPods[types.UID(podUID)] = internalPath
}
}
}
return foundPods, nil
}
// podContainerManagerNoop implements the podContainerManager interface.
// It is a no-op implementation that is used when the QoS cgroup hierarchy
// is not enabled; Exists() always returns true, as the cgroupRoot is
// expected to always exist.
type podContainerManagerNoop struct {
cgroupRoot CgroupName
}
// Make sure that podContainerManagerNoop implements the PodContainerManager interface
var _ PodContainerManager = &podContainerManagerNoop{}
func (m *podContainerManagerNoop) Exists(_ *v1.Pod) bool {
return true
}
func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error {
return nil
}
func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return m.cgroupRoot, ""
}
func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string {
return ""
}
// Destroy destroys the pod container cgroup paths
func (m *podContainerManagerNoop) Destroy(_ CgroupName) error {
return nil
}
func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error {
return nil
}
func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}
func (m *podContainerManagerNoop) IsPodCgroup(cgroupfs string) (bool, types.UID) {
return false, types.UID("")
}
func (m *podContainerManagerNoop) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) {
return 0, nil
}
func (m *podContainerManagerNoop) GetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName) (*ResourceConfig, error) {
return nil, nil
}
func (m *podContainerManagerNoop) SetPodCgroupConfig(_ *v1.Pod, _ *ResourceConfig) error {
return nil
}

View File

@ -0,0 +1,75 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
type podContainerManagerStub struct {
}
var _ PodContainerManager = &podContainerManagerStub{}
func (m *podContainerManagerStub) Exists(_ *v1.Pod) bool {
return true
}
func (m *podContainerManagerStub) EnsureExists(_ *v1.Pod) error {
return nil
}
func (m *podContainerManagerStub) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return nil, ""
}
func (m *podContainerManagerStub) Destroy(_ CgroupName) error {
return nil
}
func (m *podContainerManagerStub) ReduceCPULimits(_ CgroupName) error {
return nil
}
func (m *podContainerManagerStub) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}
func (m *podContainerManagerStub) IsPodCgroup(cgroupfs string) (bool, types.UID) {
return false, types.UID("")
}
func (m *podContainerManagerStub) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) {
return 0, nil
}
func (m *podContainerManagerStub) GetPodCgroupMemoryLimit(_ *v1.Pod) (uint64, error) {
return 0, nil
}
func (m *podContainerManagerStub) GetPodCgroupCpuLimit(_ *v1.Pod) (int64, uint64, uint64, error) {
return 0, 0, 0, nil
}
func (m *podContainerManagerStub) SetPodCgroupMemoryLimit(_ *v1.Pod, _ int64) error {
return nil
}
func (m *podContainerManagerStub) SetPodCgroupCpuLimit(_ *v1.Pod, _ *int64, _, _ *uint64) error {
return nil
}

View File

@ -0,0 +1,407 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"strconv"
"strings"
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/util/wait"
units "github.com/docker/go-units"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/resource"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
)
const (
// how often the qos cgroup manager will perform periodic update
// of the qos level cgroup resource constraints
periodicQOSCgroupUpdateInterval = 1 * time.Minute
)
type QOSContainerManager interface {
Start(func() v1.ResourceList, ActivePodsFunc) error
GetQOSContainersInfo() QOSContainersInfo
UpdateCgroups() error
}
type qosContainerManagerImpl struct {
sync.Mutex
qosContainersInfo QOSContainersInfo
subsystems *CgroupSubsystems
cgroupManager CgroupManager
activePods ActivePodsFunc
getNodeAllocatable func() v1.ResourceList
cgroupRoot CgroupName
qosReserved map[v1.ResourceName]int64
}
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
if !nodeConfig.CgroupsPerQOS {
return &qosContainerManagerNoop{
cgroupRoot: cgroupRoot,
}, nil
}
return &qosContainerManagerImpl{
subsystems: subsystems,
cgroupManager: cgroupManager,
cgroupRoot: cgroupRoot,
qosReserved: nodeConfig.QOSReserved,
}, nil
}
func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
return m.qosContainersInfo
}
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
cm := m.cgroupManager
rootContainer := m.cgroupRoot
if err := cm.Validate(rootContainer); err != nil {
return fmt.Errorf("error validating root container %v : %w", rootContainer, err)
}
// Top level QoS containers are created only for the Burstable
// and Best Effort classes
qosClasses := map[v1.PodQOSClass]CgroupName{
v1.PodQOSBurstable: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
}
// Create containers for both qos classes
for qosClass, containerName := range qosClasses {
resourceParameters := &ResourceConfig{}
// the BestEffort QoS class has a statically configured minShares value
if qosClass == v1.PodQOSBestEffort {
minShares := uint64(MinShares)
resourceParameters.CPUShares = &minShares
}
// containerConfig object stores the cgroup specifications
containerConfig := &CgroupConfig{
Name: containerName,
ResourceParameters: resourceParameters,
}
// for each enumerated huge page size, the qos tiers are unbounded
m.setHugePagesUnbounded(containerConfig)
// check if it exists
if !cm.Exists(containerName) {
if err := cm.Create(containerConfig); err != nil {
return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
}
} else {
// to ensure we actually have the right state, we update the config on startup
if err := cm.Update(containerConfig); err != nil {
return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
}
}
}
// Store the top level qos container names
m.qosContainersInfo = QOSContainersInfo{
Guaranteed: rootContainer,
Burstable: qosClasses[v1.PodQOSBurstable],
BestEffort: qosClasses[v1.PodQOSBestEffort],
}
m.getNodeAllocatable = getNodeAllocatable
m.activePods = activePods
// update qos cgroup tiers on startup and in periodic intervals
// to ensure desired state is in sync with actual state.
go wait.Until(func() {
err := m.UpdateCgroups()
if err != nil {
klog.InfoS("Failed to reserve QoS requests", "err", err)
}
}, periodicQOSCgroupUpdateInterval, wait.NeverStop)
return nil
}
// setHugePagesUnbounded ensures hugetlb is effectively unbounded
func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
hugePageLimit := map[int64]int64{}
for _, pageSize := range libcontainercgroups.HugePageSizes() {
pageSizeBytes, err := units.RAMInBytes(pageSize)
if err != nil {
return err
}
hugePageLimit[pageSizeBytes] = int64(1 << 62)
}
cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
return nil
}
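// Illustrative sketch, not part of the vendored file; the page sizes are
// assumptions for a typical x86_64 host. Each discovered huge page size is
// mapped to 2^62 bytes, which is effectively "no limit" while still being a
// value the kernel accepts.
func hugePagesUnboundedExample() map[int64]int64 {
	return map[int64]int64{
		2 << 20: 1 << 62, // 2Mi pages
		1 << 30: 1 << 62, // 1Gi pages
	}
}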
func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
for _, v := range configs {
if err := m.setHugePagesUnbounded(v); err != nil {
return err
}
}
return nil
}
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
pods := m.activePods()
burstablePodCPURequest := int64(0)
reuseReqs := make(v1.ResourceList, 4)
for i := range pods {
pod := pods[i]
qosClass := v1qos.GetPodQOS(pod)
if qosClass != v1.PodQOSBurstable {
// we only care about the burstable qos tier
continue
}
req := resource.PodRequests(pod, resource.PodResourcesOptions{
Reuse: reuseReqs,
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.PodLevelResources),
})
if request, found := req[v1.ResourceCPU]; found {
burstablePodCPURequest += request.MilliValue()
}
}
// make sure best effort is always 2 shares
bestEffortCPUShares := uint64(MinShares)
configs[v1.PodQOSBestEffort].ResourceParameters.CPUShares = &bestEffortCPUShares
// set burstable shares based on the currently observed state
burstableCPUShares := MilliCPUToShares(burstablePodCPURequest)
configs[v1.PodQOSBurstable].ResourceParameters.CPUShares = &burstableCPUShares
return nil
}
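// Illustrative sketch with assumed requests, not part of the vendored file:
// burstable shares track the live sum of burstable CPU requests, while
// BestEffort stays pinned at MinShares.
func burstableSharesExample() uint64 {
	// Two burstable pods requesting 500m and 250m CPU: 750 milli-CPU total.
	burstableMilliCPU := int64(500 + 250)
	// MilliCPUToShares scales milli-CPU to cgroup shares:
	// 750 * 1024 / 1000 = 768.
	return MilliCPUToShares(burstableMilliCPU)
}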
// getQoSMemoryRequests sums and returns the memory requests of all pods in the
// guaranteed and burstable qos classes.
func (m *qosContainerManagerImpl) getQoSMemoryRequests() map[v1.PodQOSClass]int64 {
qosMemoryRequests := map[v1.PodQOSClass]int64{
v1.PodQOSGuaranteed: 0,
v1.PodQOSBurstable: 0,
}
// Sum the pod memory requests for pods in each QOS class
pods := m.activePods()
reuseReqs := make(v1.ResourceList, 4)
for _, pod := range pods {
podMemoryRequest := int64(0)
qosClass := v1qos.GetPodQOS(pod)
if qosClass == v1.PodQOSBestEffort {
// requests are not set for Best Effort pods
continue
}
req := resource.PodRequests(pod, resource.PodResourcesOptions{Reuse: reuseReqs})
if request, found := req[v1.ResourceMemory]; found {
podMemoryRequest += request.Value()
}
qosMemoryRequests[qosClass] += podMemoryRequest
}
return qosMemoryRequests
}
// setMemoryReserve sums the memory requests of all pods in a QOS class,
// calculates the QOS class memory limits, and sets those limits in the
// CgroupConfig for each QOS class.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
qosMemoryRequests := m.getQoSMemoryRequests()
resources := m.getNodeAllocatable()
allocatableResource, ok := resources[v1.ResourceMemory]
if !ok {
klog.V(2).InfoS("Allocatable memory value could not be determined, not setting QoS memory limits")
return
}
allocatable := allocatableResource.Value()
if allocatable == 0 {
klog.V(2).InfoS("Allocatable memory reported as 0, might be in standalone mode, not setting QoS memory limits")
return
}
for qos, limits := range qosMemoryRequests {
klog.V(2).InfoS("QoS pod memory limit", "qos", qos, "limits", limits, "percentReserve", percentReserve)
}
// Calculate QOS memory limits
burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
}
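// Illustrative sketch with assumed numbers, not part of the vendored file:
// the reserve cascades top-down, so the Burstable limit shields Guaranteed
// memory and the BestEffort limit additionally shields Burstable memory.
func memoryReserveExample() (burstableLimit, bestEffortLimit int64) {
	const gi = int64(1) << 30
	allocatable := 10 * gi
	guaranteedRequests, burstableRequests := 4*gi, 2*gi
	percentReserve := int64(50)
	// burstableLimit = 10Gi - 4Gi*50/100 = 8Gi
	burstableLimit = allocatable - guaranteedRequests*percentReserve/100
	// bestEffortLimit = 8Gi - 2Gi*50/100 = 7Gi
	bestEffortLimit = burstableLimit - burstableRequests*percentReserve/100
	return burstableLimit, bestEffortLimit
}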
// retrySetMemoryReserve checks for any QoS cgroups over the limit
// that was attempted to be set in the first Update() and adjusts
// their memory limit to the usage to prevent further growth.
func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
// Unreclaimable memory usage may have already exceeded the desired limit
// Attempt to set the limit near the current usage to put pressure
// on the cgroup and prevent further growth.
for qos, config := range configs {
usage, err := m.cgroupManager.MemoryUsage(config.Name)
if err != nil {
klog.V(2).InfoS("Failed to get resource stats", "err", err)
return
}
// Because there is no good way to determine if the original Update()
// on the memory resource was successful, we determine failure of the
// first attempt by checking if the usage is above the limit we attempt
// to set. If it is, we assume the first attempt to set the limit failed
// and try again setting the limit to the usage. Otherwise we leave
// the CgroupConfig as is.
if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
configs[qos].ResourceParameters.Memory = &usage
}
}
}
// setMemoryQoS sums the memory requests of all pods in the Burstable class,
// and sets the sum as memory.min in the Unified field of the CgroupConfig.
func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*CgroupConfig) {
qosMemoryRequests := m.getQoSMemoryRequests()
// Calculate the memory.min:
// for burstable(/kubepods/burstable): sum of all burstable pods
// for guaranteed(/kubepods): sum of all guaranteed and burstable pods
burstableMin := qosMemoryRequests[v1.PodQOSBurstable]
guaranteedMin := qosMemoryRequests[v1.PodQOSGuaranteed] + burstableMin
if burstableMin > 0 {
if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
}
if guaranteedMin > 0 {
if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
}
}
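A rough sketch of the values this produces on the unified (cgroup v2) hierarchy, with the request sums assumed; per the comment above, the guaranteed value lands on /kubepods and the burstable value on /kubepods/burstable:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	burstableMin := int64(1 << 30)               // sum of Burstable requests (assumed)
	guaranteedMin := int64(2<<30) + burstableMin // Guaranteed sum also covers Burstable

	// memory.min values keyed the same way as ResourceParameters.Unified.
	kubepods := map[string]string{"memory.min": strconv.FormatInt(guaranteedMin, 10)}
	burstable := map[string]string{"memory.min": strconv.FormatInt(burstableMin, 10)}
	fmt.Println(kubepods["memory.min"], burstable["memory.min"]) // 3221225472 1073741824
}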
func (m *qosContainerManagerImpl) UpdateCgroups() error {
m.Lock()
defer m.Unlock()
qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
v1.PodQOSGuaranteed: {
Name: m.qosContainersInfo.Guaranteed,
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBurstable: {
Name: m.qosContainersInfo.Burstable,
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBestEffort: {
Name: m.qosContainersInfo.BestEffort,
ResourceParameters: &ResourceConfig{},
},
}
// update the qos level cgroup settings for cpu shares
if err := m.setCPUCgroupConfig(qosConfigs); err != nil {
return err
}
// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
if err := m.setHugePagesConfig(qosConfigs); err != nil {
return err
}
// update the qos level cgroups v2 settings for memory qos if the feature is enabled
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
m.setMemoryQoS(qosConfigs)
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.setMemoryReserve(qosConfigs, percentReserve)
}
}
updateSuccess := true
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
updateSuccess = false
}
}
if updateSuccess {
klog.V(4).InfoS("Updated QoS cgroup configuration")
return nil
}
// If the resource can adjust the ResourceConfig to increase likelihood of
// success, call the adjustment function here. Otherwise, the Update() will
// be called again with the same values.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.retrySetMemoryReserve(qosConfigs, percentReserve)
}
}
}
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
klog.ErrorS(err, "Failed to update QoS cgroup configuration")
return err
}
}
klog.V(4).InfoS("Updated QoS cgroup configuration")
return nil
}
type qosContainerManagerNoop struct {
cgroupRoot CgroupName
}
var _ QOSContainerManager = &qosContainerManagerNoop{}
func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
return nil
}
func (m *qosContainerManagerNoop) UpdateCgroups() error {
return nil
}

View File

@ -0,0 +1,25 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package resourceupdates
// Update is a struct that represents an update to a pod when
// a resource changes its status.
// Later we may need to add fields like container name, resource name, and a new status.
type Update struct {
// PodUIDs are the UIDs of the pods whose status needs to be updated.
PodUIDs []string
}

View File

@ -0,0 +1,9 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- derekwaynecarr
- klueska
reviewers: []
emeritus_approvers:
- ConnorDoyle
- lmdaly

View File

@ -0,0 +1,222 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package bitmask
import (
"fmt"
"math/bits"
"strconv"
)
// BitMask interface allows hint providers to create BitMasks for TopologyHints
type BitMask interface {
Add(bits ...int) error
Remove(bits ...int) error
And(masks ...BitMask)
Or(masks ...BitMask)
Clear()
Fill()
IsEqual(mask BitMask) bool
IsEmpty() bool
IsSet(bit int) bool
AnySet(bits []int) bool
IsNarrowerThan(mask BitMask) bool
IsLessThan(mask BitMask) bool
IsGreaterThan(mask BitMask) bool
String() string
Count() int
GetBits() []int
}
type bitMask uint64
// NewEmptyBitMask creates a new, empty BitMask
func NewEmptyBitMask() BitMask {
s := bitMask(0)
return &s
}
// NewBitMask creates a new BitMask
func NewBitMask(bits ...int) (BitMask, error) {
s := bitMask(0)
err := (&s).Add(bits...)
if err != nil {
return nil, err
}
return &s, nil
}
// Add adds the bits with topology affinity to the BitMask
func (s *bitMask) Add(bits ...int) error {
mask := *s
for _, i := range bits {
if i < 0 || i >= 64 {
return fmt.Errorf("bit number must be in range 0-63")
}
mask |= 1 << uint64(i)
}
*s = mask
return nil
}
// Remove removes specified bits from BitMask
func (s *bitMask) Remove(bits ...int) error {
mask := *s
for _, i := range bits {
if i < 0 || i >= 64 {
return fmt.Errorf("bit number must be in range 0-63")
}
mask &^= 1 << uint64(i)
}
*s = mask
return nil
}
// And performs a bitwise AND operation with all given masks
func (s *bitMask) And(masks ...BitMask) {
for _, m := range masks {
*s &= *m.(*bitMask)
}
}
// Or performs a bitwise OR operation with all given masks
func (s *bitMask) Or(masks ...BitMask) {
for _, m := range masks {
*s |= *m.(*bitMask)
}
}
// Clear resets all bits in mask to zero
func (s *bitMask) Clear() {
*s = 0
}
// Fill sets all bits in mask to one
func (s *bitMask) Fill() {
*s = bitMask(^uint64(0))
}
// IsEmpty checks mask to see if all bits are zero
func (s *bitMask) IsEmpty() bool {
return *s == 0
}
// IsSet checks bit in mask to see if bit is set to one
func (s *bitMask) IsSet(bit int) bool {
if bit < 0 || bit >= 64 {
return false
}
return (*s & (1 << uint64(bit))) > 0
}
// AnySet checks bit in mask to see if any provided bit is set to one
func (s *bitMask) AnySet(bits []int) bool {
for _, b := range bits {
if s.IsSet(b) {
return true
}
}
return false
}
// IsEqual checks if masks are equal
func (s *bitMask) IsEqual(mask BitMask) bool {
return *s == *mask.(*bitMask)
}
// IsNarrowerThan checks if one mask is narrower than another.
//
// A mask is said to be "narrower" than another if it has fewer bits set. If the
// same number of bits are set in both masks, then the mask with more
// lower-numbered bits set wins out.
func (s *bitMask) IsNarrowerThan(mask BitMask) bool {
if s.Count() == mask.Count() {
return s.IsLessThan(mask)
}
return s.Count() < mask.Count()
}
// IsLessThan checks which bitmask has more lower-numbered bits set.
func (s *bitMask) IsLessThan(mask BitMask) bool {
return *s < *mask.(*bitMask)
}
// IsGreaterThan checks which bitmask has more higher-numbered bits set.
func (s *bitMask) IsGreaterThan(mask BitMask) bool {
return *s > *mask.(*bitMask)
}
// String converts mask to string
func (s *bitMask) String() string {
grouping := 2
for shift := 64 - grouping; shift > 0; shift -= grouping {
if *s > (1 << uint(shift)) {
return fmt.Sprintf("%0"+strconv.Itoa(shift+grouping)+"b", *s)
}
}
return fmt.Sprintf("%0"+strconv.Itoa(grouping)+"b", *s)
}
// Count counts number of bits in mask set to one
func (s *bitMask) Count() int {
return bits.OnesCount64(uint64(*s))
}
// GetBits returns each bit number that is set to one
func (s *bitMask) GetBits() []int {
var bits []int
for i := uint64(0); i < 64; i++ {
if (*s & (1 << i)) > 0 {
bits = append(bits, int(i))
}
}
return bits
}
// And is a package level implementation of 'and' between first and masks
func And(first BitMask, masks ...BitMask) BitMask {
s := *first.(*bitMask)
s.And(masks...)
return &s
}
// Or is a package level implementation of 'or' between first and masks
func Or(first BitMask, masks ...BitMask) BitMask {
s := *first.(*bitMask)
s.Or(masks...)
return &s
}
// IterateBitMasks iterates all possible masks from a list of bits,
// issuing a callback on each mask.
func IterateBitMasks(bits []int, callback func(BitMask)) {
var iterate func(bits, accum []int, size int)
iterate = func(bits, accum []int, size int) {
if len(accum) == size {
mask, _ := NewBitMask(accum...)
callback(mask)
return
}
for i := range bits {
iterate(bits[i+1:], append(accum, bits[i]), size)
}
}
for i := 1; i <= len(bits); i++ {
iterate(bits, []int{}, i)
}
}
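A small usage sketch of this package (import path as vendored above), with expected outputs noted in comments:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)

func main() {
	a, _ := bitmask.NewBitMask(0, 1) // NUMA nodes 0 and 1
	b, _ := bitmask.NewBitMask(1, 2) // NUMA nodes 1 and 2

	and := bitmask.And(a, b) // intersection: node 1 only
	or := bitmask.Or(a, b)   // union: nodes 0, 1 and 2

	fmt.Println(and.GetBits(), or.GetBits()) // [1] [0 1 2]
	fmt.Println(and.IsNarrowerThan(or))      // true

	// Enumerate every non-empty mask over nodes {0, 1}: [0], [1], [0 1].
	bitmask.IterateBitMasks([]int{0, 1}, func(m bitmask.BitMask) {
		fmt.Println(m.GetBits())
	})
}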

View File

@ -0,0 +1,83 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/admission"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)
type fakeManager struct {
hint *TopologyHint
policy Policy
}
// NewFakeManager returns an instance of FakeManager
func NewFakeManager() Manager {
klog.InfoS("NewFakeManager")
return &fakeManager{}
}
// NewFakeManagerWithHint returns an instance of fake topology manager with specified topology hints
func NewFakeManagerWithHint(hint *TopologyHint) Manager {
klog.InfoS("NewFakeManagerWithHint")
return &fakeManager{
hint: hint,
policy: NewNonePolicy(),
}
}
// NewFakeManagerWithPolicy returns an instance of fake topology manager with specified policy
func NewFakeManagerWithPolicy(policy Policy) Manager {
klog.InfoS("NewFakeManagerWithPolicy")
return &fakeManager{
policy: policy,
}
}
func (m *fakeManager) GetAffinity(podUID string, containerName string) TopologyHint {
klog.InfoS("GetAffinity", "podUID", podUID, "containerName", containerName)
if m.hint == nil {
return TopologyHint{}
}
return *m.hint
}
func (m *fakeManager) GetPolicy() Policy {
return m.policy
}
func (m *fakeManager) AddHintProvider(h HintProvider) {
klog.InfoS("AddHintProvider", "hintProvider", h)
}
func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
klog.InfoS("AddContainer", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID)
}
func (m *fakeManager) RemoveContainer(containerID string) error {
klog.InfoS("RemoveContainer", "containerID", containerID)
return nil
}
func (m *fakeManager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
klog.InfoS("Topology Admit Handler")
return admission.GetPodAdmitResult(nil)
}

View File

@ -0,0 +1,109 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"fmt"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)
type NUMADistances map[int][]uint64
type NUMAInfo struct {
Nodes []int
NUMADistances NUMADistances
}
func NewNUMAInfo(topology []cadvisorapi.Node, opts PolicyOptions) (*NUMAInfo, error) {
var numaNodes []int
distances := map[int][]uint64{}
for _, node := range topology {
numaNodes = append(numaNodes, node.Id)
var nodeDistance []uint64
if opts.PreferClosestNUMA {
nodeDistance = node.Distances
if nodeDistance == nil {
return nil, fmt.Errorf("error getting NUMA distances from cadvisor")
}
}
distances[node.Id] = nodeDistance
}
numaInfo := &NUMAInfo{
Nodes: numaNodes,
NUMADistances: distances,
}
return numaInfo, nil
}
func (n *NUMAInfo) Narrowest(m1 bitmask.BitMask, m2 bitmask.BitMask) bitmask.BitMask {
if m1.IsNarrowerThan(m2) {
return m1
}
return m2
}
func (n *NUMAInfo) Closest(m1 bitmask.BitMask, m2 bitmask.BitMask) bitmask.BitMask {
// If the length of both bitmasks aren't the same, choose the one that is narrowest.
if m1.Count() != m2.Count() {
return n.Narrowest(m1, m2)
}
m1Distance := n.NUMADistances.CalculateAverageFor(m1)
m2Distance := n.NUMADistances.CalculateAverageFor(m2)
// If average distance is the same, take bitmask with more lower-number bits set.
if m1Distance == m2Distance {
if m1.IsLessThan(m2) {
return m1
}
return m2
}
// Otherwise, return the bitmask with the shortest average distance between NUMA nodes.
if m1Distance < m2Distance {
return m1
}
return m2
}
func (n NUMAInfo) DefaultAffinityMask() bitmask.BitMask {
defaultAffinity, _ := bitmask.NewBitMask(n.Nodes...)
return defaultAffinity
}
func (d NUMADistances) CalculateAverageFor(bm bitmask.BitMask) float64 {
// This should never happen, but just in case make sure we do not divide by zero.
if bm.Count() == 0 {
return 0
}
var count float64 = 0
var sum float64 = 0
for _, node1 := range bm.GetBits() {
for _, node2 := range bm.GetBits() {
sum += float64(d[node1][node2])
count++
}
}
return sum / count
}
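For example, with an assumed two-node distance matrix (10 local, 21 remote), the average for a mask spanning both nodes works out to (10+21+21+10)/4 = 15.5:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)

func main() {
	d := topologymanager.NUMADistances{
		0: {10, 21},
		1: {21, 10},
	}
	both, _ := bitmask.NewBitMask(0, 1)
	single, _ := bitmask.NewBitMask(0)
	fmt.Println(d.CalculateAverageFor(both))   // 15.5
	fmt.Println(d.CalculateAverageFor(single)) // 10
}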

View File

@ -0,0 +1,361 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)
// Policy interface for Topology Manager Pod Admit Result
type Policy interface {
// Returns Policy Name
Name() string
// Returns a merged TopologyHint based on input from hint providers
// and a Pod Admit Handler Response based on hints and policy type
Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool)
}
// IsAlignmentGuaranteed returns true if the given policy guarantees that either
// the compute resources will be allocated within a NUMA boundary, or the allocation will fail entirely.
func IsAlignmentGuaranteed(p Policy) bool {
// We are abusing the name, but atm this matches almost 1:1 the policy name
// so we are not adding new fields for now.
return p.Name() == PolicySingleNumaNode
}
// Merge a TopologyHints permutation to a single hint by performing a bitwise-AND
// of their affinity masks. The hint shall be preferred if all hints in the permutation
// are preferred.
func mergePermutation(defaultAffinity bitmask.BitMask, permutation []TopologyHint) TopologyHint {
// Get the NUMANodeAffinity from each hint in the permutation and see if any
// of them encode unpreferred allocations.
preferred := true
var numaAffinities []bitmask.BitMask
for _, hint := range permutation {
// Only consider hints that have an actual NUMANodeAffinity set.
if hint.NUMANodeAffinity != nil {
numaAffinities = append(numaAffinities, hint.NUMANodeAffinity)
// Only mark preferred if all affinities are equal.
if !hint.NUMANodeAffinity.IsEqual(numaAffinities[0]) {
preferred = false
}
}
// Only mark preferred if all affinities are preferred.
if !hint.Preferred {
preferred = false
}
}
// Merge the affinities using a bitwise-and operation.
mergedAffinity := bitmask.And(defaultAffinity, numaAffinities...)
// Build a mergedHint from the merged affinity mask, setting preferred as
// appropriate based on the logic above.
return TopologyHint{mergedAffinity, preferred}
}
func filterProvidersHints(providersHints []map[string][]TopologyHint) [][]TopologyHint {
// Loop through all hint providers and save an accumulated list of the
// hints returned by each hint provider. If no hints are provided, assume
// that provider has no preference for topology-aware allocation.
var allProviderHints [][]TopologyHint
for _, hints := range providersHints {
// If hints is empty, insert a single, preferred any-numa hint into allProviderHints.
if len(hints) == 0 {
klog.InfoS("Hint Provider has no preference for NUMA affinity with any resource")
allProviderHints = append(allProviderHints, []TopologyHint{{nil, true}})
continue
}
// Otherwise, accumulate the hints for each resource type into allProviderHints.
for resource := range hints {
if hints[resource] == nil {
klog.InfoS("Hint Provider has no preference for NUMA affinity with resource", "resource", resource)
allProviderHints = append(allProviderHints, []TopologyHint{{nil, true}})
continue
}
if len(hints[resource]) == 0 {
klog.InfoS("Hint Provider has no possible NUMA affinities for resource", "resource", resource)
allProviderHints = append(allProviderHints, []TopologyHint{{nil, false}})
continue
}
allProviderHints = append(allProviderHints, hints[resource])
}
}
return allProviderHints
}
func narrowestHint(hints []TopologyHint) *TopologyHint {
if len(hints) == 0 {
return nil
}
var narrowestHint *TopologyHint
for i := range hints {
if hints[i].NUMANodeAffinity == nil {
continue
}
if narrowestHint == nil {
narrowestHint = &hints[i]
}
if hints[i].NUMANodeAffinity.IsNarrowerThan(narrowestHint.NUMANodeAffinity) {
narrowestHint = &hints[i]
}
}
return narrowestHint
}
func maxOfMinAffinityCounts(filteredHints [][]TopologyHint) int {
maxOfMinCount := 0
for _, resourceHints := range filteredHints {
narrowestHint := narrowestHint(resourceHints)
if narrowestHint == nil {
continue
}
if narrowestHint.NUMANodeAffinity.Count() > maxOfMinCount {
maxOfMinCount = narrowestHint.NUMANodeAffinity.Count()
}
}
return maxOfMinCount
}
type HintMerger struct {
NUMAInfo *NUMAInfo
Hints [][]TopologyHint
// Set bestNonPreferredAffinityCount to help decide which affinity mask is
// preferred amongst all non-preferred hints. We calculate this value as
// the maximum of the minimum affinity counts supplied for any given hint
// provider. In other words, prefer a hint that has an affinity mask that
// includes all of the NUMA nodes from the provider that requires the most
// NUMA nodes to satisfy its allocation.
BestNonPreferredAffinityCount int
CompareNUMAAffinityMasks func(candidate *TopologyHint, current *TopologyHint) (best *TopologyHint)
}
func NewHintMerger(numaInfo *NUMAInfo, hints [][]TopologyHint, policyName string, opts PolicyOptions) HintMerger {
compareNumaAffinityMasks := func(current, candidate *TopologyHint) *TopologyHint {
// If current and candidate bitmasks are the same, prefer current hint.
if candidate.NUMANodeAffinity.IsEqual(current.NUMANodeAffinity) {
return current
}
// Otherwise compare the hints, based on the policy options provided
var best bitmask.BitMask
if (policyName != PolicySingleNumaNode) && opts.PreferClosestNUMA {
best = numaInfo.Closest(current.NUMANodeAffinity, candidate.NUMANodeAffinity)
} else {
best = numaInfo.Narrowest(current.NUMANodeAffinity, candidate.NUMANodeAffinity)
}
if best.IsEqual(current.NUMANodeAffinity) {
return current
}
return candidate
}
merger := HintMerger{
NUMAInfo: numaInfo,
Hints: hints,
BestNonPreferredAffinityCount: maxOfMinAffinityCounts(hints),
CompareNUMAAffinityMasks: compareNumaAffinityMasks,
}
return merger
}
func (m HintMerger) compare(current *TopologyHint, candidate *TopologyHint) *TopologyHint {
// Only consider candidates that result in a NUMANodeAffinity > 0 to
// replace the current bestHint.
if candidate.NUMANodeAffinity.Count() == 0 {
return current
}
// If no current bestHint is set, return the candidate as the bestHint.
if current == nil {
return candidate
}
// If the current bestHint is non-preferred and the candidate hint is
// preferred, always choose the preferred hint over the non-preferred one.
if !current.Preferred && candidate.Preferred {
return candidate
}
// If the current bestHint is preferred and the candidate hint is
// non-preferred, never update the bestHint, regardless of how
// the candidate hint's affinity mask compares to the current
// hint's affinity mask.
if current.Preferred && !candidate.Preferred {
return current
}
// If the current bestHint and the candidate hint are both preferred,
// then only consider the fitter NUMANodeAffinity.
if current.Preferred && candidate.Preferred {
return m.CompareNUMAAffinityMasks(current, candidate)
}
// The only case left is if the current best bestHint and the candidate
// hint are both non-preferred. In this case, try and find a hint whose
// affinity count is as close to (but not higher than) the
// bestNonPreferredAffinityCount as possible. To do this we need to
// consider the following cases and react accordingly:
//
// 1. current.NUMANodeAffinity.Count() > bestNonPreferredAffinityCount
// 2. current.NUMANodeAffinity.Count() == bestNonPreferredAffinityCount
// 3. current.NUMANodeAffinity.Count() < bestNonPreferredAffinityCount
//
// For case (1), the current bestHint is larger than the
// bestNonPreferredAffinityCount, so updating to fitter mergeHint
// is preferred over staying where we are.
//
// For case (2), the current bestHint is equal to the
// bestNonPreferredAffinityCount, so we would like to stick with what
// we have *unless* the candidate hint is also equal to
// bestNonPreferredAffinityCount and it is fitter.
//
// For case (3), the current bestHint is less than
// bestNonPreferredAffinityCount, so we would like to creep back up to
// bestNonPreferredAffinityCount as close as we can. There are three
// cases to consider here:
//
// 3a. candidate.NUMANodeAffinity.Count() > bestNonPreferredAffinityCount
// 3b. candidate.NUMANodeAffinity.Count() == bestNonPreferredAffinityCount
// 3c. candidate.NUMANodeAffinity.Count() < bestNonPreferredAffinityCount
//
// For case (3a), we just want to stick with the current bestHint
// because choosing a new hint that is greater than
// bestNonPreferredAffinityCount would be counter-productive.
//
// For case (3b), we want to immediately update bestHint to the
// candidate hint, making it now equal to bestNonPreferredAffinityCount.
//
// For case (3c), we know that *both* the current bestHint and the
// candidate hint are less than bestNonPreferredAffinityCount, so we
// want to choose one that brings us back up as close to
// bestNonPreferredAffinityCount as possible. There are three cases to
// consider here:
//
// 3ca. candidate.NUMANodeAffinity.Count() > current.NUMANodeAffinity.Count()
// 3cb. candidate.NUMANodeAffinity.Count() < current.NUMANodeAffinity.Count()
// 3cc. candidate.NUMANodeAffinity.Count() == current.NUMANodeAffinity.Count()
//
// For case (3ca), we want to immediately update bestHint to the
// candidate hint because that will bring us closer to the (higher)
// value of bestNonPreferredAffinityCount.
//
// For case (3cb), we want to stick with the current bestHint because
// choosing the candidate hint would strictly move us further away from
// the bestNonPreferredAffinityCount.
//
// Finally, for case (3cc), we know that the current bestHint and the
// candidate hint are equal, so we simply choose the fitter of the 2.
// Case 1
if current.NUMANodeAffinity.Count() > m.BestNonPreferredAffinityCount {
return m.CompareNUMAAffinityMasks(current, candidate)
}
// Case 2
if current.NUMANodeAffinity.Count() == m.BestNonPreferredAffinityCount {
if candidate.NUMANodeAffinity.Count() != m.BestNonPreferredAffinityCount {
return current
}
return m.CompareNUMAAffinityMasks(current, candidate)
}
// Case 3a
if candidate.NUMANodeAffinity.Count() > m.BestNonPreferredAffinityCount {
return current
}
// Case 3b
if candidate.NUMANodeAffinity.Count() == m.BestNonPreferredAffinityCount {
return candidate
}
// Case 3ca
if candidate.NUMANodeAffinity.Count() > current.NUMANodeAffinity.Count() {
return candidate
}
// Case 3cb
if candidate.NUMANodeAffinity.Count() < current.NUMANodeAffinity.Count() {
return current
}
// Case 3cc
return m.CompareNUMAAffinityMasks(current, candidate)
}
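To make the non-preferred case analysis concrete, a standalone sketch of just the count-based decisions, with the affinity counts reduced to plain ints and the mask tie-break elided:

package main

import "fmt"

// pickNonPreferred mirrors the numbered cases above for two non-preferred
// hints; "tie-break" stands in for CompareNUMAAffinityMasks.
func pickNonPreferred(current, candidate, best int) string {
	switch {
	case current > best: // case 1
		return "tie-break"
	case current == best: // case 2
		if candidate != best {
			return "current"
		}
		return "tie-break"
	case candidate > best: // case 3a
		return "current"
	case candidate == best: // case 3b
		return "candidate"
	case candidate > current: // case 3ca
		return "candidate"
	case candidate < current: // case 3cb
		return "current"
	default: // case 3cc
		return "tie-break"
	}
}

func main() {
	fmt.Println(pickNonPreferred(1, 2, 2)) // candidate (case 3b)
	fmt.Println(pickNonPreferred(1, 3, 2)) // current   (case 3a)
}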
func (m HintMerger) Merge() TopologyHint {
defaultAffinity := m.NUMAInfo.DefaultAffinityMask()
var bestHint *TopologyHint
iterateAllProviderTopologyHints(m.Hints, func(permutation []TopologyHint) {
// Get the NUMANodeAffinity from each hint in the permutation and see if any
// of them encode unpreferred allocations.
mergedHint := mergePermutation(defaultAffinity, permutation)
// Compare the current bestHint with the candidate mergedHint and
// update bestHint if appropriate.
bestHint = m.compare(bestHint, &mergedHint)
})
if bestHint == nil {
bestHint = &TopologyHint{defaultAffinity, false}
}
return *bestHint
}
// Iterate over all permutations of hints in 'allProviderHints [][]TopologyHint'.
//
// This procedure is implemented as a recursive function over the set of hints
// in 'allProviderHints[i]'. It applies the function 'callback' to each
// permutation as it is found. It is the equivalent of:
//
// for i := 0; i < len(providerHints[0]); i++
//
// for j := 0; j < len(providerHints[1]); j++
// for k := 0; k < len(providerHints[2]); k++
// ...
// for z := 0; z < len(providerHints[-1]); z++
// permutation := []TopologyHint{
// providerHints[0][i],
// providerHints[1][j],
// providerHints[2][k],
// ...
// providerHints[-1][z]
// }
// callback(permutation)
func iterateAllProviderTopologyHints(allProviderHints [][]TopologyHint, callback func([]TopologyHint)) {
// Internal helper function to accumulate the permutation before calling the callback.
var iterate func(i int, accum []TopologyHint)
iterate = func(i int, accum []TopologyHint) {
// Base case: we have looped through all providers and have a full permutation.
if i == len(allProviderHints) {
callback(accum)
return
}
// Loop through all hints for provider 'i', and recurse to build the
// permutation of this hint with all hints from providers 'i++'.
for j := range allProviderHints[i] {
iterate(i+1, append(accum, allProviderHints[i][j]))
}
}
iterate(0, []TopologyHint{})
}
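Putting the exported pieces together, a minimal merge sketch under an assumed two-node topology: one provider can be satisfied on NUMA node 0 alone or across both nodes, a second needs node 0, and the merged hint is node 0:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)

func main() {
	numaInfo := &topologymanager.NUMAInfo{Nodes: []int{0, 1}}

	node0, _ := bitmask.NewBitMask(0)
	node01, _ := bitmask.NewBitMask(0, 1)

	hints := [][]topologymanager.TopologyHint{
		{
			{NUMANodeAffinity: node0, Preferred: true},
			{NUMANodeAffinity: node01, Preferred: false},
		},
		{
			{NUMANodeAffinity: node0, Preferred: true},
		},
	}

	merger := topologymanager.NewHintMerger(numaInfo, hints,
		topologymanager.PolicyBestEffort, topologymanager.PolicyOptions{})
	best := merger.Merge()
	fmt.Println(best.NUMANodeAffinity.GetBits(), best.Preferred) // [0] true
}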

View File

@ -0,0 +1,49 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
type bestEffortPolicy struct {
// numaInfo represents the list of NUMA Nodes available on the underlying machine and the distances between them
numaInfo *NUMAInfo
opts PolicyOptions
}
var _ Policy = &bestEffortPolicy{}
// PolicyBestEffort policy name.
const PolicyBestEffort string = "best-effort"
// NewBestEffortPolicy returns best-effort policy.
func NewBestEffortPolicy(numaInfo *NUMAInfo, opts PolicyOptions) Policy {
return &bestEffortPolicy{numaInfo: numaInfo, opts: opts}
}
func (p *bestEffortPolicy) Name() string {
return PolicyBestEffort
}
func (p *bestEffortPolicy) canAdmitPodResult(hint *TopologyHint) bool {
return true
}
func (p *bestEffortPolicy) Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
filteredHints := filterProvidersHints(providersHints)
merger := NewHintMerger(p.numaInfo, filteredHints, p.Name(), p.opts)
bestHint := merger.Merge()
admit := p.canAdmitPodResult(&bestHint)
return bestHint, admit
}

View File

@ -0,0 +1,41 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
type nonePolicy struct{}
var _ Policy = &nonePolicy{}
// PolicyNone policy name.
const PolicyNone string = "none"
// NewNonePolicy returns none policy.
func NewNonePolicy() Policy {
return &nonePolicy{}
}
func (p *nonePolicy) Name() string {
return PolicyNone
}
func (p *nonePolicy) canAdmitPodResult(hint *TopologyHint) bool {
return true
}
func (p *nonePolicy) Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
return TopologyHint{}, p.canAdmitPodResult(nil)
}

View File

@ -0,0 +1,105 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"fmt"
"strconv"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
kubefeatures "k8s.io/kubernetes/pkg/features"
)
const (
PreferClosestNUMANodes string = "prefer-closest-numa-nodes"
MaxAllowableNUMANodes string = "max-allowable-numa-nodes"
)
var (
alphaOptions = sets.New[string]()
betaOptions = sets.New[string](
MaxAllowableNUMANodes,
)
stableOptions = sets.New[string](
PreferClosestNUMANodes,
)
)
func CheckPolicyOptionAvailable(option string) error {
if !alphaOptions.Has(option) && !betaOptions.Has(option) && !stableOptions.Has(option) {
return fmt.Errorf("unknown Topology Manager Policy option: %q", option)
}
if alphaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManagerPolicyAlphaOptions) {
return fmt.Errorf("Topology Manager Policy Alpha-level Options not enabled, but option %q provided", option)
}
if betaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManagerPolicyBetaOptions) {
return fmt.Errorf("Topology Manager Policy Beta-level Options not enabled, but option %q provided", option)
}
return nil
}
type PolicyOptions struct {
PreferClosestNUMA bool
MaxAllowableNUMANodes int
}
func NewPolicyOptions(policyOptions map[string]string) (PolicyOptions, error) {
opts := PolicyOptions{
// Set MaxAllowableNUMANodes to the default. This will be overwritten
// if the user has specified a policy option for MaxAllowableNUMANodes.
MaxAllowableNUMANodes: defaultMaxAllowableNUMANodes,
}
for name, value := range policyOptions {
if err := CheckPolicyOptionAvailable(name); err != nil {
return opts, err
}
switch name {
case PreferClosestNUMANodes:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.PreferClosestNUMA = optValue
case MaxAllowableNUMANodes:
optValue, err := strconv.Atoi(value)
if err != nil {
return opts, fmt.Errorf("unable to convert policy option to integer %q: %w", name, err)
}
if optValue < defaultMaxAllowableNUMANodes {
return opts, fmt.Errorf("the minimum value of %q should not be less than %v", name, defaultMaxAllowableNUMANodes)
}
if optValue > defaultMaxAllowableNUMANodes {
klog.InfoS("WARNING: the value of max-allowable-numa-nodes is more than the default recommended value", "max-allowable-numa-nodes", optValue, "defaultMaxAllowableNUMANodes", defaultMaxAllowableNUMANodes)
}
opts.MaxAllowableNUMANodes = optValue
default:
// this should never be reached, we already detect unknown options,
// but we keep it as further safety.
return opts, fmt.Errorf("unsupported topologymanager option: %q (%s)", name, value)
}
}
return opts, nil
}
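A minimal usage sketch; prefer-closest-numa-nodes is a stable option, so no feature gate needs to be enabled for it:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)

func main() {
	opts, err := topologymanager.NewPolicyOptions(map[string]string{
		"prefer-closest-numa-nodes": "true",
	})
	if err != nil {
		panic(err)
	}
	// MaxAllowableNUMANodes keeps its default of 8 when not overridden.
	fmt.Println(opts.PreferClosestNUMA, opts.MaxAllowableNUMANodes) // true 8
}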

View File

@ -0,0 +1,47 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
type restrictedPolicy struct {
bestEffortPolicy
}
var _ Policy = &restrictedPolicy{}
// PolicyRestricted policy name.
const PolicyRestricted string = "restricted"
// NewRestrictedPolicy returns restricted policy.
func NewRestrictedPolicy(numaInfo *NUMAInfo, opts PolicyOptions) Policy {
return &restrictedPolicy{bestEffortPolicy{numaInfo: numaInfo, opts: opts}}
}
func (p *restrictedPolicy) Name() string {
return PolicyRestricted
}
func (p *restrictedPolicy) canAdmitPodResult(hint *TopologyHint) bool {
return hint.Preferred
}
func (p *restrictedPolicy) Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
filteredHints := filterProvidersHints(providersHints)
merger := NewHintMerger(p.numaInfo, filteredHints, p.Name(), p.opts)
bestHint := merger.Merge()
admit := p.canAdmitPodResult(&bestHint)
return bestHint, admit
}

View File

@ -0,0 +1,75 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
type singleNumaNodePolicy struct {
// numaInfo represents the list of NUMA Nodes available on the underlying machine and the distances between them
numaInfo *NUMAInfo
opts PolicyOptions
}
var _ Policy = &singleNumaNodePolicy{}
// PolicySingleNumaNode policy name.
const PolicySingleNumaNode string = "single-numa-node"
// NewSingleNumaNodePolicy returns single-numa-node policy.
func NewSingleNumaNodePolicy(numaInfo *NUMAInfo, opts PolicyOptions) Policy {
return &singleNumaNodePolicy{numaInfo: numaInfo, opts: opts}
}
func (p *singleNumaNodePolicy) Name() string {
return PolicySingleNumaNode
}
func (p *singleNumaNodePolicy) canAdmitPodResult(hint *TopologyHint) bool {
return hint.Preferred
}
// Return hints that have valid bitmasks with exactly one bit set.
func filterSingleNumaHints(allResourcesHints [][]TopologyHint) [][]TopologyHint {
var filteredResourcesHints [][]TopologyHint
for _, oneResourceHints := range allResourcesHints {
var filtered []TopologyHint
for _, hint := range oneResourceHints {
if hint.NUMANodeAffinity == nil && hint.Preferred {
filtered = append(filtered, hint)
}
if hint.NUMANodeAffinity != nil && hint.NUMANodeAffinity.Count() == 1 && hint.Preferred {
filtered = append(filtered, hint)
}
}
filteredResourcesHints = append(filteredResourcesHints, filtered)
}
return filteredResourcesHints
}
func (p *singleNumaNodePolicy) Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
filteredHints := filterProvidersHints(providersHints)
// Filter to only include don't cares and hints with a single NUMA node.
singleNumaHints := filterSingleNumaHints(filteredHints)
merger := NewHintMerger(p.numaInfo, singleNumaHints, p.Name(), p.opts)
bestHint := merger.Merge()
if bestHint.NUMANodeAffinity.IsEqual(p.numaInfo.DefaultAffinityMask()) {
bestHint = TopologyHint{nil, bestHint.Preferred}
}
admit := p.canAdmitPodResult(&bestHint)
return bestHint, admit
}
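A sketch of this filtering in action, using a hypothetical resource name: the two-node hint is dropped, and only the preferred single-node hint survives the merge:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)

func main() {
	numaInfo := &topologymanager.NUMAInfo{Nodes: []int{0, 1}}
	policy := topologymanager.NewSingleNumaNodePolicy(numaInfo, topologymanager.PolicyOptions{})

	node0, _ := bitmask.NewBitMask(0)
	node01, _ := bitmask.NewBitMask(0, 1)

	providersHints := []map[string][]topologymanager.TopologyHint{{
		"example.com/gpu": { // hypothetical resource
			{NUMANodeAffinity: node0, Preferred: true},
			{NUMANodeAffinity: node01, Preferred: false},
		},
	}}
	hint, admit := policy.Merge(providersHints)
	fmt.Println(hint.NUMANodeAffinity.GetBits(), admit) // [0] true
}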

View File

@ -0,0 +1,158 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"sync"
"k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/admission"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)
const (
// containerTopologyScope specifies the TopologyManagerScope per container.
containerTopologyScope = "container"
// podTopologyScope specifies the TopologyManagerScope per pod.
podTopologyScope = "pod"
// noneTopologyScope specifies the TopologyManagerScope when topologyPolicyName is none.
noneTopologyScope = "none"
)
type podTopologyHints map[string]map[string]TopologyHint
// Scope interface for Topology Manager
type Scope interface {
Name() string
GetPolicy() Policy
Admit(pod *v1.Pod) lifecycle.PodAdmitResult
// AddHintProvider adds a hint provider to the manager to indicate the hint provider
// wants to be consulted when making topology hints
AddHintProvider(h HintProvider)
// AddContainer adds pod to Manager for tracking
AddContainer(pod *v1.Pod, container *v1.Container, containerID string)
// RemoveContainer removes pod from Manager tracking
RemoveContainer(containerID string) error
// Store is the interface for storing pod topology hints
Store
}
type scope struct {
mutex sync.Mutex
name string
// Mapping of each Pod's Containers to their TopologyHints,
// indexed by PodUID, then ContainerName
podTopologyHints podTopologyHints
// The list of components registered with the Manager
hintProviders []HintProvider
// Topology Manager Policy
policy Policy
// Mapping of (PodUid, ContainerName) to ContainerID for Adding/Removing Pods from PodTopologyHints mapping
podMap containermap.ContainerMap
}
func (s *scope) Name() string {
return s.name
}
func (s *scope) getTopologyHints(podUID string, containerName string) TopologyHint {
s.mutex.Lock()
defer s.mutex.Unlock()
return s.podTopologyHints[podUID][containerName]
}
func (s *scope) setTopologyHints(podUID string, containerName string, th TopologyHint) {
s.mutex.Lock()
defer s.mutex.Unlock()
if s.podTopologyHints[podUID] == nil {
s.podTopologyHints[podUID] = make(map[string]TopologyHint)
}
s.podTopologyHints[podUID][containerName] = th
}
func (s *scope) GetAffinity(podUID string, containerName string) TopologyHint {
return s.getTopologyHints(podUID, containerName)
}
func (s *scope) GetPolicy() Policy {
return s.policy
}
func (s *scope) AddHintProvider(h HintProvider) {
s.hintProviders = append(s.hintProviders, h)
}
// It would be better to implement this function in topologymanager instead of scope
// but topologymanager does not track the mapping anymore
func (s *scope) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
s.mutex.Lock()
defer s.mutex.Unlock()
s.podMap.Add(string(pod.UID), container.Name, containerID)
}
// It would be better to implement this function in topologymanager instead of scope
// but topologymanager does not track the mapping anymore
func (s *scope) RemoveContainer(containerID string) error {
s.mutex.Lock()
defer s.mutex.Unlock()
klog.InfoS("RemoveContainer", "containerID", containerID)
// Get the podUID and containerName associated with the containerID to be removed and remove it
podUIDString, containerName, err := s.podMap.GetContainerRef(containerID)
if err != nil {
return nil
}
s.podMap.RemoveByContainerID(containerID)
// In cases where a container has been restarted, it's possible that the same podUID and
// containerName are already associated with a *different* containerID now. Only remove
// the TopologyHints associated with that podUID and containerName if this is not the case.
if _, err := s.podMap.GetContainerID(podUIDString, containerName); err != nil {
delete(s.podTopologyHints[podUIDString], containerName)
if len(s.podTopologyHints[podUIDString]) == 0 {
delete(s.podTopologyHints, podUIDString)
}
}
return nil
}
func (s *scope) admitPolicyNone(pod *v1.Pod) lifecycle.PodAdmitResult {
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
err := s.allocateAlignedResources(pod, &container)
if err != nil {
return admission.GetPodAdmitResult(err)
}
}
return admission.GetPodAdmitResult(nil)
}
// It would be better to implement this function in topologymanager instead of scope
// but topologymanager does not track providers anymore
func (s *scope) allocateAlignedResources(pod *v1.Pod, container *v1.Container) error {
for _, provider := range s.hintProviders {
err := provider.Allocate(pod, container)
if err != nil {
return err
}
}
return nil
}
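A standalone sketch of the containermap behavior that the restart handling above relies on, with hypothetical pod and container IDs:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
)

func main() {
	cm := containermap.NewContainerMap()
	cm.Add("pod-uid-1", "app", "containerd://aaa") // original container (assumed IDs)
	cm.Add("pod-uid-1", "app", "containerd://bbb") // restarted container, same pod/name

	// Removing the stale ID leaves (pod, container) mapped to the new ID,
	// which is why RemoveContainer keeps the TopologyHints in that case.
	cm.RemoveByContainerID("containerd://aaa")
	id, err := cm.GetContainerID("pod-uid-1", "app")
	fmt.Println(id, err) // containerd://bbb <nil>
}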

View File

@ -0,0 +1,89 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/admission"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
type containerScope struct {
scope
}
// Ensure containerScope implements Scope interface
var _ Scope = &containerScope{}
// NewContainerScope returns a container scope.
func NewContainerScope(policy Policy) Scope {
return &containerScope{
scope{
name: containerTopologyScope,
podTopologyHints: podTopologyHints{},
policy: policy,
podMap: containermap.NewContainerMap(),
},
}
}
func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
bestHint, admit := s.calculateAffinity(pod, &container)
klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
if !admit {
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
return admission.GetPodAdmitResult(&TopologyAffinityError{})
}
klog.InfoS("Topology Affinity", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
s.setTopologyHints(string(pod.UID), container.Name, bestHint)
err := s.allocateAlignedResources(pod, &container)
if err != nil {
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
return admission.GetPodAdmitResult(err)
}
if IsAlignmentGuaranteed(s.policy) {
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
}
}
return admission.GetPodAdmitResult(nil)
}
func (s *containerScope) accumulateProvidersHints(pod *v1.Pod, container *v1.Container) []map[string][]TopologyHint {
var providersHints []map[string][]TopologyHint
for _, provider := range s.hintProviders {
// Get the TopologyHints for a Container from a provider.
hints := provider.GetTopologyHints(pod, container)
providersHints = append(providersHints, hints)
klog.InfoS("TopologyHints", "hints", hints, "pod", klog.KObj(pod), "containerName", container.Name)
}
return providersHints
}
func (s *containerScope) calculateAffinity(pod *v1.Pod, container *v1.Container) (TopologyHint, bool) {
providersHints := s.accumulateProvidersHints(pod, container)
bestHint, admit := s.policy.Merge(providersHints)
klog.InfoS("ContainerTopologyHint", "bestHint", bestHint)
return bestHint, admit
}

View File

@ -0,0 +1,46 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)
type noneScope struct {
scope
}
// Ensure noneScope implements Scope interface
var _ Scope = &noneScope{}
// NewNoneScope returns a none scope.
func NewNoneScope() Scope {
return &noneScope{
scope{
name: noneTopologyScope,
podTopologyHints: podTopologyHints{},
policy: NewNonePolicy(),
podMap: containermap.NewContainerMap(),
},
}
}
func (s *noneScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
return s.admitPolicyNone(pod)
}

View File

@ -0,0 +1,89 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/admission"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
type podScope struct {
scope
}
// Ensure podScope implements Scope interface
var _ Scope = &podScope{}
// NewPodScope returns a pod scope.
func NewPodScope(policy Policy) Scope {
return &podScope{
scope{
name: podTopologyScope,
podTopologyHints: podTopologyHints{},
policy: policy,
podMap: containermap.NewContainerMap(),
},
}
}
func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
bestHint, admit := s.calculateAffinity(pod)
klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
if !admit {
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
return admission.GetPodAdmitResult(&TopologyAffinityError{})
}
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
klog.InfoS("Topology Affinity", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
s.setTopologyHints(string(pod.UID), container.Name, bestHint)
err := s.allocateAlignedResources(pod, &container)
if err != nil {
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
return admission.GetPodAdmitResult(err)
}
}
if IsAlignmentGuaranteed(s.policy) {
// increment only if we know we allocate aligned resources.
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
}
return admission.GetPodAdmitResult(nil)
}
func (s *podScope) accumulateProvidersHints(pod *v1.Pod) []map[string][]TopologyHint {
var providersHints []map[string][]TopologyHint
for _, provider := range s.hintProviders {
// Get the TopologyHints for a Pod from a provider.
hints := provider.GetPodTopologyHints(pod)
providersHints = append(providersHints, hints)
klog.InfoS("TopologyHints", "hints", hints, "pod", klog.KObj(pod))
}
return providersHints
}
func (s *podScope) calculateAffinity(pod *v1.Pod) (TopologyHint, bool) {
providersHints := s.accumulateProvidersHints(pod)
bestHint, admit := s.policy.Merge(providersHints)
klog.InfoS("PodTopologyHint", "bestHint", bestHint)
return bestHint, admit
}

View File

@ -0,0 +1,222 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topologymanager
import (
"fmt"
"time"
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
const (
// defaultMaxAllowableNUMANodes specifies the maximum number of NUMA Nodes that
// the TopologyManager supports on the underlying machine.
//
// At present, having more than this number of NUMA Nodes will result in a
// state explosion when trying to enumerate possible NUMAAffinity masks and
// generate hints for them. As such, if more NUMA Nodes than this are
// present on a machine and the TopologyManager is enabled, an error will
// be returned and the TopologyManager will not be loaded.
defaultMaxAllowableNUMANodes = 8
// ErrorTopologyAffinity represents the type for a TopologyAffinityError
ErrorTopologyAffinity = "TopologyAffinityError"
)
// TopologyAffinityError represents a resource alignment error
type TopologyAffinityError struct{}
func (e TopologyAffinityError) Error() string {
return "Resources cannot be allocated with Topology locality"
}
func (e TopologyAffinityError) Type() string {
return ErrorTopologyAffinity
}
// Manager interface provides methods for Kubelet to manage pod topology hints
type Manager interface {
// PodAdmitHandler is implemented by Manager
lifecycle.PodAdmitHandler
// AddHintProvider adds a hint provider to the manager to indicate the hint provider
// wants to be consulted when making topology hints
AddHintProvider(HintProvider)
// AddContainer adds pod to Manager for tracking
AddContainer(pod *v1.Pod, container *v1.Container, containerID string)
// RemoveContainer removes pod from Manager tracking
RemoveContainer(containerID string) error
// Store is the interface for storing pod topology hints
Store
}
type manager struct {
// Topology Manager Scope
scope Scope
}
// HintProvider is an interface for components that want to collaborate to
// achieve globally optimal concrete resource alignment with respect to
// NUMA locality.
type HintProvider interface {
// GetTopologyHints returns a map of resource names to a list of possible
// concrete resource allocations in terms of NUMA locality hints. Each hint
// is optionally marked "preferred" and indicates the set of NUMA nodes
// involved in the hypothetical allocation. The topology manager calls
// this function for each hint provider, and merges the hints to produce
// a consensus "best" hint. The hint providers may subsequently query the
// topology manager to influence actual resource assignment.
GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]TopologyHint
// GetPodTopologyHints returns a map of resource names to a list of possible
// concrete resource allocations per Pod in terms of NUMA locality hints.
GetPodTopologyHints(pod *v1.Pod) map[string][]TopologyHint
// Allocate triggers resource allocation to occur on the HintProvider after
// all hints have been gathered and the aggregated Hint is available via a
// call to Store.GetAffinity().
Allocate(pod *v1.Pod, container *v1.Container) error
}
// Store interface is to allow Hint Providers to retrieve pod affinity
type Store interface {
GetAffinity(podUID string, containerName string) TopologyHint
GetPolicy() Policy
}
// TopologyHint is a struct containing the NUMANodeAffinity for a Container
type TopologyHint struct {
NUMANodeAffinity bitmask.BitMask
// Preferred is set to true when the NUMANodeAffinity encodes a preferred
// allocation for the Container. It is set to false otherwise.
Preferred bool
}
// IsEqual checks if TopologyHint are equal
func (th *TopologyHint) IsEqual(topologyHint TopologyHint) bool {
if th.Preferred == topologyHint.Preferred {
if th.NUMANodeAffinity == nil || topologyHint.NUMANodeAffinity == nil {
return th.NUMANodeAffinity == topologyHint.NUMANodeAffinity
}
return th.NUMANodeAffinity.IsEqual(topologyHint.NUMANodeAffinity)
}
return false
}
// LessThan checks if TopologyHint `a` is less than TopologyHint `b`
// this means that either `a` is a preferred hint and `b` is not
// or `a`'s NUMANodeAffinity attribute is narrower than `b`'s NUMANodeAffinity attribute.
func (th *TopologyHint) LessThan(other TopologyHint) bool {
if th.Preferred != other.Preferred {
return th.Preferred
}
return th.NUMANodeAffinity.IsNarrowerThan(other.NUMANodeAffinity)
}
var _ Manager = &manager{}
// NewManager creates a new TopologyManager based on provided policy and scope
func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topologyScopeName string, topologyPolicyOptions map[string]string) (Manager, error) {
// When policy is none, the scope is not relevant, so we can short circuit here.
if topologyPolicyName == PolicyNone {
klog.InfoS("Creating topology manager with none policy")
return &manager{scope: NewNoneScope()}, nil
}
opts, err := NewPolicyOptions(topologyPolicyOptions)
if err != nil {
return nil, err
}
klog.InfoS("Creating topology manager with policy per scope", "topologyPolicyName", topologyPolicyName, "topologyScopeName", topologyScopeName, "topologyPolicyOptions", opts)
numaInfo, err := NewNUMAInfo(topology, opts)
if err != nil {
return nil, fmt.Errorf("cannot discover NUMA topology: %w", err)
}
if topologyPolicyName != PolicyNone && len(numaInfo.Nodes) > opts.MaxAllowableNUMANodes {
return nil, fmt.Errorf("unsupported on machines with more than %v NUMA Nodes", opts.MaxAllowableNUMANodes)
}
var policy Policy
switch topologyPolicyName {
case PolicyBestEffort:
policy = NewBestEffortPolicy(numaInfo, opts)
case PolicyRestricted:
policy = NewRestrictedPolicy(numaInfo, opts)
case PolicySingleNumaNode:
policy = NewSingleNumaNodePolicy(numaInfo, opts)
default:
return nil, fmt.Errorf("unknown policy: \"%s\"", topologyPolicyName)
}
var scope Scope
switch topologyScopeName {
case containerTopologyScope:
scope = NewContainerScope(policy)
case podTopologyScope:
scope = NewPodScope(policy)
default:
return nil, fmt.Errorf("unknown scope: \"%s\"", topologyScopeName)
}
manager := &manager{
scope: scope,
}
return manager, nil
}
func (m *manager) GetAffinity(podUID string, containerName string) TopologyHint {
return m.scope.GetAffinity(podUID, containerName)
}
func (m *manager) GetPolicy() Policy {
return m.scope.GetPolicy()
}
func (m *manager) AddHintProvider(h HintProvider) {
m.scope.AddHintProvider(h)
}
func (m *manager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
m.scope.AddContainer(pod, container, containerID)
}
func (m *manager) RemoveContainer(containerID string) error {
return m.scope.RemoveContainer(containerID)
}
func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
metrics.TopologyManagerAdmissionRequestsTotal.Inc()
startTime := time.Now()
podAdmitResult := m.scope.Admit(attrs.Pod)
metrics.TopologyManagerAdmissionDuration.Observe(float64(time.Since(startTime).Milliseconds()))
return podAdmitResult
}
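A minimal sketch of constructing a Manager from an assumed two-node cadvisor topology:

package main

import (
	"fmt"

	cadvisorapi "github.com/google/cadvisor/info/v1"

	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)

func main() {
	topology := []cadvisorapi.Node{{Id: 0}, {Id: 1}} // assumed NUMA nodes
	mgr, err := topologymanager.NewManager(topology, "best-effort", "container", nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(mgr.GetPolicy().Name()) // best-effort
}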

140
e2e/vendor/k8s.io/kubernetes/pkg/kubelet/cm/types.go generated vendored Normal file
View File

@ -0,0 +1,140 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/utils/cpuset"
)

// ResourceConfig holds information about all the supported cgroup resource parameters.
type ResourceConfig struct {
	// Memory limit (in bytes).
	Memory *int64
	// CPU set (the CPUs the cgroup has access to).
	CPUSet cpuset.CPUSet
	// CPU shares (relative weight vs. other containers).
	CPUShares *uint64
	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
	CPUQuota *int64
	// CPU quota period.
	CPUPeriod *uint64
	// HugePageLimit map from page size (in bytes) to limit (in bytes).
	HugePageLimit map[int64]int64
	// Maximum number of pids.
	PidsLimit *int64
	// Unified holds cgroup v2 settings.
	Unified map[string]string
}

// CgroupName is the abstract name of a cgroup prior to any driver specific conversion.
// It is specified as a list of strings from its individual components, such as:
// {"kubepods", "burstable", "pod1234-abcd-5678-efgh"}
type CgroupName []string

// CgroupConfig holds the cgroup configuration information.
// This is a common object used to specify cgroup information to both the
// systemd and raw cgroupfs implementations of the CgroupManager interface.
type CgroupConfig struct {
	// Fully qualified name prior to any driver specific conversions.
	Name CgroupName
	// ResourceParameters contains various cgroups settings to apply.
	ResourceParameters *ResourceConfig
}
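
// newBurstablePodConfigSketch is an illustrative helper, not part of the
// vendored file: it fills in a CgroupConfig for a hypothetical pod-level
// cgroup. All concrete values are assumptions for illustration.
func newBurstablePodConfigSketch() *CgroupConfig {
	memLimit := int64(512 * 1024 * 1024) // 512Mi hard memory limit
	cpuShares := uint64(256)             // relative CPU weight
	return &CgroupConfig{
		Name: CgroupName{"kubepods", "burstable", "pod1234-abcd-5678-efgh"},
		ResourceParameters: &ResourceConfig{
			Memory:    &memLimit,
			CPUShares: &cpuShares,
			// On cgroup v2 hosts, additional settings could be passed through
			// Unified, e.g. map[string]string{"memory.high": "402653184"}.
		},
	}
}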
// CgroupManager allows for cgroup management.
// Supports cgroup creation, deletion and updates.
type CgroupManager interface {
	// Create creates and applies the cgroup configurations on the cgroup.
	// It just creates the leaf cgroups.
	// It expects the parent cgroup to already exist.
	Create(*CgroupConfig) error
	// Destroy the cgroup.
	Destroy(*CgroupConfig) error
	// Update cgroup configuration.
	Update(*CgroupConfig) error
	// Validate checks if the cgroup is valid.
	Validate(name CgroupName) error
	// Exists checks if the cgroup already exists.
	Exists(name CgroupName) bool
	// Name returns the literal cgroupfs name on the host after any driver specific conversions.
	// We would expect systemd implementation to make appropriate name conversion.
	// For example, if we pass {"foo", "bar"}
	// then systemd should convert the name to something like
	// foo.slice/foo-bar.slice
	Name(name CgroupName) string
	// CgroupName converts the literal cgroupfs name on the host to an internal identifier.
	CgroupName(name string) CgroupName
	// Pids scans through all subsystems to find pids associated with specified cgroup.
	Pids(name CgroupName) []int
	// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
	ReduceCPULimits(cgroupName CgroupName) error
	// MemoryUsage returns current memory usage of the specified cgroup, as read from the cgroupfs.
	MemoryUsage(name CgroupName) (int64, error)
	// GetCgroupConfig gets the resource config values applied to the cgroup for the specified resource type.
	GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error)
	// SetCgroupConfig sets the resource config for the specified resource type on the cgroup.
	SetCgroupConfig(name CgroupName, resourceConfig *ResourceConfig) error
	// Version returns the version of the cgroup implementation on the host.
	Version() int
}
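
// toSystemdNameSketch is an illustrative, simplified version of the
// conversion the Name method documents above; it is not the actual systemd
// driver code. Each component becomes a nested ".slice" unit whose unit name
// accumulates its ancestors joined by "-", so CgroupName{"foo", "bar"} yields
// "foo.slice/foo-bar.slice".
func toSystemdNameSketch(cg CgroupName) string {
	name, path := "", ""
	for _, component := range cg {
		if name != "" {
			name += "-"
		}
		name += component
		if path != "" {
			path += "/"
		}
		path += name + ".slice"
	}
	return path
}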
// QOSContainersInfo stores the names of containers per qos
type QOSContainersInfo struct {
	Guaranteed CgroupName
	BestEffort CgroupName
	Burstable  CgroupName
}

// PodContainerManager stores and manages pod level containers
// The Pod workers interact with the PodContainerManager to create and destroy
// containers for the pod.
type PodContainerManager interface {
	// GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
	GetPodContainerName(*v1.Pod) (CgroupName, string)

	// EnsureExists takes a pod as argument and makes sure that
	// pod cgroup exists if qos cgroup hierarchy flag is enabled.
	// If the pod cgroup doesn't already exist this method creates it.
	EnsureExists(*v1.Pod) error

	// Exists returns true if the pod cgroup exists.
	Exists(*v1.Pod) bool

	// Destroy takes a pod Cgroup name as argument and destroys the pod's container.
	Destroy(name CgroupName) error

	// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
	ReduceCPULimits(name CgroupName) error

	// GetAllPodsFromCgroups enumerates the set of pod uids to their associated cgroup based on state of cgroupfs system.
	GetAllPodsFromCgroups() (map[types.UID]CgroupName, error)

	// IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod.
	IsPodCgroup(cgroupfs string) (bool, types.UID)

	// GetPodCgroupMemoryUsage gets the value of memory usage for the pod Cgroup.
	GetPodCgroupMemoryUsage(pod *v1.Pod) (uint64, error)

	// GetPodCgroupConfig gets the resource config values applied to the pod cgroup for the specified resource type.
	GetPodCgroupConfig(pod *v1.Pod, resource v1.ResourceName) (*ResourceConfig, error)

	// SetPodCgroupConfig sets resource config values for the specified resource type on the pod cgroup.
	SetPodCgroupConfig(pod *v1.Pod, resourceConfig *ResourceConfig) error
}
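
// syncPodCgroupSketch is an illustrative helper, not part of the vendored
// file: it shows one way a caller could drive the pod cgroup lifecycle
// through PodContainerManager. The flow is an assumption; error handling in
// real kubelet code is more involved.
func syncPodCgroupSketch(pcm PodContainerManager, pod *v1.Pod, podTerminated bool) error {
	if podTerminated {
		// Resolve the pod's cgroup name and tear the cgroup down.
		name, _ := pcm.GetPodContainerName(pod)
		return pcm.Destroy(name)
	}
	// Create the pod-level cgroup if it is missing.
	if !pcm.Exists(pod) {
		return pcm.EnsureExists(pod)
	}
	return nil
}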

94
e2e/vendor/k8s.io/kubernetes/pkg/kubelet/cm/util/cgroups_linux.go generated vendored Normal file
View File

@ -0,0 +1,94 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package util

import (
	"path/filepath"

	libcontainerutils "k8s.io/kubernetes/third_party/forked/libcontainer/utils"

	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
)

const (
	// CgroupRoot is the base path where cgroups are mounted
	CgroupRoot = "/sys/fs/cgroup"
)

// GetPids gets pids of the desired cgroup
// Forked from opencontainers/runc/libcontainer/cgroup/fs.Manager.GetPids()
func GetPids(cgroupPath string) ([]int, error) {
	dir := ""

	if libcontainercgroups.IsCgroup2UnifiedMode() {
		path, err := filepath.Rel("/", cgroupPath)
		if err != nil {
			return nil, err
		}
		dir = filepath.Join(CgroupRoot, path)
	} else {
		var err error
		dir, err = getCgroupV1Path(cgroupPath)
		if err != nil {
			return nil, err
		}
	}
	return libcontainercgroups.GetPids(dir)
}
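
// unifiedCgroupDirSketch is an illustrative helper, not part of the vendored
// file: it restates the cgroup v2 branch above, which maps an absolute cgroup
// path directly under CgroupRoot, e.g. "/kubepods.slice" becomes
// "/sys/fs/cgroup/kubepods.slice".
func unifiedCgroupDirSketch(cgroupPath string) (string, error) {
	rel, err := filepath.Rel("/", cgroupPath)
	if err != nil {
		return "", err
	}
	return filepath.Join(CgroupRoot, rel), nil
}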
// getCgroupV1Path gets the file path to the "devices" subsystem of the desired cgroup.
// cgroupPath is the path in the cgroup hierarchy.
func getCgroupV1Path(cgroupPath string) (string, error) {
	cgroupPath = libcontainerutils.CleanPath(cgroupPath)

	mnt, root, err := libcontainercgroups.FindCgroupMountpointAndRoot(cgroupPath, "devices")
	// If the subsystem is not mounted, there is no point in making the path.
	if err != nil {
		return "", err
	}

	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
	if filepath.IsAbs(cgroupPath) {
		// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
		return filepath.Join(root, mnt, cgroupPath), nil
	}

	parentPath, err := getCgroupV1ParentPath(mnt, root)
	if err != nil {
		return "", err
	}

	return filepath.Join(parentPath, cgroupPath), nil
}

// getCgroupV1ParentPath gets the parent filepath to this cgroup, for resolving relative cgroup paths.
func getCgroupV1ParentPath(mountpoint, root string) (string, error) {
	// Use GetOwnCgroup instead of GetInitCgroup, because the creating
	// process could be in a container that shares a pid namespace with the
	// host, and /proc/1/cgroup could point to a whole other world of cgroups.
	initPath, err := libcontainercgroups.GetOwnCgroup("devices")
	if err != nil {
		return "", err
	}
	// This is needed for nested containers, because in /proc/self/cgroup we
	// see paths from the host, which don't exist in the container.
	relDir, err := filepath.Rel(root, initPath)
	if err != nil {
		return "", err
	}
	return filepath.Join(mountpoint, relDir), nil
}
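
// Worked example with assumed values, not part of the vendored file: with the
// "devices" hierarchy mounted at "/sys/fs/cgroup/devices", a root of "/" and
// an own-cgroup path of "/system.slice/kubelet.service", filepath.Rel yields
// "system.slice/kubelet.service" and the parent path resolves to
// "/sys/fs/cgroup/devices/system.slice/kubelet.service".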

25
e2e/vendor/k8s.io/kubernetes/pkg/kubelet/cm/util/cgroups_unsupported.go generated vendored Normal file
View File

@ -0,0 +1,25 @@
//go:build !linux
// +build !linux

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package util

// GetPids gets pids of the desired cgroup
func GetPids(cgroupPath string) ([]int, error) {
	return nil, nil
}