rebase: update K8s packages to v0.32.1

Update K8s packages in go.mod to v0.32.1

Signed-off-by: Praveen M <m.praveen@ibm.com>
This commit is contained in:
Praveen M
2025-01-16 09:41:46 +05:30
committed by mergify[bot]
parent 5aef21ea4e
commit 7eb99fc6c9
2442 changed files with 273386 additions and 47788 deletions

View File

@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers: []
reviewers:
- klueska
emeritus_approvers:
- vishh
- jiayingz

View File

@ -0,0 +1,109 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checkpoint
import (
"encoding/json"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)
// DeviceManagerCheckpoint defines the operations to retrieve pod devices
type DeviceManagerCheckpoint interface {
checkpointmanager.Checkpoint
GetData() ([]PodDevicesEntry, map[string][]string)
}
// DevicesPerNUMA represents device ids obtained from device plugin per NUMA node id
type DevicesPerNUMA map[int64][]string
// PodDevicesEntry connects pod information to devices
type PodDevicesEntry struct {
PodUID string
ContainerName string
ResourceName string
DeviceIDs DevicesPerNUMA
AllocResp []byte
}
// checkpointData struct is used to store pod to device allocation information
// in a checkpoint file.
// TODO: add version control when we need to change checkpoint format.
type checkpointData struct {
PodDeviceEntries []PodDevicesEntry
RegisteredDevices map[string][]string
}
// Data holds checkpoint data and its checksum
type Data struct {
Data checkpointData
Checksum checksum.Checksum
}
// NewDevicesPerNUMA is a function that creates DevicesPerNUMA map
func NewDevicesPerNUMA() DevicesPerNUMA {
return make(DevicesPerNUMA)
}
// Devices is a function that returns all device ids for all NUMA nodes
// and represent it as sets.Set[string]
func (dev DevicesPerNUMA) Devices() sets.Set[string] {
result := sets.New[string]()
for _, devs := range dev {
result.Insert(devs...)
}
return result
}
// New returns an instance of Checkpoint - must be an alias for the most recent version
func New(devEntries []PodDevicesEntry, devices map[string][]string) DeviceManagerCheckpoint {
return newV2(devEntries, devices)
}
func newV2(devEntries []PodDevicesEntry, devices map[string][]string) DeviceManagerCheckpoint {
return &Data{
Data: checkpointData{
PodDeviceEntries: devEntries,
RegisteredDevices: devices,
},
}
}
// MarshalCheckpoint returns marshalled data
func (cp *Data) MarshalCheckpoint() ([]byte, error) {
cp.Checksum = checksum.New(cp.Data)
return json.Marshal(*cp)
}
// UnmarshalCheckpoint returns unmarshalled data
func (cp *Data) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// VerifyChecksum verifies that passed checksum is same as calculated checksum
func (cp *Data) VerifyChecksum() error {
return cp.Checksum.Verify(cp.Data)
}
// GetData returns device entries and registered devices in the *most recent*
// checkpoint format, *not* in the original format stored on disk.
func (cp *Data) GetData() ([]PodDevicesEntry, map[string][]string) {
return cp.Data.PodDeviceEntries, cp.Data.RegisteredDevices
}

View File

@ -0,0 +1,123 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"context"
"fmt"
"sync"
"time"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
)
// endpoint maps to a single registered device plugin. It is responsible
// for managing gRPC communications with the device plugin and caching
// device states reported by the device plugin.
type endpoint interface {
getPreferredAllocation(available, mustInclude []string, size int) (*pluginapi.PreferredAllocationResponse, error)
allocate(devs []string) (*pluginapi.AllocateResponse, error)
preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error)
setStopTime(t time.Time)
isStopped() bool
stopGracePeriodExpired() bool
}
type endpointImpl struct {
mutex sync.Mutex
resourceName string
api pluginapi.DevicePluginClient
stopTime time.Time
client plugin.Client // for testing only
}
// newEndpointImpl creates a new endpoint for the given resourceName.
// This is to be used during normal device plugin registration.
func newEndpointImpl(p plugin.DevicePlugin) *endpointImpl {
return &endpointImpl{
api: p.API(),
resourceName: p.Resource(),
}
}
// newStoppedEndpointImpl creates a new endpoint for the given resourceName with stopTime set.
// This is to be used during Kubelet restart, before the actual device plugin re-registers.
func newStoppedEndpointImpl(resourceName string) *endpointImpl {
return &endpointImpl{
resourceName: resourceName,
stopTime: time.Now(),
}
}
func (e *endpointImpl) isStopped() bool {
e.mutex.Lock()
defer e.mutex.Unlock()
return !e.stopTime.IsZero()
}
func (e *endpointImpl) stopGracePeriodExpired() bool {
e.mutex.Lock()
defer e.mutex.Unlock()
return !e.stopTime.IsZero() && time.Since(e.stopTime) > endpointStopGracePeriod
}
func (e *endpointImpl) setStopTime(t time.Time) {
e.mutex.Lock()
defer e.mutex.Unlock()
e.stopTime = t
}
// getPreferredAllocation issues GetPreferredAllocation gRPC call to the device plugin.
func (e *endpointImpl) getPreferredAllocation(available, mustInclude []string, size int) (*pluginapi.PreferredAllocationResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
return e.api.GetPreferredAllocation(context.Background(), &pluginapi.PreferredAllocationRequest{
ContainerRequests: []*pluginapi.ContainerPreferredAllocationRequest{
{
AvailableDeviceIDs: available,
MustIncludeDeviceIDs: mustInclude,
AllocationSize: int32(size),
},
},
})
}
// allocate issues Allocate gRPC call to the device plugin.
func (e *endpointImpl) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
return e.api.Allocate(context.Background(), &pluginapi.AllocateRequest{
ContainerRequests: []*pluginapi.ContainerAllocateRequest{
{DevicesIDs: devs},
},
})
}
// preStartContainer issues PreStartContainer gRPC call to the device plugin.
func (e *endpointImpl) preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
ctx, cancel := context.WithTimeout(context.Background(), pluginapi.KubeletPreStartContainerRPCTimeoutInSecs*time.Second)
defer cancel()
return e.api.PreStartContainer(ctx, &pluginapi.PreStartContainerRequest{
DevicesIDs: devs,
})
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,49 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
// RegistrationHandler is an interface for handling device plugin registration
// and plugin directory cleanup.
type RegistrationHandler interface {
CleanupPluginDirectory(string) error
}
// ClientHandler is an interface for handling device plugin connections.
type ClientHandler interface {
PluginConnected(string, DevicePlugin) error
PluginDisconnected(string)
PluginListAndWatchReceiver(string, *api.ListAndWatchResponse)
}
// TODO: evaluate whether we need these error definitions.
const (
// errFailedToDialDevicePlugin is the error raised when the device plugin could not be
// reached on the registered socket
errFailedToDialDevicePlugin = "failed to dial device plugin:"
// errUnsupportedVersion is the error raised when the device plugin uses an API version not
// supported by the Kubelet registry
errUnsupportedVersion = "requested API version %q is not supported by kubelet. Supported version is %q"
// errInvalidResourceName is the error raised when a device plugin is registering
// itself with an invalid ResourceName
errInvalidResourceName = "the ResourceName %q is invalid"
// errBadSocket is the error raised when the registry socket path is not absolute
errBadSocket = "bad socketPath, must be an absolute path:"
)

View File

@ -0,0 +1,143 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"fmt"
"net"
"sync"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
// DevicePlugin interface provides methods for accessing Device Plugin resources, API and unix socket.
type DevicePlugin interface {
API() api.DevicePluginClient
Resource() string
SocketPath() string
}
// Client interface provides methods for establishing/closing gRPC connection and running the device plugin gRPC client.
type Client interface {
Connect() error
Run()
Disconnect() error
}
type client struct {
mutex sync.Mutex
resource string
socket string
grpc *grpc.ClientConn
handler ClientHandler
client api.DevicePluginClient
}
// NewPluginClient returns an initialized device plugin client.
func NewPluginClient(r string, socketPath string, h ClientHandler) Client {
return &client{
resource: r,
socket: socketPath,
handler: h,
}
}
// Connect is for establishing a gRPC connection between device manager and device plugin.
func (c *client) Connect() error {
client, conn, err := dial(c.socket)
if err != nil {
klog.ErrorS(err, "Unable to connect to device plugin client with socket path", "path", c.socket)
return err
}
c.mutex.Lock()
c.grpc = conn
c.client = client
c.mutex.Unlock()
return c.handler.PluginConnected(c.resource, c)
}
// Run is for running the device plugin gRPC client.
func (c *client) Run() {
stream, err := c.client.ListAndWatch(context.Background(), &api.Empty{})
if err != nil {
klog.ErrorS(err, "ListAndWatch ended unexpectedly for device plugin", "resource", c.resource)
return
}
for {
response, err := stream.Recv()
if err != nil {
klog.ErrorS(err, "ListAndWatch ended unexpectedly for device plugin", "resource", c.resource)
return
}
klog.V(2).InfoS("State pushed for device plugin", "resource", c.resource, "resourceCapacity", len(response.Devices))
c.handler.PluginListAndWatchReceiver(c.resource, response)
}
}
// Disconnect is for closing gRPC connection between device manager and device plugin.
func (c *client) Disconnect() error {
c.mutex.Lock()
if c.grpc != nil {
if err := c.grpc.Close(); err != nil {
klog.V(2).ErrorS(err, "Failed to close grcp connection", "resource", c.Resource())
}
c.grpc = nil
}
c.mutex.Unlock()
c.handler.PluginDisconnected(c.resource)
return nil
}
func (c *client) Resource() string {
return c.resource
}
func (c *client) API() api.DevicePluginClient {
return c.client
}
func (c *client) SocketPath() string {
return c.socket
}
// dial establishes the gRPC communication with the registered device plugin. https://godoc.org/google.golang.org/grpc#Dial
func dial(unixSocketPath string) (api.DevicePluginClient, *grpc.ClientConn, error) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
c, err := grpc.DialContext(ctx, unixSocketPath,
grpc.WithAuthority("localhost"),
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, "unix", addr)
}),
)
if err != nil {
return nil, nil, fmt.Errorf(errFailedToDialDevicePlugin+" %v", err)
}
return api.NewDevicePluginClient(c), c, nil
}

View File

@ -0,0 +1,123 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"fmt"
"os"
"time"
core "k8s.io/api/core/v1"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
func (s *server) GetPluginHandler() cache.PluginHandler {
if f, err := os.Create(s.socketDir + "DEPRECATION"); err != nil {
klog.ErrorS(err, "Failed to create deprecation file at socket dir", "path", s.socketDir)
} else {
f.Close()
klog.V(4).InfoS("Created deprecation file", "path", f.Name())
}
return s
}
func (s *server) RegisterPlugin(pluginName string, endpoint string, versions []string, pluginClientTimeout *time.Duration) error {
klog.V(2).InfoS("Registering plugin at endpoint", "plugin", pluginName, "endpoint", endpoint)
return s.connectClient(pluginName, endpoint)
}
func (s *server) DeRegisterPlugin(pluginName string) {
klog.V(2).InfoS("Deregistering plugin", "plugin", pluginName)
client := s.getClient(pluginName)
if client != nil {
s.disconnectClient(pluginName, client)
}
}
func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []string) error {
klog.V(2).InfoS("Got plugin at endpoint with versions", "plugin", pluginName, "endpoint", endpoint, "versions", versions)
if !s.isVersionCompatibleWithPlugin(versions...) {
return fmt.Errorf("manager version, %s, is not among plugin supported versions %v", api.Version, versions)
}
if !v1helper.IsExtendedResourceName(core.ResourceName(pluginName)) {
return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
}
return nil
}
func (s *server) connectClient(name string, socketPath string) error {
c := NewPluginClient(name, socketPath, s.chandler)
s.registerClient(name, c)
if err := c.Connect(); err != nil {
s.deregisterClient(name)
klog.ErrorS(err, "Failed to connect to new client", "resource", name)
return err
}
go func() {
s.runClient(name, c)
}()
return nil
}
func (s *server) disconnectClient(name string, c Client) error {
s.deregisterClient(name)
return c.Disconnect()
}
func (s *server) registerClient(name string, c Client) {
s.mutex.Lock()
defer s.mutex.Unlock()
s.clients[name] = c
klog.V(2).InfoS("Registered client", "name", name)
}
func (s *server) deregisterClient(name string) {
s.mutex.Lock()
defer s.mutex.Unlock()
delete(s.clients, name)
klog.V(2).InfoS("Deregistered client", "name", name)
}
func (s *server) runClient(name string, c Client) {
c.Run()
c = s.getClient(name)
if c == nil {
return
}
if err := s.disconnectClient(name, c); err != nil {
klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err)
}
}
func (s *server) getClient(name string) Client {
s.mutex.Lock()
defer s.mutex.Unlock()
return s.clients[name]
}

View File

@ -0,0 +1,224 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"fmt"
"net"
"net/http"
"os"
"path/filepath"
"sync"
"github.com/opencontainers/selinux/go-selinux"
"google.golang.org/grpc"
core "k8s.io/api/core/v1"
"k8s.io/apiserver/pkg/server/healthz"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// Server interface provides methods for Device plugin registration server.
type Server interface {
cache.PluginHandler
healthz.HealthChecker
Start() error
Stop() error
SocketPath() string
}
type server struct {
socketName string
socketDir string
mutex sync.Mutex
wg sync.WaitGroup
grpc *grpc.Server
rhandler RegistrationHandler
chandler ClientHandler
clients map[string]Client
// isStarted indicates whether the service has started successfully.
isStarted bool
}
// NewServer returns an initialized device plugin registration server.
func NewServer(socketPath string, rh RegistrationHandler, ch ClientHandler) (Server, error) {
if socketPath == "" || !filepath.IsAbs(socketPath) {
return nil, fmt.Errorf(errBadSocket+" %s", socketPath)
}
dir, name := filepath.Split(socketPath)
klog.V(2).InfoS("Creating device plugin registration server", "version", api.Version, "socket", socketPath)
s := &server{
socketName: name,
socketDir: dir,
rhandler: rh,
chandler: ch,
clients: make(map[string]Client),
}
return s, nil
}
func (s *server) Start() error {
klog.V(2).InfoS("Starting device plugin registration server")
if err := os.MkdirAll(s.socketDir, 0750); err != nil {
klog.ErrorS(err, "Failed to create the device plugin socket directory", "directory", s.socketDir)
return err
}
if selinux.GetEnabled() {
if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil {
klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err)
}
}
// For now, we leave cleanup of the *entire* directory up to the Handler
// (even though we should in theory be able to just wipe the whole directory)
// because the Handler stores its checkpoint file (amongst others) in here.
if err := s.rhandler.CleanupPluginDirectory(s.socketDir); err != nil {
klog.ErrorS(err, "Failed to cleanup the device plugin directory", "directory", s.socketDir)
return err
}
ln, err := net.Listen("unix", s.SocketPath())
if err != nil {
klog.ErrorS(err, "Failed to listen to socket while starting device plugin registry")
return err
}
s.wg.Add(1)
s.grpc = grpc.NewServer([]grpc.ServerOption{}...)
api.RegisterRegistrationServer(s.grpc, s)
go func() {
defer s.wg.Done()
s.setHealthy()
if err = s.grpc.Serve(ln); err != nil {
s.setUnhealthy()
klog.ErrorS(err, "Error while serving device plugin registration grpc server")
}
}()
return nil
}
func (s *server) Stop() error {
s.visitClients(func(r string, c Client) {
if err := s.disconnectClient(r, c); err != nil {
klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err)
}
})
s.mutex.Lock()
defer s.mutex.Unlock()
if s.grpc == nil {
return nil
}
s.grpc.Stop()
s.wg.Wait()
s.grpc = nil
// During kubelet termination, we do not need the registration server,
// and we consider the kubelet to be healthy even when it is down.
s.setHealthy()
return nil
}
func (s *server) SocketPath() string {
return filepath.Join(s.socketDir, s.socketName)
}
func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Empty, error) {
klog.InfoS("Got registration request from device plugin with resource", "resourceName", r.ResourceName)
metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
if !s.isVersionCompatibleWithPlugin(r.Version) {
err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
return &api.Empty{}, err
}
if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
klog.InfoS("Bad registration request from device plugin", "err", err)
return &api.Empty{}, err
}
if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
klog.InfoS("Error connecting to device plugin client", "err", err)
return &api.Empty{}, err
}
return &api.Empty{}, nil
}
func (s *server) isVersionCompatibleWithPlugin(versions ...string) bool {
// TODO(vikasc): Currently this is fine as we only have a single supported version. When we do need to support
// multiple versions in the future, we may need to extend this function to return a supported version.
// E.g., say kubelet supports v1beta1 and v1beta2, and we get v1alpha1 and v1beta1 from a device plugin,
// this function should return v1beta1
for _, version := range versions {
for _, supportedVersion := range api.SupportedVersions {
if version == supportedVersion {
return true
}
}
}
return false
}
func (s *server) visitClients(visit func(r string, c Client)) {
s.mutex.Lock()
for r, c := range s.clients {
s.mutex.Unlock()
visit(r, c)
s.mutex.Lock()
}
s.mutex.Unlock()
}
func (s *server) Name() string {
return "device-plugin"
}
func (s *server) Check(_ *http.Request) error {
if s.isStarted {
return nil
}
return fmt.Errorf("device plugin registration gRPC server failed and no device plugins can register")
}
// setHealthy sets the health status of the gRPC server.
func (s *server) setHealthy() {
s.isStarted = true
}
// setUnhealthy sets the health status of the gRPC server to unhealthy.
func (s *server) setUnhealthy() {
s.isStarted = false
}

View File

@ -0,0 +1,388 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"net"
"os"
"path/filepath"
"sync"
"time"
"github.com/fsnotify/fsnotify"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
watcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
)
// Stub implementation for DevicePlugin.
type Stub struct {
devs []*pluginapi.Device
socket string
resourceName string
preStartContainerFlag bool
getPreferredAllocationFlag bool
stop chan interface{}
wg sync.WaitGroup
update chan []*pluginapi.Device
server *grpc.Server
// allocFunc is used for handling allocation request
allocFunc stubAllocFunc
// getPreferredAllocFunc is used for handling getPreferredAllocation request
getPreferredAllocFunc stubGetPreferredAllocFunc
// registerControlFunc is used for controlling auto-registration of requests
registerControlFunc stubRegisterControlFunc
registrationStatus chan watcherapi.RegistrationStatus // for testing
endpoint string // for testing
kubeletRestartWatcher *fsnotify.Watcher
}
// stubGetPreferredAllocFunc is the function called when a getPreferredAllocation request is received from Kubelet
type stubGetPreferredAllocFunc func(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error)
func defaultGetPreferredAllocFunc(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error) {
var response pluginapi.PreferredAllocationResponse
return &response, nil
}
// stubAllocFunc is the function called when an allocation request is received from Kubelet
type stubAllocFunc func(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error)
func defaultAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) {
var response pluginapi.AllocateResponse
return &response, nil
}
// stubRegisterControlFunc is the function called when a registration request is received from Kubelet
type stubRegisterControlFunc func() bool
func defaultRegisterControlFunc() bool {
return true
}
// NewDevicePluginStub returns an initialized DevicePlugin Stub.
func NewDevicePluginStub(devs []*pluginapi.Device, socket string, name string, preStartContainerFlag bool, getPreferredAllocationFlag bool) *Stub {
watcher, err := fsnotify.NewWatcher()
if err != nil {
klog.ErrorS(err, "Watcher creation failed")
panic(err)
}
return &Stub{
devs: devs,
socket: socket,
resourceName: name,
preStartContainerFlag: preStartContainerFlag,
getPreferredAllocationFlag: getPreferredAllocationFlag,
registerControlFunc: defaultRegisterControlFunc,
stop: make(chan interface{}),
update: make(chan []*pluginapi.Device),
allocFunc: defaultAllocFunc,
getPreferredAllocFunc: defaultGetPreferredAllocFunc,
kubeletRestartWatcher: watcher,
}
}
// SetGetPreferredAllocFunc sets allocFunc of the device plugin
func (m *Stub) SetGetPreferredAllocFunc(f stubGetPreferredAllocFunc) {
m.getPreferredAllocFunc = f
}
// SetAllocFunc sets allocFunc of the device plugin
func (m *Stub) SetAllocFunc(f stubAllocFunc) {
m.allocFunc = f
}
// SetRegisterControlFunc sets RegisterControlFunc of the device plugin
func (m *Stub) SetRegisterControlFunc(f stubRegisterControlFunc) {
m.registerControlFunc = f
}
// Start starts the gRPC server of the device plugin. Can only
// be called once.
func (m *Stub) Start() error {
klog.InfoS("Starting device plugin server")
err := m.cleanup()
if err != nil {
return err
}
sock, err := net.Listen("unix", m.socket)
if err != nil {
return err
}
m.wg.Add(1)
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterDevicePluginServer(m.server, m)
watcherapi.RegisterRegistrationServer(m.server, m)
err = m.kubeletRestartWatcher.Add(filepath.Dir(m.socket))
if err != nil {
klog.ErrorS(err, "Failed to add watch", "devicePluginPath", pluginapi.DevicePluginPath)
return err
}
go func() {
defer m.wg.Done()
if err = m.server.Serve(sock); err != nil {
klog.ErrorS(err, "Error while serving device plugin registration grpc server")
}
}()
var lastDialErr error
wait.PollImmediate(1*time.Second, 10*time.Second, func() (bool, error) {
var conn *grpc.ClientConn
_, conn, lastDialErr = dial(m.socket)
if lastDialErr != nil {
return false, nil
}
conn.Close()
return true, nil
})
if lastDialErr != nil {
return lastDialErr
}
klog.InfoS("Starting to serve on socket", "socket", m.socket)
return nil
}
func (m *Stub) Restart() error {
klog.InfoS("Restarting Device Plugin server")
if m.server == nil {
return nil
}
m.server.Stop()
m.server = nil
return m.Start()
}
// Stop stops the gRPC server. Can be called without a prior Start
// and more than once. Not safe to be called concurrently by different
// goroutines!
func (m *Stub) Stop() error {
klog.InfoS("Stopping device plugin server")
if m.server == nil {
return nil
}
m.kubeletRestartWatcher.Close()
m.server.Stop()
m.wg.Wait()
m.server = nil
close(m.stop) // This prevents re-starting the server.
return m.cleanup()
}
func (m *Stub) Watch(kubeletEndpoint, resourceName, pluginSockDir string) {
for {
select {
// Detect a kubelet restart by watching for a newly created
// 'pluginapi.KubeletSocket' file. When this occurs, restart
// the device plugin server
case event := <-m.kubeletRestartWatcher.Events:
if event.Name == kubeletEndpoint && event.Op&fsnotify.Create == fsnotify.Create {
klog.InfoS("inotify: file created, restarting", "kubeletEndpoint", kubeletEndpoint)
var lastErr error
err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, false, func(context.Context) (done bool, err error) {
restartErr := m.Restart()
if restartErr == nil {
return true, nil
}
klog.ErrorS(restartErr, "Retrying after error")
lastErr = restartErr
return false, nil
})
if err != nil {
klog.ErrorS(err, "Unable to restart server: wait timed out", "lastErr", lastErr.Error())
panic(err)
}
if ok := m.registerControlFunc(); ok {
if err := m.Register(kubeletEndpoint, resourceName, pluginSockDir); err != nil {
klog.ErrorS(err, "Unable to register to kubelet")
panic(err)
}
}
}
// Watch for any other fs errors and log them.
case err := <-m.kubeletRestartWatcher.Errors:
klog.ErrorS(err, "inotify error")
}
}
}
// GetInfo is the RPC which return pluginInfo
func (m *Stub) GetInfo(ctx context.Context, req *watcherapi.InfoRequest) (*watcherapi.PluginInfo, error) {
klog.InfoS("GetInfo")
return &watcherapi.PluginInfo{
Type: watcherapi.DevicePlugin,
Name: m.resourceName,
Endpoint: m.endpoint,
SupportedVersions: []string{pluginapi.Version}}, nil
}
// NotifyRegistrationStatus receives the registration notification from watcher
func (m *Stub) NotifyRegistrationStatus(ctx context.Context, status *watcherapi.RegistrationStatus) (*watcherapi.RegistrationStatusResponse, error) {
if m.registrationStatus != nil {
m.registrationStatus <- *status
}
if !status.PluginRegistered {
klog.InfoS("Registration failed", "err", status.Error)
}
return &watcherapi.RegistrationStatusResponse{}, nil
}
// Register registers the device plugin for the given resourceName with Kubelet.
func (m *Stub) Register(kubeletEndpoint, resourceName string, pluginSockDir string) error {
klog.InfoS("Register", "kubeletEndpoint", kubeletEndpoint, "resourceName", resourceName, "socket", pluginSockDir)
if pluginSockDir != "" {
if _, err := os.Stat(pluginSockDir + "DEPRECATION"); err == nil {
klog.InfoS("Deprecation file found. Skip registration")
return nil
}
}
klog.InfoS("Deprecation file not found. Invoke registration")
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
conn, err := grpc.DialContext(ctx, kubeletEndpoint,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, "unix", addr)
}))
if err != nil {
return err
}
defer conn.Close()
client := pluginapi.NewRegistrationClient(conn)
reqt := &pluginapi.RegisterRequest{
Version: pluginapi.Version,
Endpoint: filepath.Base(m.socket),
ResourceName: resourceName,
Options: &pluginapi.DevicePluginOptions{
PreStartRequired: m.preStartContainerFlag,
GetPreferredAllocationAvailable: m.getPreferredAllocationFlag,
},
}
_, err = client.Register(context.Background(), reqt)
if err != nil {
// Stop server
m.server.Stop()
klog.ErrorS(err, "Client unable to register to kubelet")
return err
}
klog.InfoS("Device Plugin registered with the Kubelet")
return err
}
// GetDevicePluginOptions returns DevicePluginOptions settings for the device plugin.
func (m *Stub) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
options := &pluginapi.DevicePluginOptions{
PreStartRequired: m.preStartContainerFlag,
GetPreferredAllocationAvailable: m.getPreferredAllocationFlag,
}
return options, nil
}
// PreStartContainer resets the devices received
func (m *Stub) PreStartContainer(ctx context.Context, r *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
klog.InfoS("PreStartContainer", "request", r)
return &pluginapi.PreStartContainerResponse{}, nil
}
// ListAndWatch lists devices and update that list according to the Update call
func (m *Stub) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
klog.InfoS("ListAndWatch")
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
for {
select {
case <-m.stop:
return nil
case updated := <-m.update:
s.Send(&pluginapi.ListAndWatchResponse{Devices: updated})
}
}
}
// Update allows the device plugin to send new devices through ListAndWatch
func (m *Stub) Update(devs []*pluginapi.Device) {
m.update <- devs
}
// GetPreferredAllocation gets the preferred allocation from a set of available devices
func (m *Stub) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
klog.InfoS("GetPreferredAllocation", "request", r)
devs := make(map[string]pluginapi.Device)
for _, dev := range m.devs {
devs[dev.ID] = *dev
}
return m.getPreferredAllocFunc(r, devs)
}
// Allocate does a mock allocation
func (m *Stub) Allocate(ctx context.Context, r *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
klog.InfoS("Allocate", "request", r)
devs := make(map[string]pluginapi.Device)
for _, dev := range m.devs {
devs[dev.ID] = *dev
}
return m.allocFunc(r, devs)
}
func (m *Stub) cleanup() error {
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}

View File

@ -0,0 +1,456 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"sync"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
type deviceAllocateInfo struct {
// deviceIds contains device Ids allocated to this container for the given resourceName.
deviceIds checkpoint.DevicesPerNUMA
// allocResp contains cached rpc AllocateResponse.
allocResp *pluginapi.ContainerAllocateResponse
}
type resourceAllocateInfo map[string]deviceAllocateInfo // Keyed by resourceName.
type containerDevices map[string]resourceAllocateInfo // Keyed by containerName.
type podDevices struct {
sync.RWMutex
devs map[string]containerDevices // Keyed by podUID.
}
// NewPodDevices is a function that returns object of podDevices type with its own guard
// RWMutex and a map where key is a pod UID and value contains
// container devices information of type containerDevices.
func newPodDevices() *podDevices {
return &podDevices{devs: make(map[string]containerDevices)}
}
func (pdev *podDevices) pods() sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
ret := sets.New[string]()
for k := range pdev.devs {
ret.Insert(k)
}
return ret
}
func (pdev *podDevices) size() int {
pdev.RLock()
defer pdev.RUnlock()
return len(pdev.devs)
}
func (pdev *podDevices) hasPod(podUID string) bool {
pdev.RLock()
defer pdev.RUnlock()
_, podExists := pdev.devs[podUID]
return podExists
}
func (pdev *podDevices) insert(podUID, contName, resource string, devices checkpoint.DevicesPerNUMA, resp *pluginapi.ContainerAllocateResponse) {
pdev.Lock()
defer pdev.Unlock()
if _, podExists := pdev.devs[podUID]; !podExists {
pdev.devs[podUID] = make(containerDevices)
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
pdev.devs[podUID][contName] = make(resourceAllocateInfo)
}
pdev.devs[podUID][contName][resource] = deviceAllocateInfo{
deviceIds: devices,
allocResp: resp,
}
}
func (pdev *podDevices) delete(pods []string) {
pdev.Lock()
defer pdev.Unlock()
for _, uid := range pods {
delete(pdev.devs, uid)
}
}
// Returns list of device Ids allocated to the given pod for the given resource.
// Returns nil if we don't have cached state for the given <podUID, resource>.
func (pdev *podDevices) podDevices(podUID, resource string) sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
ret := sets.New[string]()
for contName := range pdev.devs[podUID] {
ret = ret.Union(pdev.containerDevices(podUID, contName, resource))
}
return ret
}
// Returns list of device Ids allocated to the given container for the given resource.
// Returns nil if we don't have cached state for the given <podUID, contName, resource>.
func (pdev *podDevices) containerDevices(podUID, contName, resource string) sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
if _, podExists := pdev.devs[podUID]; !podExists {
return nil
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
return nil
}
devs, resourceExists := pdev.devs[podUID][contName][resource]
if !resourceExists {
return nil
}
return devs.deviceIds.Devices()
}
// Populates allocatedResources with the device resources allocated to the specified <podUID, contName>.
func (pdev *podDevices) addContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.Set[string]) {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return
}
resources, exists := containers[contName]
if !exists {
return
}
for resource, devices := range resources {
allocatedResources[resource] = allocatedResources[resource].Union(devices.deviceIds.Devices())
}
}
// Removes the device resources allocated to the specified <podUID, contName> from allocatedResources.
func (pdev *podDevices) removeContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.Set[string]) {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return
}
resources, exists := containers[contName]
if !exists {
return
}
for resource, devices := range resources {
allocatedResources[resource] = allocatedResources[resource].Difference(devices.deviceIds.Devices())
}
}
// Returns all devices allocated to the pods being tracked, keyed by resourceName.
func (pdev *podDevices) devices() map[string]sets.Set[string] {
ret := make(map[string]sets.Set[string])
pdev.RLock()
defer pdev.RUnlock()
for _, containerDevices := range pdev.devs {
for _, resources := range containerDevices {
for resource, devices := range resources {
if _, exists := ret[resource]; !exists {
ret[resource] = sets.New[string]()
}
if devices.allocResp != nil {
ret[resource] = ret[resource].Union(devices.deviceIds.Devices())
}
}
}
}
return ret
}
// Returns podUID and containerName for a device
func (pdev *podDevices) getPodAndContainerForDevice(deviceID string) (string, string) {
pdev.RLock()
defer pdev.RUnlock()
for podUID, containerDevices := range pdev.devs {
for containerName, resources := range containerDevices {
for _, devices := range resources {
if devices.deviceIds.Devices().Has(deviceID) {
return podUID, containerName
}
}
}
}
return "", ""
}
// Turns podDevices to checkpointData.
func (pdev *podDevices) toCheckpointData() []checkpoint.PodDevicesEntry {
var data []checkpoint.PodDevicesEntry
pdev.RLock()
defer pdev.RUnlock()
for podUID, containerDevices := range pdev.devs {
for conName, resources := range containerDevices {
for resource, devices := range resources {
if devices.allocResp == nil {
klog.ErrorS(nil, "Can't marshal allocResp, allocation response is missing", "podUID", podUID, "containerName", conName, "resourceName", resource)
continue
}
allocResp, err := devices.allocResp.Marshal()
if err != nil {
klog.ErrorS(err, "Can't marshal allocResp", "podUID", podUID, "containerName", conName, "resourceName", resource)
continue
}
data = append(data, checkpoint.PodDevicesEntry{
PodUID: podUID,
ContainerName: conName,
ResourceName: resource,
DeviceIDs: devices.deviceIds,
AllocResp: allocResp})
}
}
}
return data
}
// Populates podDevices from the passed in checkpointData.
func (pdev *podDevices) fromCheckpointData(data []checkpoint.PodDevicesEntry) {
for _, entry := range data {
klog.V(2).InfoS("Get checkpoint entry",
"podUID", entry.PodUID, "containerName", entry.ContainerName,
"resourceName", entry.ResourceName, "deviceIDs", entry.DeviceIDs, "allocated", entry.AllocResp)
allocResp := &pluginapi.ContainerAllocateResponse{}
err := allocResp.Unmarshal(entry.AllocResp)
if err != nil {
klog.ErrorS(err, "Can't unmarshal allocResp", "podUID", entry.PodUID, "containerName", entry.ContainerName, "resourceName", entry.ResourceName)
continue
}
pdev.insert(entry.PodUID, entry.ContainerName, entry.ResourceName, entry.DeviceIDs, allocResp)
}
}
// Returns combined container runtime settings to consume the container's allocated devices.
func (pdev *podDevices) deviceRunContainerOptions(podUID, contName string) *DeviceRunContainerOptions {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return nil
}
resources, exists := containers[contName]
if !exists {
return nil
}
opts := &DeviceRunContainerOptions{}
// Maps to detect duplicate settings.
devsMap := make(map[string]string)
mountsMap := make(map[string]string)
envsMap := make(map[string]string)
annotationsMap := make(map[string]string)
// Keep track of all CDI devices requested for the container.
allCDIDevices := sets.New[string]()
// Loops through AllocationResponses of all cached device resources.
for _, devices := range resources {
resp := devices.allocResp
// Each Allocate response has the following artifacts.
// Environment variables
// Mount points
// Device files
// Container annotations
// CDI device IDs
// These artifacts are per resource per container.
// Updates RunContainerOptions.Envs.
for k, v := range resp.Envs {
if e, ok := envsMap[k]; ok {
klog.V(4).InfoS("Skip existing env", "envKey", k, "envValue", v)
if e != v {
klog.ErrorS(nil, "Environment variable has conflicting setting", "envKey", k, "expected", v, "got", e)
}
continue
}
klog.V(4).InfoS("Add env", "envKey", k, "envValue", v)
envsMap[k] = v
opts.Envs = append(opts.Envs, kubecontainer.EnvVar{Name: k, Value: v})
}
// Updates RunContainerOptions.Devices.
for _, dev := range resp.Devices {
if d, ok := devsMap[dev.ContainerPath]; ok {
klog.V(4).InfoS("Skip existing device", "containerPath", dev.ContainerPath, "hostPath", dev.HostPath)
if d != dev.HostPath {
klog.ErrorS(nil, "Container device has conflicting mapping host devices",
"containerPath", dev.ContainerPath, "got", d, "expected", dev.HostPath)
}
continue
}
klog.V(4).InfoS("Add device", "containerPath", dev.ContainerPath, "hostPath", dev.HostPath)
devsMap[dev.ContainerPath] = dev.HostPath
opts.Devices = append(opts.Devices, kubecontainer.DeviceInfo{
PathOnHost: dev.HostPath,
PathInContainer: dev.ContainerPath,
Permissions: dev.Permissions,
})
}
// Updates RunContainerOptions.Mounts.
for _, mount := range resp.Mounts {
if m, ok := mountsMap[mount.ContainerPath]; ok {
klog.V(4).InfoS("Skip existing mount", "containerPath", mount.ContainerPath, "hostPath", mount.HostPath)
if m != mount.HostPath {
klog.ErrorS(nil, "Container mount has conflicting mapping host mounts",
"containerPath", mount.ContainerPath, "conflictingPath", m, "hostPath", mount.HostPath)
}
continue
}
klog.V(4).InfoS("Add mount", "containerPath", mount.ContainerPath, "hostPath", mount.HostPath)
mountsMap[mount.ContainerPath] = mount.HostPath
opts.Mounts = append(opts.Mounts, kubecontainer.Mount{
Name: mount.ContainerPath,
ContainerPath: mount.ContainerPath,
HostPath: mount.HostPath,
ReadOnly: mount.ReadOnly,
// TODO: This may need to be part of Device plugin API.
SELinuxRelabel: false,
})
}
// Updates for Annotations
for k, v := range resp.Annotations {
if e, ok := annotationsMap[k]; ok {
klog.V(4).InfoS("Skip existing annotation", "annotationKey", k, "annotationValue", v)
if e != v {
klog.ErrorS(nil, "Annotation has conflicting setting", "annotationKey", k, "expected", e, "got", v)
}
continue
}
klog.V(4).InfoS("Add annotation", "annotationKey", k, "annotationValue", v)
annotationsMap[k] = v
opts.Annotations = append(opts.Annotations, kubecontainer.Annotation{Name: k, Value: v})
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DevicePluginCDIDevices) {
// Updates for CDI devices.
cdiDevices := getCDIDeviceInfo(resp, allCDIDevices)
opts.CDIDevices = append(opts.CDIDevices, cdiDevices...)
}
}
return opts
}
// getCDIDeviceInfo returns CDI devices from an allocate response
func getCDIDeviceInfo(resp *pluginapi.ContainerAllocateResponse, knownCDIDevices sets.Set[string]) []kubecontainer.CDIDevice {
var cdiDevices []kubecontainer.CDIDevice
for _, cdiDevice := range resp.CDIDevices {
if knownCDIDevices.Has(cdiDevice.Name) {
klog.V(4).InfoS("Skip existing CDI Device", "name", cdiDevice.Name)
continue
}
klog.V(4).InfoS("Add CDI device", "name", cdiDevice.Name)
knownCDIDevices.Insert(cdiDevice.Name)
device := kubecontainer.CDIDevice{
Name: cdiDevice.Name,
}
cdiDevices = append(cdiDevices, device)
}
return cdiDevices
}
// getContainerDevices returns the devices assigned to the provided container for all ResourceNames
func (pdev *podDevices) getContainerDevices(podUID, contName string) ResourceDeviceInstances {
pdev.RLock()
defer pdev.RUnlock()
if _, podExists := pdev.devs[podUID]; !podExists {
return nil
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
return nil
}
resDev := NewResourceDeviceInstances()
for resource, allocateInfo := range pdev.devs[podUID][contName] {
if len(allocateInfo.deviceIds) == 0 {
continue
}
devicePluginMap := make(map[string]pluginapi.Device)
for numaid, devlist := range allocateInfo.deviceIds {
for _, devID := range devlist {
var topology *pluginapi.TopologyInfo
if numaid != nodeWithoutTopology {
NUMANodes := []*pluginapi.NUMANode{{ID: numaid}}
if pDev, ok := devicePluginMap[devID]; ok && pDev.Topology != nil {
if nodes := pDev.Topology.GetNodes(); nodes != nil {
NUMANodes = append(NUMANodes, nodes...)
}
}
// ID and Healthy are not relevant here.
topology = &pluginapi.TopologyInfo{Nodes: NUMANodes}
}
devicePluginMap[devID] = pluginapi.Device{
Topology: topology,
}
}
}
resDev[resource] = devicePluginMap
}
return resDev
}
// DeviceInstances is a mapping device name -> plugin device data
type DeviceInstances map[string]pluginapi.Device
// ResourceDeviceInstances is a mapping resource name -> DeviceInstances
type ResourceDeviceInstances map[string]DeviceInstances
// NewResourceDeviceInstances returns a new ResourceDeviceInstances
func NewResourceDeviceInstances() ResourceDeviceInstances {
return make(ResourceDeviceInstances)
}
// Clone returns a clone of ResourceDeviceInstances
func (rdev ResourceDeviceInstances) Clone() ResourceDeviceInstances {
clone := NewResourceDeviceInstances()
for resourceName, resourceDevs := range rdev {
clone[resourceName] = make(map[string]pluginapi.Device)
for devID, dev := range resourceDevs {
clone[resourceName][devID] = dev
}
}
return clone
}
// Filter takes a condition set expressed as map[string]sets.Set[string] and returns a new
// ResourceDeviceInstances with only the devices matching the condition set.
func (rdev ResourceDeviceInstances) Filter(cond map[string]sets.Set[string]) ResourceDeviceInstances {
filtered := NewResourceDeviceInstances()
for resourceName, filterIDs := range cond {
if _, exists := rdev[resourceName]; !exists {
continue
}
filtered[resourceName] = DeviceInstances{}
for instanceID, instance := range rdev[resourceName] {
if filterIDs.Has(instanceID) {
filtered[resourceName][instanceID] = instance
}
}
}
return filtered
}

View File

@ -0,0 +1,252 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/resource"
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)
// GetTopologyHints implements the TopologyManager HintProvider Interface which
// ensures the Device Manager is consulted when Topology Aware Hints for each
// container are created.
func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded device resources before providing TopologyHints
m.UpdateAllocatedDevices()
// Loop through all device resources and generate TopologyHints for them.
deviceHints := make(map[string][]topologymanager.TopologyHint)
accumulatedResourceRequests := m.getContainerDeviceRequest(container)
m.mutex.Lock()
defer m.mutex.Unlock()
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
deviceHints[resource] = nil
continue
}
// Short circuit to regenerate the same hints if there are already
// devices allocated to the Container. This might happen after a
// kubelet restart, for example.
allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
// Get the list of available devices, for which TopologyHints should be generated.
available := m.getAvailableDevices(resource)
reusable := m.devicesToReuse[string(pod.UID)][resource]
if available.Union(reusable).Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
// Generate TopologyHints for this resource given the current
// request size and the list of available devices.
deviceHints[resource] = m.generateDeviceTopologyHints(resource, available, reusable, requested)
}
return deviceHints
}
// GetPodTopologyHints implements the topologymanager.HintProvider Interface which
// ensures the Device Manager is consulted when Topology Aware Hints for Pod are created.
func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded device resources before providing TopologyHints
m.UpdateAllocatedDevices()
deviceHints := make(map[string][]topologymanager.TopologyHint)
accumulatedResourceRequests := m.getPodDeviceRequest(pod)
m.mutex.Lock()
defer m.mutex.Unlock()
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
deviceHints[resource] = nil
continue
}
// Short circuit to regenerate the same hints if there are already
// devices allocated to the Pod. This might happen after a
// kubelet restart, for example.
allocated := m.podDevices.podDevices(string(pod.UID), resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
// Get the list of available devices, for which TopologyHints should be generated.
available := m.getAvailableDevices(resource)
if available.Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
// Generate TopologyHints for this resource given the current
// request size and the list of available devices.
deviceHints[resource] = m.generateDeviceTopologyHints(resource, available, sets.Set[string]{}, requested)
}
return deviceHints
}
func (m *ManagerImpl) deviceHasTopologyAlignment(resource string) bool {
// If any device has Topology NUMANodes available, we assume they care about alignment.
for _, device := range m.allDevices[resource] {
if device.Topology != nil && len(device.Topology.Nodes) > 0 {
return true
}
}
return false
}
func (m *ManagerImpl) getAvailableDevices(resource string) sets.Set[string] {
// Strip all devices in use from the list of healthy ones.
return m.healthyDevices[resource].Difference(m.allocatedDevices[resource])
}
func (m *ManagerImpl) generateDeviceTopologyHints(resource string, available sets.Set[string], reusable sets.Set[string], request int) []topologymanager.TopologyHint {
// Initialize minAffinitySize to include all NUMA Nodes
minAffinitySize := len(m.numaNodes)
// Iterate through all combinations of NUMA Nodes and build hints from them.
hints := []topologymanager.TopologyHint{}
bitmask.IterateBitMasks(m.numaNodes, func(mask bitmask.BitMask) {
// First, update minAffinitySize for the current request size.
devicesInMask := 0
for _, device := range m.allDevices[resource] {
if mask.AnySet(m.getNUMANodeIds(device.Topology)) {
devicesInMask++
}
}
if devicesInMask >= request && mask.Count() < minAffinitySize {
minAffinitySize = mask.Count()
}
// Then check to see if all the reusable devices are part of the bitmask.
numMatching := 0
for d := range reusable {
// Skip the device if it doesn't specify any topology info.
if m.allDevices[resource][d].Topology == nil {
continue
}
// Otherwise disregard this mask if its NUMANode isn't part of it.
if !mask.AnySet(m.getNUMANodeIds(m.allDevices[resource][d].Topology)) {
return
}
numMatching++
}
// Finally, check to see if enough available devices remain on the
// current NUMA node combination to satisfy the device request.
for d := range available {
if mask.AnySet(m.getNUMANodeIds(m.allDevices[resource][d].Topology)) {
numMatching++
}
}
// If they don't, then move onto the next combination.
if numMatching < request {
return
}
// Otherwise, create a new hint from the NUMA mask and add it to the
// list of hints. We set all hint preferences to 'false' on the first
// pass through.
hints = append(hints, topologymanager.TopologyHint{
NUMANodeAffinity: mask,
Preferred: false,
})
})
// Loop back through all hints and update the 'Preferred' field based on
// counting the number of bits sets in the affinity mask and comparing it
// to the minAffinity. Only those with an equal number of bits set will be
// considered preferred.
for i := range hints {
if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
hints[i].Preferred = true
}
}
return hints
}
func (m *ManagerImpl) getNUMANodeIds(topology *pluginapi.TopologyInfo) []int {
if topology == nil {
return nil
}
var ids []int
for _, n := range topology.Nodes {
ids = append(ids, int(n.ID))
}
return ids
}
func (m *ManagerImpl) getPodDeviceRequest(pod *v1.Pod) map[string]int {
// for these device plugin resources, requests == limits
limits := resource.PodLimits(pod, resource.PodResourcesOptions{
ExcludeOverhead: true,
})
podRequests := make(map[string]int)
for resourceName, quantity := range limits {
if !m.isDevicePluginResource(string(resourceName)) {
continue
}
podRequests[string(resourceName)] = int(quantity.Value())
}
return podRequests
}
func (m *ManagerImpl) getContainerDeviceRequest(container *v1.Container) map[string]int {
containerRequests := make(map[string]int)
for resourceObj, requestedObj := range container.Resources.Limits {
resource := string(resourceObj)
requested := int(requestedObj.Value())
if !m.isDevicePluginResource(resource) {
continue
}
containerRequests[resource] = requested
}
return containerRequests
}

View File

@ -0,0 +1,123 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apiserver/pkg/server/healthz"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
// Manager manages all the Device Plugins running on a node.
type Manager interface {
// Start starts device plugin registration service.
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, initialContainers containermap.ContainerMap, initialContainerRunningSet sets.Set[string]) error
// Allocate configures and assigns devices to a container in a pod. From
// the requested device resources, Allocate will communicate with the
// owning device plugin to allow setup procedures to take place, and for
// the device plugin to provide runtime settings to use the device
// (environment variables, mount points and device files).
Allocate(pod *v1.Pod, container *v1.Container) error
// UpdatePluginResources updates node resources based on devices already
// allocated to pods. The node object is provided for the device manager to
// update the node capacity to reflect the currently available devices.
UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error
// Stop stops the manager.
Stop() error
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error)
// GetCapacity returns the amount of available device plugin resource capacity, resource allocatable
// and inactive device plugin resources previously registered on the node.
GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
// GetWatcherHandler returns the plugin handler for the device manager.
GetWatcherHandler() cache.PluginHandler
GetHealthChecker() healthz.HealthChecker
// GetDevices returns information about the devices assigned to pods and containers
GetDevices(podUID, containerName string) ResourceDeviceInstances
// UpdateAllocatedResourcesStatus updates the status of allocated resources for the pod.
UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus)
// GetAllocatableDevices returns information about all the devices known to the manager
GetAllocatableDevices() ResourceDeviceInstances
// ShouldResetExtendedResourceCapacity returns whether the extended resources should be reset or not,
// depending on the checkpoint file availability. Absence of the checkpoint file strongly indicates
// the node has been recreated.
ShouldResetExtendedResourceCapacity() bool
// TopologyManager HintProvider provider indicates the Device Manager implements the Topology Manager Interface
// and is consulted to make Topology aware resource alignments
GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint
// TopologyManager HintProvider provider indicates the Device Manager implements the Topology Manager Interface
// and is consulted to make Topology aware resource alignments per Pod
GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint
// UpdateAllocatedDevices frees any Devices that are bound to terminated pods.
UpdateAllocatedDevices()
// Updates returns a channel that receives an Update when the device changed its status.
Updates() <-chan resourceupdates.Update
}
// DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices.
type DeviceRunContainerOptions struct {
// The environment variables list.
Envs []kubecontainer.EnvVar
// The mounts for the container.
Mounts []kubecontainer.Mount
// The host devices mapped into the container.
Devices []kubecontainer.DeviceInfo
// The Annotations for the container
Annotations []kubecontainer.Annotation
// CDI Devices for the container
CDIDevices []kubecontainer.CDIDevice
}
// TODO: evaluate whether we need this error definition.
const (
errEndpointStopped = "endpoint %v has been stopped"
)
// endpointStopGracePeriod indicates the grace period after an endpoint is stopped
// because its device plugin fails. DeviceManager keeps the stopped endpoint in its
// cache during this grace period to cover the time gap for the capacity change to
// take effect.
const endpointStopGracePeriod = time.Duration(5) * time.Minute
// kubeletDeviceManagerCheckpoint is the file name of device plugin checkpoint
const kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint"