rebase: update K8s packages to v0.32.1

Update K8s packages in go.mod to v0.32.1

Signed-off-by: Praveen M <m.praveen@ibm.com>
Praveen M
2025-01-16 09:41:46 +05:30
committed by mergify[bot]
parent 5aef21ea4e
commit 7eb99fc6c9
2442 changed files with 273386 additions and 47788 deletions


@@ -0,0 +1,760 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"context"
"errors"
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/metrics"
)
var (
cleanAssumedPeriod = 1 * time.Second
)
// New returns a Cache implementation.
// It automatically starts a goroutine that manages expiration of assumed pods.
// "ttl" is how long an assumed pod may stay in the cache before it expires.
// "ctx" is the context whose cancellation stops the background goroutine.
func New(ctx context.Context, ttl time.Duration) Cache {
logger := klog.FromContext(ctx)
cache := newCache(ctx, ttl, cleanAssumedPeriod)
cache.run(logger)
return cache
}
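// The sketch below is illustrative and not part of the upstream file: it shows how a caller
// would typically wire up the cache. A cancellable context bounds the cleanup goroutine's
// lifetime and the TTL bounds how long an assumed pod may wait for its confirming Add event.
// The 30-second TTL is an assumed placeholder value, not one mandated by this package.
func exampleNewCache(ctx context.Context) Cache {
	// Cancelling ctx stops the background cleanup goroutine started by run().
	return New(ctx, 30*time.Second)
}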
// nodeInfoListItem holds a NodeInfo pointer and acts as an item in a doubly
// linked list. When a NodeInfo is updated, it goes to the head of the list.
// The items closer to the head are the most recently updated items.
type nodeInfoListItem struct {
info *framework.NodeInfo
next *nodeInfoListItem
prev *nodeInfoListItem
}
type cacheImpl struct {
stop <-chan struct{}
ttl time.Duration
period time.Duration
// This mutex guards all fields within this cache struct.
mu sync.RWMutex
// a set of assumed pod keys.
// The key could further be used to get an entry in podStates.
assumedPods sets.Set[string]
// a map from pod key to podState.
podStates map[string]*podState
nodes map[string]*nodeInfoListItem
// headNode points to the most recently updated NodeInfo in "nodes". It is the
// head of the linked list.
headNode *nodeInfoListItem
nodeTree *nodeTree
// A map from image name to its ImageStateSummary.
imageStates map[string]*framework.ImageStateSummary
}
type podState struct {
pod *v1.Pod
// Used by assumedPod to determine expiration.
// If deadline is nil, assumedPod will never expire.
deadline *time.Time
// Used to block cache from expiring assumedPod if binding still runs
bindingFinished bool
}
func newCache(ctx context.Context, ttl, period time.Duration) *cacheImpl {
logger := klog.FromContext(ctx)
return &cacheImpl{
ttl: ttl,
period: period,
stop: ctx.Done(),
nodes: make(map[string]*nodeInfoListItem),
nodeTree: newNodeTree(logger, nil),
assumedPods: sets.New[string](),
podStates: make(map[string]*podState),
imageStates: make(map[string]*framework.ImageStateSummary),
}
}
// newNodeInfoListItem initializes a new nodeInfoListItem.
func newNodeInfoListItem(ni *framework.NodeInfo) *nodeInfoListItem {
return &nodeInfoListItem{
info: ni,
}
}
// moveNodeInfoToHead moves a NodeInfo to the head of "cache.nodes" doubly
// linked list. The head is the most recently updated NodeInfo.
// We assume cache lock is already acquired.
func (cache *cacheImpl) moveNodeInfoToHead(logger klog.Logger, name string) {
ni, ok := cache.nodes[name]
if !ok {
logger.Error(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
return
}
// if the node info list item is already at the head, we are done.
if ni == cache.headNode {
return
}
if ni.prev != nil {
ni.prev.next = ni.next
}
if ni.next != nil {
ni.next.prev = ni.prev
}
if cache.headNode != nil {
cache.headNode.prev = ni
}
ni.next = cache.headNode
ni.prev = nil
cache.headNode = ni
}
// removeNodeInfoFromList removes a NodeInfo from the "cache.nodes" doubly
// linked list.
// We assume cache lock is already acquired.
func (cache *cacheImpl) removeNodeInfoFromList(logger klog.Logger, name string) {
ni, ok := cache.nodes[name]
if !ok {
logger.Error(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
return
}
if ni.prev != nil {
ni.prev.next = ni.next
}
if ni.next != nil {
ni.next.prev = ni.prev
}
// if the removed item was at the head, we must update the head.
if ni == cache.headNode {
cache.headNode = ni.next
}
delete(cache.nodes, name)
}
// Dump produces a dump of the current scheduler cache. This is used for
// debugging purposes only and shouldn't be confused with UpdateSnapshot
// function.
// This method is expensive, and should be only used in non-critical path.
func (cache *cacheImpl) Dump() *Dump {
cache.mu.RLock()
defer cache.mu.RUnlock()
nodes := make(map[string]*framework.NodeInfo, len(cache.nodes))
for k, v := range cache.nodes {
nodes[k] = v.info.Snapshot()
}
return &Dump{
Nodes: nodes,
AssumedPods: cache.assumedPods.Union(nil),
}
}
// UpdateSnapshot takes a snapshot of cached NodeInfo map. This is called at
// beginning of every scheduling cycle.
// The snapshot only includes Nodes that are not deleted at the time this function is called.
// nodeInfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
// This function tracks generation number of NodeInfo and updates only the
// entries of an existing snapshot that have changed after the snapshot was taken.
func (cache *cacheImpl) UpdateSnapshot(logger klog.Logger, nodeSnapshot *Snapshot) error {
cache.mu.Lock()
defer cache.mu.Unlock()
// Get the last generation of the snapshot.
snapshotGeneration := nodeSnapshot.generation
// NodeInfoList and HavePodsWithAffinityNodeInfoList must be re-created if a node was added
// or removed from the cache.
updateAllLists := false
// HavePodsWithAffinityNodeInfoList must be re-created if a node changed its
// status from having pods with affinity to NOT having pods with affinity or the other
// way around.
updateNodesHavePodsWithAffinity := false
// HavePodsWithRequiredAntiAffinityNodeInfoList must be re-created if a node changed its
// status from having pods with required anti-affinity to NOT having pods with required
// anti-affinity or the other way around.
updateNodesHavePodsWithRequiredAntiAffinity := false
// usedPVCSet must be re-created whenever the head node generation is greater than
// last snapshot generation.
updateUsedPVCSet := false
// Start from the head of the NodeInfo doubly linked list and update snapshot
// of NodeInfos updated after the last snapshot.
for node := cache.headNode; node != nil; node = node.next {
if node.info.Generation <= snapshotGeneration {
// all the nodes are updated before the existing snapshot. We are done.
break
}
if np := node.info.Node(); np != nil {
existing, ok := nodeSnapshot.nodeInfoMap[np.Name]
if !ok {
updateAllLists = true
existing = &framework.NodeInfo{}
nodeSnapshot.nodeInfoMap[np.Name] = existing
}
clone := node.info.Snapshot()
// We track nodes that have pods with affinity, here we check if this node changed its
// status from having pods with affinity to NOT having pods with affinity or the other
// way around.
if (len(existing.PodsWithAffinity) > 0) != (len(clone.PodsWithAffinity) > 0) {
updateNodesHavePodsWithAffinity = true
}
if (len(existing.PodsWithRequiredAntiAffinity) > 0) != (len(clone.PodsWithRequiredAntiAffinity) > 0) {
updateNodesHavePodsWithRequiredAntiAffinity = true
}
if !updateUsedPVCSet {
if len(existing.PVCRefCounts) != len(clone.PVCRefCounts) {
updateUsedPVCSet = true
} else {
for pvcKey := range clone.PVCRefCounts {
if _, found := existing.PVCRefCounts[pvcKey]; !found {
updateUsedPVCSet = true
break
}
}
}
}
// We need to preserve the original pointer of the NodeInfo struct since it
// is used in the NodeInfoList, which we may not update.
*existing = *clone
}
}
// Update the snapshot generation with the latest NodeInfo generation.
if cache.headNode != nil {
nodeSnapshot.generation = cache.headNode.info.Generation
}
// Compare the number of nodes in the snapshot with the number of nodes in the node tree.
// Deleted nodes get removed from the tree, but they might remain in the nodes map
// if they still have non-deleted Pods.
if len(nodeSnapshot.nodeInfoMap) > cache.nodeTree.numNodes {
cache.removeDeletedNodesFromSnapshot(nodeSnapshot)
updateAllLists = true
}
if updateAllLists || updateNodesHavePodsWithAffinity || updateNodesHavePodsWithRequiredAntiAffinity || updateUsedPVCSet {
cache.updateNodeInfoSnapshotList(logger, nodeSnapshot, updateAllLists)
}
if len(nodeSnapshot.nodeInfoList) != cache.nodeTree.numNodes {
errMsg := fmt.Sprintf("snapshot state is not consistent, length of NodeInfoList=%v not equal to length of nodes in tree=%v "+
", length of NodeInfoMap=%v, length of nodes in cache=%v"+
", trying to recover",
len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes,
len(nodeSnapshot.nodeInfoMap), len(cache.nodes))
logger.Error(nil, errMsg)
// We will try to recover by re-creating the lists for the next scheduling cycle, but still return an
// error to surface the problem, the error will likely cause a failure to the current scheduling cycle.
cache.updateNodeInfoSnapshotList(logger, nodeSnapshot, true)
return errors.New(errMsg)
}
return nil
}
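// exampleSchedulingCycleSnapshot is an illustrative sketch, not part of the upstream file: the
// scheduler keeps a single Snapshot and refreshes it at the start of every cycle, so only
// NodeInfos whose generation advanced since the previous call are copied again.
func exampleSchedulingCycleSnapshot(logger klog.Logger, c Cache) (*Snapshot, error) {
	snapshot := NewEmptySnapshot()
	// The first call copies everything; later calls are incremental thanks to generation tracking.
	if err := c.UpdateSnapshot(logger, snapshot); err != nil {
		return nil, err
	}
	return snapshot, nil
}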
func (cache *cacheImpl) updateNodeInfoSnapshotList(logger klog.Logger, snapshot *Snapshot, updateAll bool) {
snapshot.havePodsWithAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
snapshot.usedPVCSet = sets.New[string]()
if updateAll {
// Take a snapshot of the nodes order in the tree
snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
nodesList, err := cache.nodeTree.list()
if err != nil {
logger.Error(err, "Error occurred while retrieving the list of names of the nodes from node tree")
}
for _, nodeName := range nodesList {
if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil {
snapshot.nodeInfoList = append(snapshot.nodeInfoList, nodeInfo)
if len(nodeInfo.PodsWithAffinity) > 0 {
snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
}
if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
}
for key := range nodeInfo.PVCRefCounts {
snapshot.usedPVCSet.Insert(key)
}
} else {
logger.Error(nil, "Node exists in nodeTree but not in NodeInfoMap, this should not happen", "node", klog.KRef("", nodeName))
}
}
} else {
for _, nodeInfo := range snapshot.nodeInfoList {
if len(nodeInfo.PodsWithAffinity) > 0 {
snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
}
if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
}
for key := range nodeInfo.PVCRefCounts {
snapshot.usedPVCSet.Insert(key)
}
}
}
}
// If certain nodes were deleted after the last snapshot was taken, we should remove them from the snapshot.
func (cache *cacheImpl) removeDeletedNodesFromSnapshot(snapshot *Snapshot) {
toDelete := len(snapshot.nodeInfoMap) - cache.nodeTree.numNodes
for name := range snapshot.nodeInfoMap {
if toDelete <= 0 {
break
}
if n, ok := cache.nodes[name]; !ok || n.info.Node() == nil {
delete(snapshot.nodeInfoMap, name)
toDelete--
}
}
}
// NodeCount returns the number of nodes in the cache.
// DO NOT use outside of tests.
func (cache *cacheImpl) NodeCount() int {
cache.mu.RLock()
defer cache.mu.RUnlock()
return len(cache.nodes)
}
// PodCount returns the number of pods in the cache (including those from deleted nodes).
// DO NOT use outside of tests.
func (cache *cacheImpl) PodCount() (int, error) {
cache.mu.RLock()
defer cache.mu.RUnlock()
// Count the pods recorded on every cached node, including ghost nodes that are
// kept in the cache only because some of their pods have not been deleted yet.
count := 0
for _, n := range cache.nodes {
count += len(n.info.Pods)
}
return count, nil
}
func (cache *cacheImpl) AssumePod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
if _, ok := cache.podStates[key]; ok {
return fmt.Errorf("pod %v(%v) is in the cache, so can't be assumed", key, klog.KObj(pod))
}
return cache.addPod(logger, pod, true)
}
func (cache *cacheImpl) FinishBinding(logger klog.Logger, pod *v1.Pod) error {
return cache.finishBinding(logger, pod, time.Now())
}
// finishBinding exists to make tests deterministic by injecting now as an argument
func (cache *cacheImpl) finishBinding(logger klog.Logger, pod *v1.Pod, now time.Time) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.RLock()
defer cache.mu.RUnlock()
logger.V(5).Info("Finished binding for pod, can be expired", "podKey", key, "pod", klog.KObj(pod))
currState, ok := cache.podStates[key]
if ok && cache.assumedPods.Has(key) {
if cache.ttl == time.Duration(0) {
currState.deadline = nil
} else {
dl := now.Add(cache.ttl)
currState.deadline = &dl
}
currState.bindingFinished = true
}
return nil
}
func (cache *cacheImpl) ForgetPod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
currState, ok := cache.podStates[key]
if ok && currState.pod.Spec.NodeName != pod.Spec.NodeName {
return fmt.Errorf("pod %v(%v) was assumed on %v but assigned to %v", key, klog.KObj(pod), pod.Spec.NodeName, currState.pod.Spec.NodeName)
}
// Only assumed pod can be forgotten.
if ok && cache.assumedPods.Has(key) {
return cache.removePod(logger, pod)
}
return fmt.Errorf("pod %v(%v) wasn't assumed so cannot be forgotten", key, klog.KObj(pod))
}
// Assumes that lock is already acquired.
func (cache *cacheImpl) addPod(logger klog.Logger, pod *v1.Pod, assumePod bool) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
n, ok := cache.nodes[pod.Spec.NodeName]
if !ok {
n = newNodeInfoListItem(framework.NewNodeInfo())
cache.nodes[pod.Spec.NodeName] = n
}
n.info.AddPod(pod)
cache.moveNodeInfoToHead(logger, pod.Spec.NodeName)
ps := &podState{
pod: pod,
}
cache.podStates[key] = ps
if assumePod {
cache.assumedPods.Insert(key)
}
return nil
}
// Assumes that lock is already acquired.
func (cache *cacheImpl) updatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error {
if err := cache.removePod(logger, oldPod); err != nil {
return err
}
return cache.addPod(logger, newPod, false)
}
// Assumes that lock is already acquired.
// Removes a pod from the cached node info. If the node information was already
// removed and there are no more pods left in the node, cleans up the node from
// the cache.
func (cache *cacheImpl) removePod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
n, ok := cache.nodes[pod.Spec.NodeName]
if !ok {
logger.Error(nil, "Node not found when trying to remove pod", "node", klog.KRef("", pod.Spec.NodeName), "podKey", key, "pod", klog.KObj(pod))
} else {
if err := n.info.RemovePod(logger, pod); err != nil {
return err
}
if len(n.info.Pods) == 0 && n.info.Node() == nil {
cache.removeNodeInfoFromList(logger, pod.Spec.NodeName)
} else {
cache.moveNodeInfoToHead(logger, pod.Spec.NodeName)
}
}
delete(cache.podStates, key)
delete(cache.assumedPods, key)
return nil
}
func (cache *cacheImpl) AddPod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
currState, ok := cache.podStates[key]
switch {
case ok && cache.assumedPods.Has(key):
// When assuming, we've already added the Pod to the cache;
// just update here to make sure the Pod's status is up-to-date.
if err = cache.updatePod(logger, currState.pod, pod); err != nil {
logger.Error(err, "Error occurred while updating pod")
}
if currState.pod.Spec.NodeName != pod.Spec.NodeName {
// The pod was added to a different node than it was assumed to.
logger.Info("Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
return nil
}
case !ok:
// Pod was expired. We should add it back.
if err = cache.addPod(logger, pod, false); err != nil {
logger.Error(err, "Error occurred while adding pod")
}
default:
return fmt.Errorf("pod %v(%v) was already in added state", key, klog.KObj(pod))
}
return nil
}
func (cache *cacheImpl) UpdatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error {
key, err := framework.GetPodKey(oldPod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
currState, ok := cache.podStates[key]
if !ok {
return fmt.Errorf("pod %v(%v) is not added to scheduler cache, so cannot be updated", key, klog.KObj(oldPod))
}
// An assumed pod won't have Update/Remove event. It needs to have Add event
// before Update event, in which case the state would change from Assumed to Added.
if cache.assumedPods.Has(key) {
return fmt.Errorf("assumed pod %v(%v) should not be updated", key, klog.KObj(oldPod))
}
if currState.pod.Spec.NodeName != newPod.Spec.NodeName {
logger.Error(nil, "Pod updated on a different node than previously added to", "podKey", key, "pod", klog.KObj(oldPod))
logger.Error(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}
return cache.updatePod(logger, oldPod, newPod)
}
func (cache *cacheImpl) RemovePod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
currState, ok := cache.podStates[key]
if !ok {
return fmt.Errorf("pod %v(%v) is not found in scheduler cache, so cannot be removed from it", key, klog.KObj(pod))
}
if currState.pod.Spec.NodeName != pod.Spec.NodeName {
logger.Error(nil, "Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
if pod.Spec.NodeName != "" {
// An empty NodeName is possible when the scheduler misses a Delete
// event and it gets the last known state from the informer cache.
logger.Error(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}
}
return cache.removePod(logger, currState.pod)
}
func (cache *cacheImpl) IsAssumedPod(pod *v1.Pod) (bool, error) {
key, err := framework.GetPodKey(pod)
if err != nil {
return false, err
}
cache.mu.RLock()
defer cache.mu.RUnlock()
return cache.assumedPods.Has(key), nil
}
// GetPod might return a pod whose node has already been deleted from
// the main cache. This is useful to properly process pod update events.
func (cache *cacheImpl) GetPod(pod *v1.Pod) (*v1.Pod, error) {
key, err := framework.GetPodKey(pod)
if err != nil {
return nil, err
}
cache.mu.RLock()
defer cache.mu.RUnlock()
podState, ok := cache.podStates[key]
if !ok {
return nil, fmt.Errorf("pod %v(%v) does not exist in scheduler cache", key, klog.KObj(pod))
}
return podState.pod, nil
}
func (cache *cacheImpl) AddNode(logger klog.Logger, node *v1.Node) *framework.NodeInfo {
cache.mu.Lock()
defer cache.mu.Unlock()
n, ok := cache.nodes[node.Name]
if !ok {
n = newNodeInfoListItem(framework.NewNodeInfo())
cache.nodes[node.Name] = n
} else {
cache.removeNodeImageStates(n.info.Node())
}
cache.moveNodeInfoToHead(logger, node.Name)
cache.nodeTree.addNode(logger, node)
cache.addNodeImageStates(node, n.info)
n.info.SetNode(node)
return n.info.Snapshot()
}
func (cache *cacheImpl) UpdateNode(logger klog.Logger, oldNode, newNode *v1.Node) *framework.NodeInfo {
cache.mu.Lock()
defer cache.mu.Unlock()
n, ok := cache.nodes[newNode.Name]
if !ok {
n = newNodeInfoListItem(framework.NewNodeInfo())
cache.nodes[newNode.Name] = n
cache.nodeTree.addNode(logger, newNode)
} else {
cache.removeNodeImageStates(n.info.Node())
}
cache.moveNodeInfoToHead(logger, newNode.Name)
cache.nodeTree.updateNode(logger, oldNode, newNode)
cache.addNodeImageStates(newNode, n.info)
n.info.SetNode(newNode)
return n.info.Snapshot()
}
// RemoveNode removes a node from the cache's tree.
// The node might still have pods because their deletion events didn't arrive
// yet. Those pods are considered removed from the cache, since the node tree
// is the source of truth.
// However, we keep a ghost node with the list of pods until all pod deletion
// events have arrived. A ghost node is skipped from snapshots.
func (cache *cacheImpl) RemoveNode(logger klog.Logger, node *v1.Node) error {
cache.mu.Lock()
defer cache.mu.Unlock()
n, ok := cache.nodes[node.Name]
if !ok {
return fmt.Errorf("node %v is not found", node.Name)
}
n.info.RemoveNode()
// We remove NodeInfo for this node only if there aren't any pods on this node.
// We can't do it unconditionally, because notifications about pods are delivered
// in a different watch, and thus can potentially be observed later, even though
// they happened before node removal.
if len(n.info.Pods) == 0 {
cache.removeNodeInfoFromList(logger, node.Name)
} else {
cache.moveNodeInfoToHead(logger, node.Name)
}
if err := cache.nodeTree.removeNode(logger, node); err != nil {
return err
}
cache.removeNodeImageStates(node)
return nil
}
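// exampleGhostNode is an illustrative sketch, not part of the upstream file. It demonstrates the
// "ghost node" behavior documented above: removing a node that still has a cached pod keeps its
// NodeInfo alive (with Node() == nil) until the pod's own Delete event arrives. All names and the
// TTL below are placeholder values.
func exampleGhostNode(ctx context.Context, logger klog.Logger) (int, error) {
	c := newCache(ctx, 30*time.Second, cleanAssumedPeriod)
	node := &v1.Node{}
	node.Name = "node-1"
	c.AddNode(logger, node)

	pod := &v1.Pod{}
	pod.Namespace, pod.Name, pod.UID = "default", "nginx", "uid-ghost"
	pod.Spec.NodeName = "node-1"
	if err := c.AddPod(logger, pod); err != nil {
		return 0, err
	}

	if err := c.RemoveNode(logger, node); err != nil {
		return 0, err
	}
	// The node is gone from the node tree (and from future snapshots), but its pod still keeps
	// the NodeInfo in the cache, so PodCount reports 1.
	return c.PodCount()
}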
// addNodeImageStates adds the states of the images on the given node to the given nodeInfo and
// updates the imageStates in the scheduler cache. This function assumes the lock to the scheduler cache has been acquired.
func (cache *cacheImpl) addNodeImageStates(node *v1.Node, nodeInfo *framework.NodeInfo) {
newSum := make(map[string]*framework.ImageStateSummary)
for _, image := range node.Status.Images {
for _, name := range image.Names {
// update the entry in imageStates
state, ok := cache.imageStates[name]
if !ok {
state = &framework.ImageStateSummary{
Size: image.SizeBytes,
Nodes: sets.New(node.Name),
}
cache.imageStates[name] = state
} else {
state.Nodes.Insert(node.Name)
}
// create the ImageStateSummary for this image
if _, ok := newSum[name]; !ok {
newSum[name] = state
}
}
}
nodeInfo.ImageStates = newSum
}
// removeNodeImageStates removes the given node from every image entry in the imageStates
// cache that references it. If an image is no longer available on any node after the
// removal, its entry is removed from imageStates.
func (cache *cacheImpl) removeNodeImageStates(node *v1.Node) {
if node == nil {
return
}
for _, image := range node.Status.Images {
for _, name := range image.Names {
state, ok := cache.imageStates[name]
if ok {
state.Nodes.Delete(node.Name)
if state.Nodes.Len() == 0 {
// Remove the unused image to make sure the length of
// imageStates represents the total number of different
// images on all nodes
delete(cache.imageStates, name)
}
}
}
}
}
func (cache *cacheImpl) run(logger klog.Logger) {
go wait.Until(func() {
cache.cleanupAssumedPods(logger, time.Now())
}, cache.period, cache.stop)
}
// cleanupAssumedPods exists to make tests deterministic by taking the current time as an argument.
// It also reports metrics on the cache size for nodes, pods, and assumed pods.
func (cache *cacheImpl) cleanupAssumedPods(logger klog.Logger, now time.Time) {
cache.mu.Lock()
defer cache.mu.Unlock()
defer cache.updateMetrics()
// The size of assumedPods should be small
for key := range cache.assumedPods {
ps, ok := cache.podStates[key]
if !ok {
logger.Error(nil, "Key found in assumed set but not in podStates, potentially a logical error")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}
if !ps.bindingFinished {
logger.V(5).Info("Could not expire cache for pod as binding is still in progress", "podKey", key, "pod", klog.KObj(ps.pod))
continue
}
if cache.ttl != 0 && now.After(*ps.deadline) {
logger.Info("Pod expired", "podKey", key, "pod", klog.KObj(ps.pod))
if err := cache.removePod(logger, ps.pod); err != nil {
logger.Error(err, "ExpirePod failed", "podKey", key, "pod", klog.KObj(ps.pod))
}
}
}
}
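// exampleAssumedPodExpiry is an illustrative sketch, not part of the upstream file: once a pod
// has been assumed and its binding marked finished, the cleanup loop above evicts it as soon as
// the injected "now" passes the deadline derived from the TTL. The TTL and pod fields are
// placeholder values.
func exampleAssumedPodExpiry(ctx context.Context, logger klog.Logger) (bool, error) {
	c := newCache(ctx, 30*time.Second, cleanAssumedPeriod)
	pod := &v1.Pod{}
	pod.Namespace, pod.Name, pod.UID = "default", "nginx", "uid-1"
	pod.Spec.NodeName = "node-1"
	if err := c.AssumePod(logger, pod); err != nil {
		return false, err
	}
	now := time.Now()
	if err := c.finishBinding(logger, pod, now); err != nil {
		return false, err
	}
	// Pretend the confirming Add event never arrived and the TTL has long elapsed.
	c.cleanupAssumedPods(logger, now.Add(time.Minute))
	return c.IsAssumedPod(pod) // (false, nil): the assumed pod has been expired and removed
}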
// updateMetrics updates cache size metric values for pods, assumed pods, and nodes
func (cache *cacheImpl) updateMetrics() {
metrics.CacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
metrics.CacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
metrics.CacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
}


@@ -0,0 +1,135 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import (
"sort"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// CacheComparer is an implementation of the Scheduler's cache comparer.
type CacheComparer struct {
NodeLister corelisters.NodeLister
PodLister corelisters.PodLister
Cache internalcache.Cache
PodQueue internalqueue.SchedulingQueue
}
// Compare compares the nodes and pods of NodeLister with Cache.Snapshot.
func (c *CacheComparer) Compare(logger klog.Logger) error {
logger.V(3).Info("Cache comparer started")
defer logger.V(3).Info("Cache comparer finished")
nodes, err := c.NodeLister.List(labels.Everything())
if err != nil {
return err
}
pods, err := c.PodLister.List(labels.Everything())
if err != nil {
return err
}
dump := c.Cache.Dump()
pendingPods, _ := c.PodQueue.PendingPods()
if missed, redundant := c.CompareNodes(nodes, dump.Nodes); len(missed)+len(redundant) != 0 {
logger.Info("Cache mismatch", "missedNodes", missed, "redundantNodes", redundant)
}
if missed, redundant := c.ComparePods(pods, pendingPods, dump.Nodes); len(missed)+len(redundant) != 0 {
logger.Info("Cache mismatch", "missedPods", missed, "redundantPods", redundant)
}
return nil
}
// CompareNodes compares actual nodes with cached nodes.
func (c *CacheComparer) CompareNodes(nodes []*v1.Node, nodeinfos map[string]*framework.NodeInfo) (missed, redundant []string) {
actual := []string{}
for _, node := range nodes {
actual = append(actual, node.Name)
}
cached := []string{}
for nodeName := range nodeinfos {
cached = append(cached, nodeName)
}
return compareStrings(actual, cached)
}
// ComparePods compares actual pods with cached pods.
func (c *CacheComparer) ComparePods(pods, waitingPods []*v1.Pod, nodeinfos map[string]*framework.NodeInfo) (missed, redundant []string) {
actual := []string{}
for _, pod := range pods {
actual = append(actual, string(pod.UID))
}
cached := []string{}
for _, nodeinfo := range nodeinfos {
for _, p := range nodeinfo.Pods {
cached = append(cached, string(p.Pod.UID))
}
}
for _, pod := range waitingPods {
cached = append(cached, string(pod.UID))
}
return compareStrings(actual, cached)
}
func compareStrings(actual, cached []string) (missed, redundant []string) {
missed, redundant = []string{}, []string{}
sort.Strings(actual)
sort.Strings(cached)
compare := func(i, j int) int {
if i == len(actual) {
return 1
} else if j == len(cached) {
return -1
}
return strings.Compare(actual[i], cached[j])
}
for i, j := 0, 0; i < len(actual) || j < len(cached); {
switch compare(i, j) {
case 0:
i++
j++
case -1:
missed = append(missed, actual[i])
i++
case 1:
redundant = append(redundant, cached[j])
j++
}
}
return
}
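// exampleCompareStrings is an illustrative sketch, not part of the upstream file: with the
// apiserver reporting pods a, b, c and the cache (plus queue) holding b, c, d, "a" is reported
// as missed (known to the apiserver but absent from the cache) and "d" as redundant (cached but
// no longer reported by the apiserver).
func exampleCompareStrings() (missed, redundant []string) {
	return compareStrings([]string{"a", "b", "c"}, []string{"b", "c", "d"})
}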


@@ -0,0 +1,76 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import (
"context"
"os"
"os/signal"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue"
)
// CacheDebugger provides ways to check and write cache information for debugging.
type CacheDebugger struct {
Comparer CacheComparer
Dumper CacheDumper
}
// New creates a CacheDebugger.
func New(
nodeLister corelisters.NodeLister,
podLister corelisters.PodLister,
cache internalcache.Cache,
podQueue internalqueue.SchedulingQueue,
) *CacheDebugger {
return &CacheDebugger{
Comparer: CacheComparer{
NodeLister: nodeLister,
PodLister: podLister,
Cache: cache,
PodQueue: podQueue,
},
Dumper: CacheDumper{
cache: cache,
podQueue: podQueue,
},
}
}
// ListenForSignal starts a goroutine that will trigger the CacheDebugger's
// behavior when the process receives SIGINT (Windows) or SIGUSR2 (non-Windows).
func (d *CacheDebugger) ListenForSignal(ctx context.Context) {
logger := klog.FromContext(ctx)
stopCh := ctx.Done()
ch := make(chan os.Signal, 1)
signal.Notify(ch, compareSignal)
go func() {
for {
select {
case <-stopCh:
return
case <-ch:
d.Comparer.Compare(logger)
d.Dumper.DumpAll(logger)
}
}
}()
}
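// exampleEnableCacheDebugger is an illustrative sketch, not part of the upstream file, of how a
// caller wires the debugger together: build it from the informer listers, the scheduler cache,
// and the scheduling queue, then start the signal listener so sending the compare signal (e.g.
// `kill -USR2 <pid>` on Linux) triggers a cache comparison and dump. All arguments are supplied
// by the caller; none of the parameter names here are defined by this package.
func exampleEnableCacheDebugger(
	ctx context.Context,
	nodeLister corelisters.NodeLister,
	podLister corelisters.PodLister,
	schedulerCache internalcache.Cache,
	podQueue internalqueue.SchedulingQueue,
) {
	debugger := New(nodeLister, podLister, schedulerCache, podQueue)
	debugger.ListenForSignal(ctx)
}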


@@ -0,0 +1,88 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import (
"fmt"
"strings"
"k8s.io/klog/v2"
v1 "k8s.io/api/core/v1"
internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
"k8s.io/kubernetes/pkg/scheduler/backend/queue"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// CacheDumper writes some information from the scheduler cache and the scheduling queue to the
// scheduler logs for debugging purposes.
type CacheDumper struct {
cache internalcache.Cache
podQueue queue.SchedulingQueue
}
// DumpAll writes cached nodes and scheduling queue information to the scheduler logs.
func (d *CacheDumper) DumpAll(logger klog.Logger) {
d.dumpNodes(logger)
d.dumpSchedulingQueue(logger)
}
// dumpNodes writes NodeInfo to the scheduler logs.
func (d *CacheDumper) dumpNodes(logger klog.Logger) {
dump := d.cache.Dump()
nodeInfos := make([]string, 0, len(dump.Nodes))
for name, nodeInfo := range dump.Nodes {
nodeInfos = append(nodeInfos, d.printNodeInfo(name, nodeInfo))
}
// Extra blank line added between node entries for readability.
logger.Info("Dump of cached NodeInfo", "nodes", strings.Join(nodeInfos, "\n\n"))
}
// dumpSchedulingQueue writes pods in the scheduling queue to the scheduler logs.
func (d *CacheDumper) dumpSchedulingQueue(logger klog.Logger) {
pendingPods, s := d.podQueue.PendingPods()
var podData strings.Builder
for _, p := range pendingPods {
podData.WriteString(printPod(p))
}
logger.Info("Dump of scheduling queue", "summary", s, "pods", podData.String())
}
// printNodeInfo writes parts of NodeInfo to a string.
func (d *CacheDumper) printNodeInfo(name string, n *framework.NodeInfo) string {
var nodeData strings.Builder
nodeData.WriteString(fmt.Sprintf("Node name: %s\nDeleted: %t\nRequested Resources: %+v\nAllocatable Resources:%+v\nScheduled Pods(number: %v):\n",
name, n.Node() == nil, n.Requested, n.Allocatable, len(n.Pods)))
// Dumping Pod Info
for _, p := range n.Pods {
nodeData.WriteString(printPod(p.Pod))
}
// Dumping nominated pods info on the node
nominatedPodInfos := d.podQueue.NominatedPodsForNode(name)
if len(nominatedPodInfos) != 0 {
nodeData.WriteString(fmt.Sprintf("Nominated Pods(number: %v):\n", len(nominatedPodInfos)))
for _, pi := range nominatedPodInfos {
nodeData.WriteString(printPod(pi.Pod))
}
}
return nodeData.String()
}
// printPod writes parts of a Pod object to a string.
func printPod(p *v1.Pod) string {
return fmt.Sprintf("name: %v, namespace: %v, uid: %v, phase: %v, nominated node: %v\n", p.Name, p.Namespace, p.UID, p.Status.Phase, p.Status.NominatedNodeName)
}


@@ -0,0 +1,26 @@
//go:build !windows
// +build !windows
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import "syscall"
// compareSignal is the signal to trigger cache compare. For non-Windows
// environments it's SIGUSR2.
var compareSignal = syscall.SIGUSR2


@@ -0,0 +1,23 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import "os"
// compareSignal is the signal to trigger cache compare. For Windows,
// it's SIGINT (os.Interrupt).
var compareSignal = os.Interrupt


@@ -0,0 +1,123 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// Cache collects pods' information and provides node-level aggregated information.
// It's intended for generic scheduler to do efficient lookup.
// Cache's operations are pod centric. It does incremental updates based on pod events.
// Pod events are sent via network. We don't have guaranteed delivery of all events:
// We use Reflector to list and watch from remote.
// Reflector might be slow and do a relist, which would lead to missing events.
//
// State Machine of a pod's events in scheduler's cache:
//
//   +-------------------------------------------+  +----+
//   |                            Add            |  |    |
//   |                                           |  |    | Update
//   +      Assume                Add            v  v    |
//
// Initial +--------> Assumed +------------+---> Added <--+
//
//   ^                +   +               |  +
//   |                |   |               |  |
//   |                |   |           Add |  | Remove
//   |                |   |               |  |
//   |                |   |               +  |
//   +----------------+   +-----------> Expired +----> Deleted
//         Forget             Expire
//
// Note that an assumed pod can expire: if we haven't received its Add event for a while,
// something may have gone wrong and we shouldn't keep the pod in the cache any longer.
//
// Note that "Initial", "Expired", and "Deleted" pods do not actually exist in cache.
// Based on existing use cases, we are making the following assumptions:
// - No pod would be assumed twice
// - A pod could be added without going through scheduler. In this case, we will see Add but not Assume event.
// - If a pod wasn't added, it wouldn't be removed or updated.
// - Both "Expired" and "Deleted" are valid end states. In case of some problems, e.g. network issue,
// a pod might have changed its state (e.g. added and deleted) without delivering notification to the cache.
type Cache interface {
// NodeCount returns the number of nodes in the cache.
// DO NOT use outside of tests.
NodeCount() int
// PodCount returns the number of pods in the cache (including those from deleted nodes).
// DO NOT use outside of tests.
PodCount() (int, error)
// AssumePod assumes that a pod is scheduled and aggregates the pod's information into its node.
// The implementation also decides the policy for expiring an assumed pod before it is confirmed (i.e. before its Add event is received).
// After expiration, the pod's information is subtracted from the node.
AssumePod(logger klog.Logger, pod *v1.Pod) error
// FinishBinding signals that cache for assumed pod can be expired
FinishBinding(logger klog.Logger, pod *v1.Pod) error
// ForgetPod removes an assumed pod from cache.
ForgetPod(logger klog.Logger, pod *v1.Pod) error
// AddPod either confirms a pod if it's assumed, or adds it back if it's expired.
// If added back, the pod's information would be added again.
AddPod(logger klog.Logger, pod *v1.Pod) error
// UpdatePod removes oldPod's information and adds newPod's information.
UpdatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error
// RemovePod removes a pod. The pod's information would be subtracted from assigned node.
RemovePod(logger klog.Logger, pod *v1.Pod) error
// GetPod returns the pod from the cache with the same namespace and
// name as the specified pod.
GetPod(pod *v1.Pod) (*v1.Pod, error)
// IsAssumedPod returns true if the pod is assumed and not expired.
IsAssumedPod(pod *v1.Pod) (bool, error)
// AddNode adds overall information about node.
// It returns a clone of added NodeInfo object.
AddNode(logger klog.Logger, node *v1.Node) *framework.NodeInfo
// UpdateNode updates overall information about node.
// It returns a clone of updated NodeInfo object.
UpdateNode(logger klog.Logger, oldNode, newNode *v1.Node) *framework.NodeInfo
// RemoveNode removes overall information about node.
RemoveNode(logger klog.Logger, node *v1.Node) error
// UpdateSnapshot updates the passed infoSnapshot to the current contents of Cache.
// The node info contains aggregated information of pods scheduled (including assumed to be)
// on this node.
// The snapshot only includes Nodes that are not deleted at the time this function is called.
// nodeinfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
UpdateSnapshot(logger klog.Logger, nodeSnapshot *Snapshot) error
// Dump produces a dump of the current cache.
Dump() *Dump
}
// Dump is a dump of the cache state.
type Dump struct {
AssumedPods sets.Set[string]
Nodes map[string]*framework.NodeInfo
}
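// exampleBindingFlow is an illustrative sketch, not part of the upstream file, of the state
// machine documented above from the scheduler's point of view: the pod is assumed optimistically,
// a failed binding is rolled back with ForgetPod, and a successful binding calls FinishBinding so
// the assumed pod may expire if its confirming Add event never arrives. The bindErr parameter
// stands in for the result of the real bind call.
func exampleBindingFlow(logger klog.Logger, c Cache, pod *v1.Pod, bindErr error) error {
	if err := c.AssumePod(logger, pod); err != nil {
		return err
	}
	if bindErr != nil {
		// Binding failed: undo the optimistic assumption so the pod can be rescheduled.
		return c.ForgetPod(logger, pod)
	}
	// Binding succeeded: start the expiration clock; the informer's Add event will confirm the
	// pod (Assumed -> Added) when it arrives.
	return c.FinishBinding(logger, pod)
}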


@@ -0,0 +1,143 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"errors"
"fmt"
v1 "k8s.io/api/core/v1"
utilnode "k8s.io/component-helpers/node/topology"
"k8s.io/klog/v2"
)
// nodeTree is a tree-like data structure that holds node names in each zone. Zone names are
// keys to "NodeTree.tree" and values of "NodeTree.tree" are arrays of node names.
// NodeTree is NOT thread-safe; any concurrent updates/reads from it must be synchronized by the caller.
// It is used only by schedulerCache, and should stay as such.
type nodeTree struct {
tree map[string][]string // a map from zone (region-zone) to an array of nodes in the zone.
zones []string // a list of all the zones in the tree (keys)
numNodes int
}
// newNodeTree creates a NodeTree from nodes.
func newNodeTree(logger klog.Logger, nodes []*v1.Node) *nodeTree {
nt := &nodeTree{
tree: make(map[string][]string, len(nodes)),
}
for _, n := range nodes {
nt.addNode(logger, n)
}
return nt
}
// addNode adds a node and its corresponding zone to the tree. If the zone already exists, the node
// is added to the array of nodes in that zone.
func (nt *nodeTree) addNode(logger klog.Logger, n *v1.Node) {
zone := utilnode.GetZoneKey(n)
if na, ok := nt.tree[zone]; ok {
for _, nodeName := range na {
if nodeName == n.Name {
logger.Info("Did not add to the NodeTree because it already exists", "node", klog.KObj(n))
return
}
}
nt.tree[zone] = append(na, n.Name)
} else {
nt.zones = append(nt.zones, zone)
nt.tree[zone] = []string{n.Name}
}
logger.V(2).Info("Added node to NodeTree", "node", klog.KObj(n), "zone", zone)
nt.numNodes++
}
// removeNode removes a node from the NodeTree.
func (nt *nodeTree) removeNode(logger klog.Logger, n *v1.Node) error {
zone := utilnode.GetZoneKey(n)
if na, ok := nt.tree[zone]; ok {
for i, nodeName := range na {
if nodeName == n.Name {
nt.tree[zone] = append(na[:i], na[i+1:]...)
if len(nt.tree[zone]) == 0 {
nt.removeZone(zone)
}
logger.V(2).Info("Removed node from NodeTree", "node", klog.KObj(n), "zone", zone)
nt.numNodes--
return nil
}
}
}
logger.Error(nil, "Did not remove Node in NodeTree because it was not found", "node", klog.KObj(n), "zone", zone)
return fmt.Errorf("node %q in group %q was not found", n.Name, zone)
}
// removeZone removes a zone from tree.
// This function must be called while the writer lock is held.
func (nt *nodeTree) removeZone(zone string) {
delete(nt.tree, zone)
for i, z := range nt.zones {
if z == zone {
nt.zones = append(nt.zones[:i], nt.zones[i+1:]...)
return
}
}
}
// updateNode updates a node in the NodeTree.
func (nt *nodeTree) updateNode(logger klog.Logger, old, new *v1.Node) {
var oldZone string
if old != nil {
oldZone = utilnode.GetZoneKey(old)
}
newZone := utilnode.GetZoneKey(new)
// If the zone ID of the node has not changed, we don't need to do anything. Name of the node
// cannot be changed in an update.
if oldZone == newZone {
return
}
nt.removeNode(logger, old) // No error checking. We ignore whether the old node exists or not.
nt.addNode(logger, new)
}
// list returns the list of node names. NodeTree iterates over zones and in each zone iterates
// over nodes in a round robin fashion.
func (nt *nodeTree) list() ([]string, error) {
if len(nt.zones) == 0 {
return nil, nil
}
nodesList := make([]string, 0, nt.numNodes)
numExhaustedZones := 0
nodeIndex := 0
for len(nodesList) < nt.numNodes {
if numExhaustedZones >= len(nt.zones) { // all zones are exhausted.
return nodesList, errors.New("all zones exhausted before reaching count of nodes expected")
}
for zoneIndex := 0; zoneIndex < len(nt.zones); zoneIndex++ {
na := nt.tree[nt.zones[zoneIndex]]
if nodeIndex >= len(na) { // If the zone is exhausted, continue
if nodeIndex == len(na) { // If it is the first time the zone is exhausted
numExhaustedZones++
}
continue
}
nodesList = append(nodesList, na[nodeIndex])
}
nodeIndex++
}
return nodesList, nil
}
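// exampleZoneRoundRobin is an illustrative sketch, not part of the upstream file: with nodes n1
// and n2 in zone-a and n3 in zone-b, list() interleaves the zones index by index, returning
// [n1 n3 n2] rather than grouping all of zone-a's nodes together. Node and zone names are
// placeholders.
func exampleZoneRoundRobin(logger klog.Logger) ([]string, error) {
	nt := newNodeTree(logger, nil)
	for _, spec := range []struct{ name, zone string }{
		{"n1", "zone-a"}, {"n2", "zone-a"}, {"n3", "zone-b"},
	} {
		node := &v1.Node{}
		node.Name = spec.name
		node.Labels = map[string]string{v1.LabelTopologyZone: spec.zone}
		nt.addNode(logger, node)
	}
	return nt.list()
}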


@@ -0,0 +1,198 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// Snapshot is a snapshot of cache NodeInfo and NodeTree order. The scheduler takes a
// snapshot at the beginning of each scheduling cycle and uses it for its operations in that cycle.
type Snapshot struct {
// nodeInfoMap a map of node name to a snapshot of its NodeInfo.
nodeInfoMap map[string]*framework.NodeInfo
// nodeInfoList is the list of nodes as ordered in the cache's nodeTree.
nodeInfoList []*framework.NodeInfo
// havePodsWithAffinityNodeInfoList is the list of nodes with at least one pod declaring affinity terms.
havePodsWithAffinityNodeInfoList []*framework.NodeInfo
// havePodsWithRequiredAntiAffinityNodeInfoList is the list of nodes with at least one pod declaring
// required anti-affinity terms.
havePodsWithRequiredAntiAffinityNodeInfoList []*framework.NodeInfo
// usedPVCSet contains a set of PVC names that have one or more scheduled pods using them,
// keyed in the format "namespace/name".
usedPVCSet sets.Set[string]
generation int64
}
var _ framework.SharedLister = &Snapshot{}
// NewEmptySnapshot initializes a Snapshot struct and returns it.
func NewEmptySnapshot() *Snapshot {
return &Snapshot{
nodeInfoMap: make(map[string]*framework.NodeInfo),
usedPVCSet: sets.New[string](),
}
}
// NewSnapshot initializes a Snapshot struct and returns it.
func NewSnapshot(pods []*v1.Pod, nodes []*v1.Node) *Snapshot {
nodeInfoMap := createNodeInfoMap(pods, nodes)
nodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
havePodsWithAffinityNodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
havePodsWithRequiredAntiAffinityNodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
for _, v := range nodeInfoMap {
nodeInfoList = append(nodeInfoList, v)
if len(v.PodsWithAffinity) > 0 {
havePodsWithAffinityNodeInfoList = append(havePodsWithAffinityNodeInfoList, v)
}
if len(v.PodsWithRequiredAntiAffinity) > 0 {
havePodsWithRequiredAntiAffinityNodeInfoList = append(havePodsWithRequiredAntiAffinityNodeInfoList, v)
}
}
s := NewEmptySnapshot()
s.nodeInfoMap = nodeInfoMap
s.nodeInfoList = nodeInfoList
s.havePodsWithAffinityNodeInfoList = havePodsWithAffinityNodeInfoList
s.havePodsWithRequiredAntiAffinityNodeInfoList = havePodsWithRequiredAntiAffinityNodeInfoList
s.usedPVCSet = createUsedPVCSet(pods)
return s
}
// createNodeInfoMap obtains a list of pods and pivots that list into a map
// where the keys are node names and the values are the aggregated information
// for that node.
func createNodeInfoMap(pods []*v1.Pod, nodes []*v1.Node) map[string]*framework.NodeInfo {
nodeNameToInfo := make(map[string]*framework.NodeInfo)
for _, pod := range pods {
nodeName := pod.Spec.NodeName
if _, ok := nodeNameToInfo[nodeName]; !ok {
nodeNameToInfo[nodeName] = framework.NewNodeInfo()
}
nodeNameToInfo[nodeName].AddPod(pod)
}
imageExistenceMap := createImageExistenceMap(nodes)
for _, node := range nodes {
if _, ok := nodeNameToInfo[node.Name]; !ok {
nodeNameToInfo[node.Name] = framework.NewNodeInfo()
}
nodeInfo := nodeNameToInfo[node.Name]
nodeInfo.SetNode(node)
nodeInfo.ImageStates = getNodeImageStates(node, imageExistenceMap)
}
return nodeNameToInfo
}
func createUsedPVCSet(pods []*v1.Pod) sets.Set[string] {
usedPVCSet := sets.New[string]()
for _, pod := range pods {
if pod.Spec.NodeName == "" {
continue
}
for _, v := range pod.Spec.Volumes {
if v.PersistentVolumeClaim == nil {
continue
}
key := framework.GetNamespacedName(pod.Namespace, v.PersistentVolumeClaim.ClaimName)
usedPVCSet.Insert(key)
}
}
return usedPVCSet
}
// getNodeImageStates returns the given node's image states based on the given imageExistence map.
func getNodeImageStates(node *v1.Node, imageExistenceMap map[string]sets.Set[string]) map[string]*framework.ImageStateSummary {
imageStates := make(map[string]*framework.ImageStateSummary)
for _, image := range node.Status.Images {
for _, name := range image.Names {
imageStates[name] = &framework.ImageStateSummary{
Size: image.SizeBytes,
NumNodes: imageExistenceMap[name].Len(),
}
}
}
return imageStates
}
// createImageExistenceMap returns a map recording on which nodes the images exist, keyed by the images' names.
func createImageExistenceMap(nodes []*v1.Node) map[string]sets.Set[string] {
imageExistenceMap := make(map[string]sets.Set[string])
for _, node := range nodes {
for _, image := range node.Status.Images {
for _, name := range image.Names {
if _, ok := imageExistenceMap[name]; !ok {
imageExistenceMap[name] = sets.New(node.Name)
} else {
imageExistenceMap[name].Insert(node.Name)
}
}
}
}
return imageExistenceMap
}
// NodeInfos returns a NodeInfoLister.
func (s *Snapshot) NodeInfos() framework.NodeInfoLister {
return s
}
// StorageInfos returns a StorageInfoLister.
func (s *Snapshot) StorageInfos() framework.StorageInfoLister {
return s
}
// NumNodes returns the number of nodes in the snapshot.
func (s *Snapshot) NumNodes() int {
return len(s.nodeInfoList)
}
// List returns the list of nodes in the snapshot.
func (s *Snapshot) List() ([]*framework.NodeInfo, error) {
return s.nodeInfoList, nil
}
// HavePodsWithAffinityList returns the list of nodes with at least one pod with inter-pod affinity
func (s *Snapshot) HavePodsWithAffinityList() ([]*framework.NodeInfo, error) {
return s.havePodsWithAffinityNodeInfoList, nil
}
// HavePodsWithRequiredAntiAffinityList returns the list of nodes with at least one pod with
// required inter-pod anti-affinity
func (s *Snapshot) HavePodsWithRequiredAntiAffinityList() ([]*framework.NodeInfo, error) {
return s.havePodsWithRequiredAntiAffinityNodeInfoList, nil
}
// Get returns the NodeInfo of the given node name.
func (s *Snapshot) Get(nodeName string) (*framework.NodeInfo, error) {
if v, ok := s.nodeInfoMap[nodeName]; ok && v.Node() != nil {
return v, nil
}
return nil, fmt.Errorf("nodeinfo not found for node name %q", nodeName)
}
func (s *Snapshot) IsPVCUsedByPods(key string) bool {
return s.usedPVCSet.Has(key)
}
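// examplePVCUsage is an illustrative sketch, not part of the upstream file: a pod that is already
// assigned to a node and mounts a PVC makes that claim appear in the snapshot's used-PVC set
// under the "namespace/name" key. All object names are placeholders.
func examplePVCUsage() bool {
	pod := &v1.Pod{}
	pod.Namespace, pod.Name = "default", "web-0"
	pod.Spec.NodeName = "node-1"
	pod.Spec.Volumes = []v1.Volume{{
		Name: "data",
		VolumeSource: v1.VolumeSource{
			PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ClaimName: "data-web-0"},
		},
	}}
	node := &v1.Node{}
	node.Name = "node-1"

	snap := NewSnapshot([]*v1.Pod{pod}, []*v1.Node{node})
	return snap.IsPVCUsedByPods("default/data-web-0") // true
}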