mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-06-13 10:33:35 +00:00
Update to kube v1.17
Signed-off-by: Humble Chirammal <hchiramm@redhat.com>
This commit is contained in:
committed by
mergify[bot]
parent
327fcd1b1b
commit
3af1e26d7c
210
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/csi_volume_predicate.go
generated
vendored
210
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/csi_volume_predicate.go
generated
vendored
@ -19,83 +19,122 @@ package predicates
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
storagev1 "k8s.io/api/storage/v1"
|
||||
"k8s.io/apimachinery/pkg/util/rand"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
storagelisters "k8s.io/client-go/listers/storage/v1"
|
||||
csitrans "k8s.io/csi-translation-lib"
|
||||
"k8s.io/klog"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
||||
schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
|
||||
volumeutil "k8s.io/kubernetes/pkg/volume/util"
|
||||
)
|
||||
|
||||
// InTreeToCSITranslator contains methods required to check migratable status
|
||||
// and perform translations from InTree PV's to CSI
|
||||
type InTreeToCSITranslator interface {
|
||||
IsPVMigratable(pv *v1.PersistentVolume) bool
|
||||
IsMigratableIntreePluginByName(inTreePluginName string) bool
|
||||
GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
|
||||
GetCSINameFromInTreeName(pluginName string) (string, error)
|
||||
TranslateInTreePVToCSI(pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
|
||||
}
|
||||
|
||||
// CSIMaxVolumeLimitChecker defines predicate needed for counting CSI volumes
|
||||
type CSIMaxVolumeLimitChecker struct {
|
||||
pvInfo PersistentVolumeInfo
|
||||
pvcInfo PersistentVolumeClaimInfo
|
||||
scInfo StorageClassInfo
|
||||
csiNodeLister storagelisters.CSINodeLister
|
||||
pvLister corelisters.PersistentVolumeLister
|
||||
pvcLister corelisters.PersistentVolumeClaimLister
|
||||
scLister storagelisters.StorageClassLister
|
||||
|
||||
randomVolumeIDPrefix string
|
||||
|
||||
translator InTreeToCSITranslator
|
||||
}
|
||||
|
||||
// NewCSIMaxVolumeLimitPredicate returns a predicate for counting CSI volumes
|
||||
func NewCSIMaxVolumeLimitPredicate(
|
||||
pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo, scInfo StorageClassInfo) FitPredicate {
|
||||
csiNodeLister storagelisters.CSINodeLister, pvLister corelisters.PersistentVolumeLister, pvcLister corelisters.PersistentVolumeClaimLister, scLister storagelisters.StorageClassLister) FitPredicate {
|
||||
c := &CSIMaxVolumeLimitChecker{
|
||||
pvInfo: pvInfo,
|
||||
pvcInfo: pvcInfo,
|
||||
scInfo: scInfo,
|
||||
csiNodeLister: csiNodeLister,
|
||||
pvLister: pvLister,
|
||||
pvcLister: pvcLister,
|
||||
scLister: scLister,
|
||||
randomVolumeIDPrefix: rand.String(32),
|
||||
translator: csitrans.New(),
|
||||
}
|
||||
return c.attachableLimitPredicate
|
||||
}
|
||||
|
||||
func (c *CSIMaxVolumeLimitChecker) attachableLimitPredicate(
|
||||
pod *v1.Pod, meta PredicateMetadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
|
||||
|
||||
// if feature gate is disable we return
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.AttachVolumeLimit) {
|
||||
return true, nil, nil
|
||||
func getVolumeLimits(nodeInfo *schedulernodeinfo.NodeInfo, csiNode *storagev1.CSINode) map[v1.ResourceName]int64 {
|
||||
// TODO: stop getting values from Node object in v1.18
|
||||
nodeVolumeLimits := nodeInfo.VolumeLimits()
|
||||
if csiNode != nil {
|
||||
for i := range csiNode.Spec.Drivers {
|
||||
d := csiNode.Spec.Drivers[i]
|
||||
if d.Allocatable != nil && d.Allocatable.Count != nil {
|
||||
// TODO: drop GetCSIAttachLimitKey once we don't get values from Node object (v1.18)
|
||||
k := v1.ResourceName(volumeutil.GetCSIAttachLimitKey(d.Name))
|
||||
nodeVolumeLimits[k] = int64(*d.Allocatable.Count)
|
||||
}
|
||||
}
|
||||
}
|
||||
// If a pod doesn't have any volume attached to it, the predicate will always be true.
|
||||
// Thus we make a fast path for it, to avoid unnecessary computations in this case.
|
||||
return nodeVolumeLimits
|
||||
}
|
||||
|
||||
func (c *CSIMaxVolumeLimitChecker) attachableLimitPredicate(
|
||||
pod *v1.Pod, meta Metadata, nodeInfo *schedulernodeinfo.NodeInfo) (bool, []PredicateFailureReason, error) {
|
||||
// If the new pod doesn't have any volume attached to it, the predicate will always be true
|
||||
if len(pod.Spec.Volumes) == 0 {
|
||||
return true, nil, nil
|
||||
}
|
||||
|
||||
nodeVolumeLimits := nodeInfo.VolumeLimits()
|
||||
|
||||
// if node does not have volume limits this predicate should exit
|
||||
if len(nodeVolumeLimits) == 0 {
|
||||
return true, nil, nil
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
return false, nil, fmt.Errorf("node not found")
|
||||
}
|
||||
|
||||
// If CSINode doesn't exist, the predicate may read the limits from Node object
|
||||
csiNode, err := c.csiNodeLister.Get(node.Name)
|
||||
if err != nil {
|
||||
// TODO: return the error once CSINode is created by default (2 releases)
|
||||
klog.V(5).Infof("Could not get a CSINode object for the node: %v", err)
|
||||
}
|
||||
|
||||
// a map of unique volume name/csi volume handle and volume limit key
|
||||
newVolumes := make(map[string]string)
|
||||
if err := c.filterAttachableVolumes(pod.Spec.Volumes, pod.Namespace, newVolumes); err != nil {
|
||||
if err := c.filterAttachableVolumes(csiNode, pod.Spec.Volumes, pod.Namespace, newVolumes); err != nil {
|
||||
return false, nil, err
|
||||
}
|
||||
|
||||
// If the pod doesn't have any new CSI volumes, the predicate will always be true
|
||||
if len(newVolumes) == 0 {
|
||||
return true, nil, nil
|
||||
}
|
||||
|
||||
// a map of unique volume name/csi volume handle and volume limit key
|
||||
// If the node doesn't have volume limits, the predicate will always be true
|
||||
nodeVolumeLimits := getVolumeLimits(nodeInfo, csiNode)
|
||||
if len(nodeVolumeLimits) == 0 {
|
||||
return true, nil, nil
|
||||
}
|
||||
|
||||
attachedVolumes := make(map[string]string)
|
||||
for _, existingPod := range nodeInfo.Pods() {
|
||||
if err := c.filterAttachableVolumes(existingPod.Spec.Volumes, existingPod.Namespace, attachedVolumes); err != nil {
|
||||
if err := c.filterAttachableVolumes(csiNode, existingPod.Spec.Volumes, existingPod.Namespace, attachedVolumes); err != nil {
|
||||
return false, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
newVolumeCount := map[string]int{}
|
||||
attachedVolumeCount := map[string]int{}
|
||||
|
||||
for volumeName, volumeLimitKey := range attachedVolumes {
|
||||
if _, ok := newVolumes[volumeName]; ok {
|
||||
delete(newVolumes, volumeName)
|
||||
for volumeUniqueName, volumeLimitKey := range attachedVolumes {
|
||||
if _, ok := newVolumes[volumeUniqueName]; ok {
|
||||
// Don't count single volume used in multiple pods more than once
|
||||
delete(newVolumes, volumeUniqueName)
|
||||
}
|
||||
attachedVolumeCount[volumeLimitKey]++
|
||||
}
|
||||
|
||||
newVolumeCount := map[string]int{}
|
||||
for _, volumeLimitKey := range newVolumes {
|
||||
newVolumeCount[volumeLimitKey]++
|
||||
}
|
||||
@ -114,8 +153,7 @@ func (c *CSIMaxVolumeLimitChecker) attachableLimitPredicate(
|
||||
}
|
||||
|
||||
func (c *CSIMaxVolumeLimitChecker) filterAttachableVolumes(
|
||||
volumes []v1.Volume, namespace string, result map[string]string) error {
|
||||
|
||||
csiNode *storagev1.CSINode, volumes []v1.Volume, namespace string, result map[string]string) error {
|
||||
for _, vol := range volumes {
|
||||
// CSI volumes can only be used as persistent volumes
|
||||
if vol.PersistentVolumeClaim == nil {
|
||||
@ -127,77 +165,121 @@ func (c *CSIMaxVolumeLimitChecker) filterAttachableVolumes(
|
||||
return fmt.Errorf("PersistentVolumeClaim had no name")
|
||||
}
|
||||
|
||||
pvc, err := c.pvcInfo.GetPersistentVolumeClaimInfo(namespace, pvcName)
|
||||
pvc, err := c.pvcLister.PersistentVolumeClaims(namespace).Get(pvcName)
|
||||
|
||||
if err != nil {
|
||||
klog.V(4).Infof("Unable to look up PVC info for %s/%s", namespace, pvcName)
|
||||
klog.V(5).Infof("Unable to look up PVC info for %s/%s", namespace, pvcName)
|
||||
continue
|
||||
}
|
||||
|
||||
driverName, volumeHandle := c.getCSIDriver(pvc)
|
||||
// if we can't find driver name or volume handle - we don't count this volume.
|
||||
driverName, volumeHandle := c.getCSIDriverInfo(csiNode, pvc)
|
||||
if driverName == "" || volumeHandle == "" {
|
||||
klog.V(5).Infof("Could not find a CSI driver name or volume handle, not counting volume")
|
||||
continue
|
||||
}
|
||||
volumeLimitKey := volumeutil.GetCSIAttachLimitKey(driverName)
|
||||
result[volumeHandle] = volumeLimitKey
|
||||
|
||||
volumeUniqueName := fmt.Sprintf("%s/%s", driverName, volumeHandle)
|
||||
volumeLimitKey := volumeutil.GetCSIAttachLimitKey(driverName)
|
||||
result[volumeUniqueName] = volumeLimitKey
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *CSIMaxVolumeLimitChecker) getCSIDriver(pvc *v1.PersistentVolumeClaim) (string, string) {
|
||||
// getCSIDriverInfo returns the CSI driver name and volume ID of a given PVC.
|
||||
// If the PVC is from a migrated in-tree plugin, this function will return
|
||||
// the information of the CSI driver that the plugin has been migrated to.
|
||||
func (c *CSIMaxVolumeLimitChecker) getCSIDriverInfo(csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
|
||||
pvName := pvc.Spec.VolumeName
|
||||
namespace := pvc.Namespace
|
||||
pvcName := pvc.Name
|
||||
|
||||
placeHolderCSIDriver := ""
|
||||
placeHolderHandle := ""
|
||||
if pvName == "" {
|
||||
klog.V(5).Infof("Persistent volume had no name for claim %s/%s", namespace, pvcName)
|
||||
return c.getDriverNameFromSC(pvc)
|
||||
return c.getCSIDriverInfoFromSC(csiNode, pvc)
|
||||
}
|
||||
pv, err := c.pvInfo.GetPersistentVolumeInfo(pvName)
|
||||
|
||||
pv, err := c.pvLister.Get(pvName)
|
||||
if err != nil {
|
||||
klog.V(4).Infof("Unable to look up PV info for PVC %s/%s and PV %s", namespace, pvcName, pvName)
|
||||
klog.V(5).Infof("Unable to look up PV info for PVC %s/%s and PV %s", namespace, pvcName, pvName)
|
||||
// If we can't fetch PV associated with PVC, maybe it got deleted
|
||||
// or PVC was prebound to a PV that hasn't been created yet.
|
||||
// fallback to using StorageClass for volume counting
|
||||
return c.getDriverNameFromSC(pvc)
|
||||
return c.getCSIDriverInfoFromSC(csiNode, pvc)
|
||||
}
|
||||
|
||||
csiSource := pv.Spec.PersistentVolumeSource.CSI
|
||||
if csiSource == nil {
|
||||
klog.V(5).Infof("Not considering non-CSI volume %s/%s", namespace, pvcName)
|
||||
return placeHolderCSIDriver, placeHolderHandle
|
||||
// We make a fast path for non-CSI volumes that aren't migratable
|
||||
if !c.translator.IsPVMigratable(pv) {
|
||||
return "", ""
|
||||
}
|
||||
|
||||
pluginName, err := c.translator.GetInTreePluginNameFromSpec(pv, nil)
|
||||
if err != nil {
|
||||
klog.V(5).Infof("Unable to look up plugin name from PV spec: %v", err)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
if !isCSIMigrationOn(csiNode, pluginName) {
|
||||
klog.V(5).Infof("CSI Migration of plugin %s is not enabled", pluginName)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
csiPV, err := c.translator.TranslateInTreePVToCSI(pv)
|
||||
if err != nil {
|
||||
klog.V(5).Infof("Unable to translate in-tree volume to CSI: %v", err)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
if csiPV.Spec.PersistentVolumeSource.CSI == nil {
|
||||
klog.V(5).Infof("Unable to get a valid volume source for translated PV %s", pvName)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
csiSource = csiPV.Spec.PersistentVolumeSource.CSI
|
||||
}
|
||||
|
||||
return csiSource.Driver, csiSource.VolumeHandle
|
||||
}
|
||||
|
||||
func (c *CSIMaxVolumeLimitChecker) getDriverNameFromSC(pvc *v1.PersistentVolumeClaim) (string, string) {
|
||||
// getCSIDriverInfoFromSC returns the CSI driver name and a random volume ID of a given PVC's StorageClass.
|
||||
func (c *CSIMaxVolumeLimitChecker) getCSIDriverInfoFromSC(csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
|
||||
namespace := pvc.Namespace
|
||||
pvcName := pvc.Name
|
||||
scName := pvc.Spec.StorageClassName
|
||||
scName := v1helper.GetPersistentVolumeClaimClass(pvc)
|
||||
|
||||
placeHolderCSIDriver := ""
|
||||
placeHolderHandle := ""
|
||||
if scName == nil {
|
||||
// if StorageClass is not set or found, then PVC must be using immediate binding mode
|
||||
// and hence it must be bound before scheduling. So it is safe to not count it.
|
||||
klog.V(5).Infof("pvc %s/%s has no storageClass", namespace, pvcName)
|
||||
return placeHolderCSIDriver, placeHolderHandle
|
||||
// If StorageClass is not set or not found, then PVC must be using immediate binding mode
|
||||
// and hence it must be bound before scheduling. So it is safe to not count it.
|
||||
if scName == "" {
|
||||
klog.V(5).Infof("PVC %s/%s has no StorageClass", namespace, pvcName)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
storageClass, err := c.scInfo.GetStorageClassInfo(*scName)
|
||||
storageClass, err := c.scLister.Get(scName)
|
||||
if err != nil {
|
||||
klog.V(5).Infof("no storage %s found for pvc %s/%s", *scName, namespace, pvcName)
|
||||
return placeHolderCSIDriver, placeHolderHandle
|
||||
klog.V(5).Infof("Could not get StorageClass for PVC %s/%s: %v", namespace, pvcName, err)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
// We use random prefix to avoid conflict with volume-ids. If PVC is bound in the middle
|
||||
// predicate and there is another pod(on same node) that uses same volume then we will overcount
|
||||
// We use random prefix to avoid conflict with volume IDs. If PVC is bound during the execution of the
|
||||
// predicate and there is another pod on the same node that uses same volume, then we will overcount
|
||||
// the volume and consider both volumes as different.
|
||||
volumeHandle := fmt.Sprintf("%s-%s/%s", c.randomVolumeIDPrefix, namespace, pvcName)
|
||||
return storageClass.Provisioner, volumeHandle
|
||||
|
||||
provisioner := storageClass.Provisioner
|
||||
if c.translator.IsMigratableIntreePluginByName(provisioner) {
|
||||
if !isCSIMigrationOn(csiNode, provisioner) {
|
||||
klog.V(5).Infof("CSI Migration of plugin %s is not enabled", provisioner)
|
||||
return "", ""
|
||||
}
|
||||
|
||||
driverName, err := c.translator.GetCSINameFromInTreeName(provisioner)
|
||||
if err != nil {
|
||||
klog.V(5).Infof("Unable to look up driver name from plugin name: %v", err)
|
||||
return "", ""
|
||||
}
|
||||
return driverName, volumeHandle
|
||||
}
|
||||
|
||||
return provisioner, volumeHandle
|
||||
}
|
||||
|
34
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/error.go
generated
vendored
34
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/error.go
generated
vendored
@ -75,11 +75,45 @@ var (
|
||||
ErrVolumeNodeConflict = newPredicateFailureError("VolumeNodeAffinityConflict", "node(s) had volume node affinity conflict")
|
||||
// ErrVolumeBindConflict is used for VolumeBindingNoMatch predicate error.
|
||||
ErrVolumeBindConflict = newPredicateFailureError("VolumeBindingNoMatch", "node(s) didn't find available persistent volumes to bind")
|
||||
// ErrTopologySpreadConstraintsNotMatch is used for EvenPodsSpread predicate error.
|
||||
ErrTopologySpreadConstraintsNotMatch = newPredicateFailureError("EvenPodsSpreadNotMatch", "node(s) didn't match pod topology spread constraints")
|
||||
// ErrFakePredicate is used for test only. The fake predicates returning false also return
|
||||
// ErrFakePredicate as the error.
|
||||
ErrFakePredicate = newPredicateFailureError("FakePredicateError", "Nodes failed the fake predicate")
|
||||
)
|
||||
|
||||
var unresolvablePredicateFailureErrors = map[PredicateFailureReason]struct{}{
|
||||
ErrNodeSelectorNotMatch: {},
|
||||
ErrPodAffinityRulesNotMatch: {},
|
||||
ErrPodNotMatchHostName: {},
|
||||
ErrTaintsTolerationsNotMatch: {},
|
||||
ErrNodeLabelPresenceViolated: {},
|
||||
// Node conditions won't change when scheduler simulates removal of preemption victims.
|
||||
// So, it is pointless to try nodes that have not been able to host the pod due to node
|
||||
// conditions. These include ErrNodeNotReady, ErrNodeUnderPIDPressure, ErrNodeUnderMemoryPressure, ....
|
||||
ErrNodeNotReady: {},
|
||||
ErrNodeNetworkUnavailable: {},
|
||||
ErrNodeUnderDiskPressure: {},
|
||||
ErrNodeUnderPIDPressure: {},
|
||||
ErrNodeUnderMemoryPressure: {},
|
||||
ErrNodeUnschedulable: {},
|
||||
ErrNodeUnknownCondition: {},
|
||||
ErrVolumeZoneConflict: {},
|
||||
ErrVolumeNodeConflict: {},
|
||||
ErrVolumeBindConflict: {},
|
||||
}
|
||||
|
||||
// UnresolvablePredicateExists checks if there is at least one unresolvable predicate failure reason, if true
|
||||
// returns the first one in the list.
|
||||
func UnresolvablePredicateExists(reasons []PredicateFailureReason) PredicateFailureReason {
|
||||
for _, r := range reasons {
|
||||
if _, ok := unresolvablePredicateFailureErrors[r]; ok {
|
||||
return r
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// InsufficientResourceError is an error type that indicates what kind of resource limit is
|
||||
// hit and caused the unfitting failure.
|
||||
type InsufficientResourceError struct {
|
||||
|
753
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/metadata.go
generated
vendored
753
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/metadata.go
generated
vendored
@ -19,35 +19,31 @@ package predicates
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
"k8s.io/klog"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/client-go/util/workqueue"
|
||||
"k8s.io/kubernetes/pkg/scheduler/algorithm"
|
||||
priorityutil "k8s.io/kubernetes/pkg/scheduler/algorithm/priorities/util"
|
||||
schedulerlisters "k8s.io/kubernetes/pkg/scheduler/listers"
|
||||
schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
|
||||
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
|
||||
)
|
||||
|
||||
// PredicateMetadata interface represents anything that can access a predicate metadata.
|
||||
type PredicateMetadata interface {
|
||||
ShallowCopy() PredicateMetadata
|
||||
AddPod(addedPod *v1.Pod, nodeInfo *schedulernodeinfo.NodeInfo) error
|
||||
RemovePod(deletedPod *v1.Pod) error
|
||||
// Metadata interface represents anything that can access a predicate metadata.
|
||||
type Metadata interface {
|
||||
ShallowCopy() Metadata
|
||||
AddPod(addedPod *v1.Pod, node *v1.Node) error
|
||||
RemovePod(deletedPod *v1.Pod, node *v1.Node) error
|
||||
}
|
||||
|
||||
// PredicateMetadataProducer is a function that computes predicate metadata for a given pod.
|
||||
type PredicateMetadataProducer func(pod *v1.Pod, nodeNameToInfo map[string]*schedulernodeinfo.NodeInfo) PredicateMetadata
|
||||
|
||||
// PredicateMetadataFactory defines a factory of predicate metadata.
|
||||
type PredicateMetadataFactory struct {
|
||||
podLister algorithm.PodLister
|
||||
}
|
||||
// MetadataProducer is a function that computes predicate metadata for a given pod.
|
||||
type MetadataProducer func(pod *v1.Pod, sharedLister schedulerlisters.SharedLister) Metadata
|
||||
|
||||
// AntiAffinityTerm's topology key value used in predicate metadata
|
||||
type topologyPair struct {
|
||||
@ -66,14 +62,130 @@ type topologyPairsMaps struct {
|
||||
podToTopologyPairs map[string]topologyPairSet
|
||||
}
|
||||
|
||||
// NOTE: When new fields are added/removed or logic is changed, please make sure that
|
||||
// RemovePod, AddPod, and ShallowCopy functions are updated to work with the new changes.
|
||||
type predicateMetadata struct {
|
||||
pod *v1.Pod
|
||||
podBestEffort bool
|
||||
podRequest *schedulernodeinfo.Resource
|
||||
podPorts []*v1.ContainerPort
|
||||
type criticalPath struct {
|
||||
// topologyValue denotes the topology value mapping to topology key.
|
||||
topologyValue string
|
||||
// matchNum denotes the number of matching pods.
|
||||
matchNum int32
|
||||
}
|
||||
|
||||
// CAVEAT: the reason that `[2]criticalPath` can work is based on the implementation of current
|
||||
// preemption algorithm, in particular the following 2 facts:
|
||||
// Fact 1: we only preempt pods on the same node, instead of pods on multiple nodes.
|
||||
// Fact 2: each node is evaluated on a separate copy of the metadata during its preemption cycle.
|
||||
// If we plan to turn to a more complex algorithm like "arbitrary pods on multiple nodes", this
|
||||
// structure needs to be revisited.
|
||||
type criticalPaths [2]criticalPath
|
||||
|
||||
func newCriticalPaths() *criticalPaths {
|
||||
return &criticalPaths{{matchNum: math.MaxInt32}, {matchNum: math.MaxInt32}}
|
||||
}
|
||||
|
||||
func (paths *criticalPaths) update(tpVal string, num int32) {
|
||||
// first verify if `tpVal` exists or not
|
||||
i := -1
|
||||
if tpVal == paths[0].topologyValue {
|
||||
i = 0
|
||||
} else if tpVal == paths[1].topologyValue {
|
||||
i = 1
|
||||
}
|
||||
|
||||
if i >= 0 {
|
||||
// `tpVal` exists
|
||||
paths[i].matchNum = num
|
||||
if paths[0].matchNum > paths[1].matchNum {
|
||||
// swap paths[0] and paths[1]
|
||||
paths[0], paths[1] = paths[1], paths[0]
|
||||
}
|
||||
} else {
|
||||
// `tpVal` doesn't exist
|
||||
if num < paths[0].matchNum {
|
||||
// update paths[1] with paths[0]
|
||||
paths[1] = paths[0]
|
||||
// update paths[0]
|
||||
paths[0].topologyValue, paths[0].matchNum = tpVal, num
|
||||
} else if num < paths[1].matchNum {
|
||||
// update paths[1]
|
||||
paths[1].topologyValue, paths[1].matchNum = tpVal, num
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// evenPodsSpreadMetadata combines tpKeyToCriticalPaths and tpPairToMatchNum
|
||||
// to represent:
|
||||
// (1) critical paths where the least pods are matched on each spread constraint.
|
||||
// (2) number of pods matched on each spread constraint.
|
||||
type evenPodsSpreadMetadata struct {
|
||||
constraints []topologySpreadConstraint
|
||||
// We record 2 critical paths instead of all critical paths here.
|
||||
// criticalPaths[0].matchNum always holds the minimum matching number.
|
||||
// criticalPaths[1].matchNum is always greater than or equal to criticalPaths[0].matchNum, but
|
||||
// it's not guaranteed to be the 2nd minimum match number.
|
||||
tpKeyToCriticalPaths map[string]*criticalPaths
|
||||
// tpPairToMatchNum is keyed with topologyPair, and valued with the number of matching pods.
|
||||
tpPairToMatchNum map[topologyPair]int32
|
||||
}
|
||||
|
||||
// topologySpreadConstraint is an internal version for a hard (DoNotSchedule
|
||||
// unsatisfiable constraint action) v1.TopologySpreadConstraint and where the
|
||||
// selector is parsed.
|
||||
type topologySpreadConstraint struct {
|
||||
maxSkew int32
|
||||
topologyKey string
|
||||
selector labels.Selector
|
||||
}
|
||||
|
||||
type serviceAffinityMetadata struct {
|
||||
matchingPodList []*v1.Pod
|
||||
matchingPodServices []*v1.Service
|
||||
}
|
||||
|
||||
func (m *serviceAffinityMetadata) addPod(addedPod *v1.Pod, pod *v1.Pod, node *v1.Node) {
|
||||
// If addedPod is in the same namespace as the pod, update the list
|
||||
// of matching pods if applicable.
|
||||
if m == nil || addedPod.Namespace != pod.Namespace {
|
||||
return
|
||||
}
|
||||
|
||||
selector := CreateSelectorFromLabels(pod.Labels)
|
||||
if selector.Matches(labels.Set(addedPod.Labels)) {
|
||||
m.matchingPodList = append(m.matchingPodList, addedPod)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *serviceAffinityMetadata) removePod(deletedPod *v1.Pod, node *v1.Node) {
|
||||
deletedPodFullName := schedutil.GetPodFullName(deletedPod)
|
||||
|
||||
if m == nil ||
|
||||
len(m.matchingPodList) == 0 ||
|
||||
deletedPod.Namespace != m.matchingPodList[0].Namespace {
|
||||
return
|
||||
}
|
||||
|
||||
for i, pod := range m.matchingPodList {
|
||||
if schedutil.GetPodFullName(pod) == deletedPodFullName {
|
||||
m.matchingPodList = append(m.matchingPodList[:i], m.matchingPodList[i+1:]...)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *serviceAffinityMetadata) clone() *serviceAffinityMetadata {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
copy := serviceAffinityMetadata{}
|
||||
|
||||
copy.matchingPodServices = append([]*v1.Service(nil),
|
||||
m.matchingPodServices...)
|
||||
copy.matchingPodList = append([]*v1.Pod(nil),
|
||||
m.matchingPodList...)
|
||||
|
||||
return &copy
|
||||
}
|
||||
|
||||
type podAffinityMetadata struct {
|
||||
topologyPairsAntiAffinityPodsMap *topologyPairsMaps
|
||||
// A map of topology pairs to a list of Pods that can potentially match
|
||||
// the affinity terms of the "pod" and its inverse.
|
||||
@ -81,9 +193,70 @@ type predicateMetadata struct {
|
||||
// A map of topology pairs to a list of Pods that can potentially match
|
||||
// the anti-affinity terms of the "pod" and its inverse.
|
||||
topologyPairsPotentialAntiAffinityPods *topologyPairsMaps
|
||||
serviceAffinityInUse bool
|
||||
serviceAffinityMatchingPodList []*v1.Pod
|
||||
serviceAffinityMatchingPodServices []*v1.Service
|
||||
}
|
||||
|
||||
func (m *podAffinityMetadata) addPod(addedPod *v1.Pod, pod *v1.Pod, node *v1.Node) error {
|
||||
// Add matching anti-affinity terms of the addedPod to the map.
|
||||
topologyPairsMaps, err := getMatchingAntiAffinityTopologyPairsOfPod(pod, addedPod, node)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
m.topologyPairsAntiAffinityPodsMap.appendMaps(topologyPairsMaps)
|
||||
// Add the pod to nodeNameToMatchingAffinityPods and nodeNameToMatchingAntiAffinityPods if needed.
|
||||
affinity := pod.Spec.Affinity
|
||||
podNodeName := addedPod.Spec.NodeName
|
||||
if affinity != nil && len(podNodeName) > 0 {
|
||||
// It is assumed that when the added pod matches affinity of the pod, all the terms must match,
|
||||
// this should be changed when the implementation of targetPodMatchesAffinityOfPod/podMatchesAffinityTermProperties
|
||||
// is changed
|
||||
if targetPodMatchesAffinityOfPod(pod, addedPod) {
|
||||
affinityTerms := GetPodAffinityTerms(affinity.PodAffinity)
|
||||
for _, term := range affinityTerms {
|
||||
if topologyValue, ok := node.Labels[term.TopologyKey]; ok {
|
||||
pair := topologyPair{key: term.TopologyKey, value: topologyValue}
|
||||
m.topologyPairsPotentialAffinityPods.addTopologyPair(pair, addedPod)
|
||||
}
|
||||
}
|
||||
}
|
||||
if targetPodMatchesAntiAffinityOfPod(pod, addedPod) {
|
||||
antiAffinityTerms := GetPodAntiAffinityTerms(affinity.PodAntiAffinity)
|
||||
for _, term := range antiAffinityTerms {
|
||||
if topologyValue, ok := node.Labels[term.TopologyKey]; ok {
|
||||
pair := topologyPair{key: term.TopologyKey, value: topologyValue}
|
||||
m.topologyPairsPotentialAntiAffinityPods.addTopologyPair(pair, addedPod)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *podAffinityMetadata) removePod(deletedPod *v1.Pod) {
|
||||
if m == nil {
|
||||
return
|
||||
}
|
||||
|
||||
m.topologyPairsAntiAffinityPodsMap.removePod(deletedPod)
|
||||
// Delete pod from the matching affinity or anti-affinity topology pairs maps.
|
||||
m.topologyPairsPotentialAffinityPods.removePod(deletedPod)
|
||||
m.topologyPairsPotentialAntiAffinityPods.removePod(deletedPod)
|
||||
}
|
||||
|
||||
func (m *podAffinityMetadata) clone() *podAffinityMetadata {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
copy := podAffinityMetadata{}
|
||||
copy.topologyPairsPotentialAffinityPods = m.topologyPairsPotentialAffinityPods.clone()
|
||||
copy.topologyPairsPotentialAntiAffinityPods = m.topologyPairsPotentialAntiAffinityPods.clone()
|
||||
copy.topologyPairsAntiAffinityPodsMap = m.topologyPairsAntiAffinityPodsMap.clone()
|
||||
|
||||
return &copy
|
||||
}
|
||||
|
||||
type podFitsResourcesMetadata struct {
|
||||
// ignoredExtendedResources is a set of extended resource names that will
|
||||
// be ignored in the PodFitsResources predicate.
|
||||
//
|
||||
@ -91,72 +264,129 @@ type predicateMetadata struct {
|
||||
// which should be accounted only by the extenders. This set is synthesized
|
||||
// from scheduler extender configuration and does not change per pod.
|
||||
ignoredExtendedResources sets.String
|
||||
podRequest *schedulernodeinfo.Resource
|
||||
}
|
||||
|
||||
// Ensure that predicateMetadata implements algorithm.PredicateMetadata.
|
||||
var _ PredicateMetadata = &predicateMetadata{}
|
||||
func (m *podFitsResourcesMetadata) clone() *podFitsResourcesMetadata {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
copy := podFitsResourcesMetadata{}
|
||||
copy.ignoredExtendedResources = m.ignoredExtendedResources
|
||||
copy.podRequest = m.podRequest
|
||||
|
||||
return &copy
|
||||
}
|
||||
|
||||
type podFitsHostPortsMetadata struct {
|
||||
podPorts []*v1.ContainerPort
|
||||
}
|
||||
|
||||
func (m *podFitsHostPortsMetadata) clone() *podFitsHostPortsMetadata {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
copy := podFitsHostPortsMetadata{}
|
||||
copy.podPorts = append([]*v1.ContainerPort(nil), m.podPorts...)
|
||||
|
||||
return &copy
|
||||
}
|
||||
|
||||
// NOTE: When new fields are added/removed or logic is changed, please make sure that
|
||||
// RemovePod, AddPod, and ShallowCopy functions are updated to work with the new changes.
|
||||
type predicateMetadata struct {
|
||||
pod *v1.Pod
|
||||
podBestEffort bool
|
||||
|
||||
// evenPodsSpreadMetadata holds info of the minimum match number on each topology spread constraint,
|
||||
// and the match number of all valid topology pairs.
|
||||
evenPodsSpreadMetadata *evenPodsSpreadMetadata
|
||||
|
||||
serviceAffinityMetadata *serviceAffinityMetadata
|
||||
podAffinityMetadata *podAffinityMetadata
|
||||
podFitsResourcesMetadata *podFitsResourcesMetadata
|
||||
podFitsHostPortsMetadata *podFitsHostPortsMetadata
|
||||
}
|
||||
|
||||
// Ensure that predicateMetadata implements algorithm.Metadata.
|
||||
var _ Metadata = &predicateMetadata{}
|
||||
|
||||
// predicateMetadataProducer function produces predicate metadata. It is stored in a global variable below
|
||||
// and used to modify the return values of PredicateMetadataProducer
|
||||
// and used to modify the return values of MetadataProducer
|
||||
type predicateMetadataProducer func(pm *predicateMetadata)
|
||||
|
||||
var predicateMetadataProducers = make(map[string]predicateMetadataProducer)
|
||||
|
||||
// RegisterPredicateMetadataProducer registers a PredicateMetadataProducer.
|
||||
// RegisterPredicateMetadataProducer registers a MetadataProducer.
|
||||
func RegisterPredicateMetadataProducer(predicateName string, precomp predicateMetadataProducer) {
|
||||
predicateMetadataProducers[predicateName] = precomp
|
||||
}
|
||||
|
||||
// EmptyPredicateMetadataProducer returns a no-op MetadataProducer type.
|
||||
func EmptyPredicateMetadataProducer(pod *v1.Pod, nodeNameToInfo map[string]*schedulernodeinfo.NodeInfo) PredicateMetadata {
|
||||
// EmptyMetadataProducer returns a no-op MetadataProducer type.
|
||||
func EmptyMetadataProducer(pod *v1.Pod, sharedLister schedulerlisters.SharedLister) Metadata {
|
||||
return nil
|
||||
}
|
||||
|
||||
// RegisterPredicateMetadataProducerWithExtendedResourceOptions registers a
|
||||
// PredicateMetadataProducer that creates predicate metadata with the provided
|
||||
// MetadataProducer that creates predicate metadata with the provided
|
||||
// options for extended resources.
|
||||
//
|
||||
// See the comments in "predicateMetadata" for the explanation of the options.
|
||||
func RegisterPredicateMetadataProducerWithExtendedResourceOptions(ignoredExtendedResources sets.String) {
|
||||
RegisterPredicateMetadataProducer("PredicateWithExtendedResourceOptions", func(pm *predicateMetadata) {
|
||||
pm.ignoredExtendedResources = ignoredExtendedResources
|
||||
pm.podFitsResourcesMetadata.ignoredExtendedResources = ignoredExtendedResources
|
||||
})
|
||||
}
|
||||
|
||||
// NewPredicateMetadataFactory creates a PredicateMetadataFactory.
|
||||
func NewPredicateMetadataFactory(podLister algorithm.PodLister) PredicateMetadataProducer {
|
||||
factory := &PredicateMetadataFactory{
|
||||
podLister,
|
||||
}
|
||||
return factory.GetMetadata
|
||||
}
|
||||
// MetadataProducerFactory is a factory to produce Metadata.
|
||||
type MetadataProducerFactory struct{}
|
||||
|
||||
// GetMetadata returns the predicateMetadata used which will be used by various predicates.
|
||||
func (pfactory *PredicateMetadataFactory) GetMetadata(pod *v1.Pod, nodeNameToInfoMap map[string]*schedulernodeinfo.NodeInfo) PredicateMetadata {
|
||||
// GetPredicateMetadata returns the predicateMetadata which will be used by various predicates.
|
||||
func (f *MetadataProducerFactory) GetPredicateMetadata(pod *v1.Pod, sharedLister schedulerlisters.SharedLister) Metadata {
|
||||
// If we cannot compute metadata, just return nil
|
||||
if pod == nil {
|
||||
return nil
|
||||
}
|
||||
// existingPodAntiAffinityMap will be used later for efficient check on existing pods' anti-affinity
|
||||
existingPodAntiAffinityMap, err := getTPMapMatchingExistingAntiAffinity(pod, nodeNameToInfoMap)
|
||||
|
||||
var allNodes []*schedulernodeinfo.NodeInfo
|
||||
var havePodsWithAffinityNodes []*schedulernodeinfo.NodeInfo
|
||||
if sharedLister != nil {
|
||||
var err error
|
||||
allNodes, err = sharedLister.NodeInfos().List()
|
||||
if err != nil {
|
||||
klog.Errorf("failed to list NodeInfos: %v", err)
|
||||
return nil
|
||||
}
|
||||
havePodsWithAffinityNodes, err = sharedLister.NodeInfos().HavePodsWithAffinityList()
|
||||
if err != nil {
|
||||
klog.Errorf("failed to list NodeInfos: %v", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// evenPodsSpreadMetadata represents how existing pods match "pod"
|
||||
// on its spread constraints
|
||||
evenPodsSpreadMetadata, err := getEvenPodsSpreadMetadata(pod, allNodes)
|
||||
if err != nil {
|
||||
klog.Errorf("Error calculating spreadConstraintsMap: %v", err)
|
||||
return nil
|
||||
}
|
||||
// incomingPodAffinityMap will be used later for efficient check on incoming pod's affinity
|
||||
// incomingPodAntiAffinityMap will be used later for efficient check on incoming pod's anti-affinity
|
||||
incomingPodAffinityMap, incomingPodAntiAffinityMap, err := getTPMapMatchingIncomingAffinityAntiAffinity(pod, nodeNameToInfoMap)
|
||||
|
||||
podAffinityMetadata, err := getPodAffinityMetadata(pod, allNodes, havePodsWithAffinityNodes)
|
||||
if err != nil {
|
||||
klog.Errorf("[predicate meta data generation] error finding pods that match affinity terms: %v", err)
|
||||
klog.Errorf("Error calculating podAffinityMetadata: %v", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
predicateMetadata := &predicateMetadata{
|
||||
pod: pod,
|
||||
podBestEffort: isPodBestEffort(pod),
|
||||
podRequest: GetResourceRequest(pod),
|
||||
podPorts: schedutil.GetContainerPorts(pod),
|
||||
topologyPairsPotentialAffinityPods: incomingPodAffinityMap,
|
||||
topologyPairsPotentialAntiAffinityPods: incomingPodAntiAffinityMap,
|
||||
topologyPairsAntiAffinityPodsMap: existingPodAntiAffinityMap,
|
||||
pod: pod,
|
||||
evenPodsSpreadMetadata: evenPodsSpreadMetadata,
|
||||
podAffinityMetadata: podAffinityMetadata,
|
||||
podFitsResourcesMetadata: getPodFitsResourcesMetedata(pod),
|
||||
podFitsHostPortsMetadata: getPodFitsHostPortsMetadata(pod),
|
||||
}
|
||||
for predicateName, precomputeFunc := range predicateMetadataProducers {
|
||||
klog.V(10).Infof("Precompute: %v", predicateName)
|
||||
@ -165,152 +395,287 @@ func (pfactory *PredicateMetadataFactory) GetMetadata(pod *v1.Pod, nodeNameToInf
|
||||
return predicateMetadata
|
||||
}
|
||||
|
||||
// returns a pointer to a new topologyPairsMaps
|
||||
func newTopologyPairsMaps() *topologyPairsMaps {
|
||||
return &topologyPairsMaps{topologyPairToPods: make(map[topologyPair]podSet),
|
||||
podToTopologyPairs: make(map[string]topologyPairSet)}
|
||||
func getPodFitsHostPortsMetadata(pod *v1.Pod) *podFitsHostPortsMetadata {
|
||||
return &podFitsHostPortsMetadata{
|
||||
podPorts: schedutil.GetContainerPorts(pod),
|
||||
}
|
||||
}
|
||||
|
||||
func (topologyPairsMaps *topologyPairsMaps) addTopologyPair(pair topologyPair, pod *v1.Pod) {
|
||||
podFullName := schedutil.GetPodFullName(pod)
|
||||
if topologyPairsMaps.topologyPairToPods[pair] == nil {
|
||||
topologyPairsMaps.topologyPairToPods[pair] = make(map[*v1.Pod]struct{})
|
||||
func getPodFitsResourcesMetedata(pod *v1.Pod) *podFitsResourcesMetadata {
|
||||
return &podFitsResourcesMetadata{
|
||||
podRequest: GetResourceRequest(pod),
|
||||
}
|
||||
topologyPairsMaps.topologyPairToPods[pair][pod] = struct{}{}
|
||||
if topologyPairsMaps.podToTopologyPairs[podFullName] == nil {
|
||||
topologyPairsMaps.podToTopologyPairs[podFullName] = make(map[topologyPair]struct{})
|
||||
}
|
||||
topologyPairsMaps.podToTopologyPairs[podFullName][pair] = struct{}{}
|
||||
}
|
||||
|
||||
func (topologyPairsMaps *topologyPairsMaps) removePod(deletedPod *v1.Pod) {
|
||||
deletedPodFullName := schedutil.GetPodFullName(deletedPod)
|
||||
for pair := range topologyPairsMaps.podToTopologyPairs[deletedPodFullName] {
|
||||
delete(topologyPairsMaps.topologyPairToPods[pair], deletedPod)
|
||||
if len(topologyPairsMaps.topologyPairToPods[pair]) == 0 {
|
||||
delete(topologyPairsMaps.topologyPairToPods, pair)
|
||||
func getPodAffinityMetadata(pod *v1.Pod, allNodes []*schedulernodeinfo.NodeInfo, havePodsWithAffinityNodes []*schedulernodeinfo.NodeInfo) (*podAffinityMetadata, error) {
|
||||
// existingPodAntiAffinityMap will be used later for efficient check on existing pods' anti-affinity
|
||||
existingPodAntiAffinityMap, err := getTPMapMatchingExistingAntiAffinity(pod, havePodsWithAffinityNodes)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// incomingPodAffinityMap will be used later for efficient check on incoming pod's affinity
|
||||
// incomingPodAntiAffinityMap will be used later for efficient check on incoming pod's anti-affinity
|
||||
incomingPodAffinityMap, incomingPodAntiAffinityMap, err := getTPMapMatchingIncomingAffinityAntiAffinity(pod, allNodes)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &podAffinityMetadata{
|
||||
topologyPairsPotentialAffinityPods: incomingPodAffinityMap,
|
||||
topologyPairsPotentialAntiAffinityPods: incomingPodAntiAffinityMap,
|
||||
topologyPairsAntiAffinityPodsMap: existingPodAntiAffinityMap,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func getEvenPodsSpreadMetadata(pod *v1.Pod, allNodes []*schedulernodeinfo.NodeInfo) (*evenPodsSpreadMetadata, error) {
|
||||
// We have feature gating in APIServer to strip the spec
|
||||
// so don't need to re-check feature gate, just check length of constraints.
|
||||
constraints, err := filterHardTopologySpreadConstraints(pod.Spec.TopologySpreadConstraints)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(constraints) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var lock sync.Mutex
|
||||
|
||||
// TODO(Huang-Wei): It might be possible to use "make(map[topologyPair]*int32)".
|
||||
// In that case, need to consider how to init each tpPairToCount[pair] in an atomic fashion.
|
||||
m := evenPodsSpreadMetadata{
|
||||
constraints: constraints,
|
||||
tpKeyToCriticalPaths: make(map[string]*criticalPaths, len(constraints)),
|
||||
tpPairToMatchNum: make(map[topologyPair]int32),
|
||||
}
|
||||
addTopologyPairMatchNum := func(pair topologyPair, num int32) {
|
||||
lock.Lock()
|
||||
m.tpPairToMatchNum[pair] += num
|
||||
lock.Unlock()
|
||||
}
|
||||
|
||||
processNode := func(i int) {
|
||||
nodeInfo := allNodes[i]
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
klog.Error("node not found")
|
||||
return
|
||||
}
|
||||
// In accordance to design, if NodeAffinity or NodeSelector is defined,
|
||||
// spreading is applied to nodes that pass those filters.
|
||||
if !PodMatchesNodeSelectorAndAffinityTerms(pod, node) {
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure current node's labels contains all topologyKeys in 'constraints'.
|
||||
if !NodeLabelsMatchSpreadConstraints(node.Labels, constraints) {
|
||||
return
|
||||
}
|
||||
for _, constraint := range constraints {
|
||||
matchTotal := int32(0)
|
||||
// nodeInfo.Pods() can be empty; or all pods don't fit
|
||||
for _, existingPod := range nodeInfo.Pods() {
|
||||
if existingPod.Namespace != pod.Namespace {
|
||||
continue
|
||||
}
|
||||
if constraint.selector.Matches(labels.Set(existingPod.Labels)) {
|
||||
matchTotal++
|
||||
}
|
||||
}
|
||||
pair := topologyPair{key: constraint.topologyKey, value: node.Labels[constraint.topologyKey]}
|
||||
addTopologyPairMatchNum(pair, matchTotal)
|
||||
}
|
||||
}
|
||||
delete(topologyPairsMaps.podToTopologyPairs, deletedPodFullName)
|
||||
workqueue.ParallelizeUntil(context.Background(), 16, len(allNodes), processNode)
|
||||
|
||||
// calculate min match for each topology pair
|
||||
for i := 0; i < len(constraints); i++ {
|
||||
key := constraints[i].topologyKey
|
||||
m.tpKeyToCriticalPaths[key] = newCriticalPaths()
|
||||
}
|
||||
for pair, num := range m.tpPairToMatchNum {
|
||||
m.tpKeyToCriticalPaths[pair.key].update(pair.value, num)
|
||||
}
|
||||
|
||||
return &m, nil
|
||||
}
|
||||
|
||||
func (topologyPairsMaps *topologyPairsMaps) appendMaps(toAppend *topologyPairsMaps) {
|
||||
func filterHardTopologySpreadConstraints(constraints []v1.TopologySpreadConstraint) ([]topologySpreadConstraint, error) {
|
||||
var result []topologySpreadConstraint
|
||||
for _, c := range constraints {
|
||||
if c.WhenUnsatisfiable == v1.DoNotSchedule {
|
||||
selector, err := metav1.LabelSelectorAsSelector(c.LabelSelector)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
result = append(result, topologySpreadConstraint{
|
||||
maxSkew: c.MaxSkew,
|
||||
topologyKey: c.TopologyKey,
|
||||
selector: selector,
|
||||
})
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// NodeLabelsMatchSpreadConstraints checks if ALL topology keys in spread constraints are present in node labels.
|
||||
func NodeLabelsMatchSpreadConstraints(nodeLabels map[string]string, constraints []topologySpreadConstraint) bool {
|
||||
for _, c := range constraints {
|
||||
if _, ok := nodeLabels[c.topologyKey]; !ok {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// returns a pointer to a new topologyPairsMaps
|
||||
func newTopologyPairsMaps() *topologyPairsMaps {
|
||||
return &topologyPairsMaps{
|
||||
topologyPairToPods: make(map[topologyPair]podSet),
|
||||
podToTopologyPairs: make(map[string]topologyPairSet),
|
||||
}
|
||||
}
|
||||
|
||||
func (m *topologyPairsMaps) addTopologyPair(pair topologyPair, pod *v1.Pod) {
|
||||
podFullName := schedutil.GetPodFullName(pod)
|
||||
if m.topologyPairToPods[pair] == nil {
|
||||
m.topologyPairToPods[pair] = make(map[*v1.Pod]struct{})
|
||||
}
|
||||
m.topologyPairToPods[pair][pod] = struct{}{}
|
||||
if m.podToTopologyPairs[podFullName] == nil {
|
||||
m.podToTopologyPairs[podFullName] = make(map[topologyPair]struct{})
|
||||
}
|
||||
m.podToTopologyPairs[podFullName][pair] = struct{}{}
|
||||
}
|
||||
|
||||
func (m *topologyPairsMaps) removePod(deletedPod *v1.Pod) {
|
||||
deletedPodFullName := schedutil.GetPodFullName(deletedPod)
|
||||
for pair := range m.podToTopologyPairs[deletedPodFullName] {
|
||||
delete(m.topologyPairToPods[pair], deletedPod)
|
||||
if len(m.topologyPairToPods[pair]) == 0 {
|
||||
delete(m.topologyPairToPods, pair)
|
||||
}
|
||||
}
|
||||
delete(m.podToTopologyPairs, deletedPodFullName)
|
||||
}
|
||||
|
||||
func (m *topologyPairsMaps) appendMaps(toAppend *topologyPairsMaps) {
|
||||
if toAppend == nil {
|
||||
return
|
||||
}
|
||||
for pair := range toAppend.topologyPairToPods {
|
||||
for pod := range toAppend.topologyPairToPods[pair] {
|
||||
topologyPairsMaps.addTopologyPair(pair, pod)
|
||||
m.addTopologyPair(pair, pod)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *topologyPairsMaps) clone() *topologyPairsMaps {
|
||||
copy := newTopologyPairsMaps()
|
||||
copy.appendMaps(m)
|
||||
return copy
|
||||
}
|
||||
|
||||
func (m *evenPodsSpreadMetadata) addPod(addedPod, preemptorPod *v1.Pod, node *v1.Node) {
|
||||
m.updatePod(addedPod, preemptorPod, node, 1)
|
||||
}
|
||||
|
||||
func (m *evenPodsSpreadMetadata) removePod(deletedPod, preemptorPod *v1.Pod, node *v1.Node) {
|
||||
m.updatePod(deletedPod, preemptorPod, node, -1)
|
||||
}
|
||||
|
||||
func (m *evenPodsSpreadMetadata) updatePod(updatedPod, preemptorPod *v1.Pod, node *v1.Node, delta int32) {
|
||||
if m == nil || updatedPod.Namespace != preemptorPod.Namespace || node == nil {
|
||||
return
|
||||
}
|
||||
if !NodeLabelsMatchSpreadConstraints(node.Labels, m.constraints) {
|
||||
return
|
||||
}
|
||||
|
||||
podLabelSet := labels.Set(updatedPod.Labels)
|
||||
for _, constraint := range m.constraints {
|
||||
if !constraint.selector.Matches(podLabelSet) {
|
||||
continue
|
||||
}
|
||||
|
||||
k, v := constraint.topologyKey, node.Labels[constraint.topologyKey]
|
||||
pair := topologyPair{key: k, value: v}
|
||||
m.tpPairToMatchNum[pair] = m.tpPairToMatchNum[pair] + delta
|
||||
|
||||
m.tpKeyToCriticalPaths[k].update(v, m.tpPairToMatchNum[pair])
|
||||
}
|
||||
}
|
||||
|
||||
func (m *evenPodsSpreadMetadata) clone() *evenPodsSpreadMetadata {
|
||||
// c could be nil when EvenPodsSpread feature is disabled
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
cp := evenPodsSpreadMetadata{
|
||||
// constraints are shared because they don't change.
|
||||
constraints: m.constraints,
|
||||
tpKeyToCriticalPaths: make(map[string]*criticalPaths, len(m.tpKeyToCriticalPaths)),
|
||||
tpPairToMatchNum: make(map[topologyPair]int32, len(m.tpPairToMatchNum)),
|
||||
}
|
||||
for tpKey, paths := range m.tpKeyToCriticalPaths {
|
||||
cp.tpKeyToCriticalPaths[tpKey] = &criticalPaths{paths[0], paths[1]}
|
||||
}
|
||||
for tpPair, matchNum := range m.tpPairToMatchNum {
|
||||
copyPair := topologyPair{key: tpPair.key, value: tpPair.value}
|
||||
cp.tpPairToMatchNum[copyPair] = matchNum
|
||||
}
|
||||
return &cp
|
||||
}
|
||||
|
||||
// RemovePod changes predicateMetadata assuming that the given `deletedPod` is
|
||||
// deleted from the system.
|
||||
func (meta *predicateMetadata) RemovePod(deletedPod *v1.Pod) error {
|
||||
func (meta *predicateMetadata) RemovePod(deletedPod *v1.Pod, node *v1.Node) error {
|
||||
deletedPodFullName := schedutil.GetPodFullName(deletedPod)
|
||||
if deletedPodFullName == schedutil.GetPodFullName(meta.pod) {
|
||||
return fmt.Errorf("deletedPod and meta.pod must not be the same")
|
||||
}
|
||||
meta.topologyPairsAntiAffinityPodsMap.removePod(deletedPod)
|
||||
// Delete pod from the matching affinity or anti-affinity topology pairs maps.
|
||||
meta.topologyPairsPotentialAffinityPods.removePod(deletedPod)
|
||||
meta.topologyPairsPotentialAntiAffinityPods.removePod(deletedPod)
|
||||
// All pods in the serviceAffinityMatchingPodList are in the same namespace.
|
||||
// So, if the namespace of the first one is not the same as the namespace of the
|
||||
// deletedPod, we don't need to check the list, as deletedPod isn't in the list.
|
||||
if meta.serviceAffinityInUse &&
|
||||
len(meta.serviceAffinityMatchingPodList) > 0 &&
|
||||
deletedPod.Namespace == meta.serviceAffinityMatchingPodList[0].Namespace {
|
||||
for i, pod := range meta.serviceAffinityMatchingPodList {
|
||||
if schedutil.GetPodFullName(pod) == deletedPodFullName {
|
||||
meta.serviceAffinityMatchingPodList = append(
|
||||
meta.serviceAffinityMatchingPodList[:i],
|
||||
meta.serviceAffinityMatchingPodList[i+1:]...)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
meta.podAffinityMetadata.removePod(deletedPod)
|
||||
meta.evenPodsSpreadMetadata.removePod(deletedPod, meta.pod, node)
|
||||
meta.serviceAffinityMetadata.removePod(deletedPod, node)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// AddPod changes predicateMetadata assuming that `newPod` is added to the
|
||||
// AddPod changes predicateMetadata assuming that the given `addedPod` is added to the
|
||||
// system.
|
||||
func (meta *predicateMetadata) AddPod(addedPod *v1.Pod, nodeInfo *schedulernodeinfo.NodeInfo) error {
|
||||
func (meta *predicateMetadata) AddPod(addedPod *v1.Pod, node *v1.Node) error {
|
||||
addedPodFullName := schedutil.GetPodFullName(addedPod)
|
||||
if addedPodFullName == schedutil.GetPodFullName(meta.pod) {
|
||||
return fmt.Errorf("addedPod and meta.pod must not be the same")
|
||||
}
|
||||
if nodeInfo.Node() == nil {
|
||||
return fmt.Errorf("invalid node in nodeInfo")
|
||||
if node == nil {
|
||||
return fmt.Errorf("node not found")
|
||||
}
|
||||
// Add matching anti-affinity terms of the addedPod to the map.
|
||||
topologyPairsMaps, err := getMatchingAntiAffinityTopologyPairsOfPod(meta.pod, addedPod, nodeInfo.Node())
|
||||
if err != nil {
|
||||
|
||||
if err := meta.podAffinityMetadata.addPod(addedPod, meta.pod, node); err != nil {
|
||||
return err
|
||||
}
|
||||
meta.topologyPairsAntiAffinityPodsMap.appendMaps(topologyPairsMaps)
|
||||
// Add the pod to nodeNameToMatchingAffinityPods and nodeNameToMatchingAntiAffinityPods if needed.
|
||||
affinity := meta.pod.Spec.Affinity
|
||||
podNodeName := addedPod.Spec.NodeName
|
||||
if affinity != nil && len(podNodeName) > 0 {
|
||||
podNode := nodeInfo.Node()
|
||||
// It is assumed that when the added pod matches affinity of the meta.pod, all the terms must match,
|
||||
// this should be changed when the implementation of targetPodMatchesAffinityOfPod/podMatchesAffinityTermProperties
|
||||
// is changed
|
||||
if targetPodMatchesAffinityOfPod(meta.pod, addedPod) {
|
||||
affinityTerms := GetPodAffinityTerms(affinity.PodAffinity)
|
||||
for _, term := range affinityTerms {
|
||||
if topologyValue, ok := podNode.Labels[term.TopologyKey]; ok {
|
||||
pair := topologyPair{key: term.TopologyKey, value: topologyValue}
|
||||
meta.topologyPairsPotentialAffinityPods.addTopologyPair(pair, addedPod)
|
||||
}
|
||||
}
|
||||
}
|
||||
if targetPodMatchesAntiAffinityOfPod(meta.pod, addedPod) {
|
||||
antiAffinityTerms := GetPodAntiAffinityTerms(affinity.PodAntiAffinity)
|
||||
for _, term := range antiAffinityTerms {
|
||||
if topologyValue, ok := podNode.Labels[term.TopologyKey]; ok {
|
||||
pair := topologyPair{key: term.TopologyKey, value: topologyValue}
|
||||
meta.topologyPairsPotentialAntiAffinityPods.addTopologyPair(pair, addedPod)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// If addedPod is in the same namespace as the meta.pod, update the list
|
||||
// of matching pods if applicable.
|
||||
if meta.serviceAffinityInUse && addedPod.Namespace == meta.pod.Namespace {
|
||||
selector := CreateSelectorFromLabels(meta.pod.Labels)
|
||||
if selector.Matches(labels.Set(addedPod.Labels)) {
|
||||
meta.serviceAffinityMatchingPodList = append(meta.serviceAffinityMatchingPodList,
|
||||
addedPod)
|
||||
}
|
||||
}
|
||||
// Update meta.evenPodsSpreadMetadata if meta.pod has hard spread constraints
|
||||
// and addedPod matches that
|
||||
meta.evenPodsSpreadMetadata.addPod(addedPod, meta.pod, node)
|
||||
|
||||
meta.serviceAffinityMetadata.addPod(addedPod, meta.pod, node)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ShallowCopy copies a metadata struct into a new struct and creates a copy of
|
||||
// its maps and slices, but it does not copy the contents of pointer values.
|
||||
func (meta *predicateMetadata) ShallowCopy() PredicateMetadata {
|
||||
func (meta *predicateMetadata) ShallowCopy() Metadata {
|
||||
newPredMeta := &predicateMetadata{
|
||||
pod: meta.pod,
|
||||
podBestEffort: meta.podBestEffort,
|
||||
podRequest: meta.podRequest,
|
||||
serviceAffinityInUse: meta.serviceAffinityInUse,
|
||||
ignoredExtendedResources: meta.ignoredExtendedResources,
|
||||
pod: meta.pod,
|
||||
podBestEffort: meta.podBestEffort,
|
||||
}
|
||||
newPredMeta.podPorts = append([]*v1.ContainerPort(nil), meta.podPorts...)
|
||||
newPredMeta.topologyPairsPotentialAffinityPods = newTopologyPairsMaps()
|
||||
newPredMeta.topologyPairsPotentialAffinityPods.appendMaps(meta.topologyPairsPotentialAffinityPods)
|
||||
newPredMeta.topologyPairsPotentialAntiAffinityPods = newTopologyPairsMaps()
|
||||
newPredMeta.topologyPairsPotentialAntiAffinityPods.appendMaps(meta.topologyPairsPotentialAntiAffinityPods)
|
||||
newPredMeta.topologyPairsAntiAffinityPodsMap = newTopologyPairsMaps()
|
||||
newPredMeta.topologyPairsAntiAffinityPodsMap.appendMaps(meta.topologyPairsAntiAffinityPodsMap)
|
||||
newPredMeta.serviceAffinityMatchingPodServices = append([]*v1.Service(nil),
|
||||
meta.serviceAffinityMatchingPodServices...)
|
||||
newPredMeta.serviceAffinityMatchingPodList = append([]*v1.Pod(nil),
|
||||
meta.serviceAffinityMatchingPodList...)
|
||||
return (PredicateMetadata)(newPredMeta)
|
||||
newPredMeta.podFitsHostPortsMetadata = meta.podFitsHostPortsMetadata.clone()
|
||||
newPredMeta.podAffinityMetadata = meta.podAffinityMetadata.clone()
|
||||
newPredMeta.evenPodsSpreadMetadata = meta.evenPodsSpreadMetadata.clone()
|
||||
newPredMeta.serviceAffinityMetadata = meta.serviceAffinityMetadata.clone()
|
||||
newPredMeta.podFitsResourcesMetadata = meta.podFitsResourcesMetadata.clone()
|
||||
return (Metadata)(newPredMeta)
|
||||
}
|
||||
|
||||
type affinityTermProperties struct {
|
||||
@ -365,15 +730,9 @@ func podMatchesAnyAffinityTermProperties(pod *v1.Pod, properties []*affinityTerm
|
||||
// getTPMapMatchingExistingAntiAffinity calculates the following for each existing pod on each node:
|
||||
// (1) Whether it has PodAntiAffinity
|
||||
// (2) Whether any AffinityTerm matches the incoming pod
|
||||
func getTPMapMatchingExistingAntiAffinity(pod *v1.Pod, nodeInfoMap map[string]*schedulernodeinfo.NodeInfo) (*topologyPairsMaps, error) {
|
||||
allNodeNames := make([]string, 0, len(nodeInfoMap))
|
||||
for name := range nodeInfoMap {
|
||||
allNodeNames = append(allNodeNames, name)
|
||||
}
|
||||
|
||||
func getTPMapMatchingExistingAntiAffinity(pod *v1.Pod, allNodes []*schedulernodeinfo.NodeInfo) (*topologyPairsMaps, error) {
|
||||
errCh := schedutil.NewErrorChannel()
|
||||
var lock sync.Mutex
|
||||
var firstError error
|
||||
|
||||
topologyMaps := newTopologyPairsMaps()
|
||||
|
||||
appendTopologyPairsMaps := func(toAppend *topologyPairsMaps) {
|
||||
@ -381,54 +740,48 @@ func getTPMapMatchingExistingAntiAffinity(pod *v1.Pod, nodeInfoMap map[string]*s
|
||||
defer lock.Unlock()
|
||||
topologyMaps.appendMaps(toAppend)
|
||||
}
|
||||
catchError := func(err error) {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
if firstError == nil {
|
||||
firstError = err
|
||||
}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
processNode := func(i int) {
|
||||
nodeInfo := nodeInfoMap[allNodeNames[i]]
|
||||
nodeInfo := allNodes[i]
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
catchError(fmt.Errorf("node not found"))
|
||||
klog.Error("node not found")
|
||||
return
|
||||
}
|
||||
for _, existingPod := range nodeInfo.PodsWithAffinity() {
|
||||
existingPodTopologyMaps, err := getMatchingAntiAffinityTopologyPairsOfPod(pod, existingPod, node)
|
||||
if err != nil {
|
||||
catchError(err)
|
||||
cancel()
|
||||
errCh.SendErrorWithCancel(err, cancel)
|
||||
return
|
||||
}
|
||||
appendTopologyPairsMaps(existingPodTopologyMaps)
|
||||
if existingPodTopologyMaps != nil {
|
||||
appendTopologyPairsMaps(existingPodTopologyMaps)
|
||||
}
|
||||
}
|
||||
}
|
||||
workqueue.ParallelizeUntil(ctx, 16, len(allNodeNames), processNode)
|
||||
return topologyMaps, firstError
|
||||
workqueue.ParallelizeUntil(ctx, 16, len(allNodes), processNode)
|
||||
|
||||
if err := errCh.ReceiveError(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return topologyMaps, nil
|
||||
}
|
||||
|
||||
// getTPMapMatchingIncomingAffinityAntiAffinity finds existing Pods that match affinity terms of the given "pod".
|
||||
// It returns a topologyPairsMaps that are checked later by the affinity
|
||||
// predicate. With this topologyPairsMaps available, the affinity predicate does not
|
||||
// need to check all the pods in the cluster.
|
||||
func getTPMapMatchingIncomingAffinityAntiAffinity(pod *v1.Pod, nodeInfoMap map[string]*schedulernodeinfo.NodeInfo) (topologyPairsAffinityPodsMaps *topologyPairsMaps, topologyPairsAntiAffinityPodsMaps *topologyPairsMaps, err error) {
|
||||
func getTPMapMatchingIncomingAffinityAntiAffinity(pod *v1.Pod, allNodes []*schedulernodeinfo.NodeInfo) (topologyPairsAffinityPodsMaps *topologyPairsMaps, topologyPairsAntiAffinityPodsMaps *topologyPairsMaps, err error) {
|
||||
affinity := pod.Spec.Affinity
|
||||
if affinity == nil || (affinity.PodAffinity == nil && affinity.PodAntiAffinity == nil) {
|
||||
return newTopologyPairsMaps(), newTopologyPairsMaps(), nil
|
||||
}
|
||||
|
||||
allNodeNames := make([]string, 0, len(nodeInfoMap))
|
||||
for name := range nodeInfoMap {
|
||||
allNodeNames = append(allNodeNames, name)
|
||||
}
|
||||
|
||||
errCh := schedutil.NewErrorChannel()
|
||||
var lock sync.Mutex
|
||||
var firstError error
|
||||
topologyPairsAffinityPodsMaps = newTopologyPairsMaps()
|
||||
topologyPairsAntiAffinityPodsMaps = newTopologyPairsMaps()
|
||||
appendResult := func(nodeName string, nodeTopologyPairsAffinityPodsMaps, nodeTopologyPairsAntiAffinityPodsMaps *topologyPairsMaps) {
|
||||
@ -442,28 +795,21 @@ func getTPMapMatchingIncomingAffinityAntiAffinity(pod *v1.Pod, nodeInfoMap map[s
|
||||
}
|
||||
}
|
||||
|
||||
catchError := func(err error) {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
if firstError == nil {
|
||||
firstError = err
|
||||
}
|
||||
}
|
||||
|
||||
affinityTerms := GetPodAffinityTerms(affinity.PodAffinity)
|
||||
affinityProperties, err := getAffinityTermProperties(pod, affinityTerms)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
antiAffinityTerms := GetPodAntiAffinityTerms(affinity.PodAntiAffinity)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
processNode := func(i int) {
|
||||
nodeInfo := nodeInfoMap[allNodeNames[i]]
|
||||
nodeInfo := allNodes[i]
|
||||
node := nodeInfo.Node()
|
||||
if node == nil {
|
||||
catchError(fmt.Errorf("nodeInfo.Node is nil"))
|
||||
klog.Error("node not found")
|
||||
return
|
||||
}
|
||||
nodeTopologyPairsAffinityPodsMaps := newTopologyPairsMaps()
|
||||
@ -483,8 +829,7 @@ func getTPMapMatchingIncomingAffinityAntiAffinity(pod *v1.Pod, nodeInfoMap map[s
|
||||
namespaces := priorityutil.GetNamespacesFromPodAffinityTerm(pod, &term)
|
||||
selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
|
||||
if err != nil {
|
||||
catchError(err)
|
||||
cancel()
|
||||
errCh.SendErrorWithCancel(err, cancel)
|
||||
return
|
||||
}
|
||||
if priorityutil.PodMatchesTermsNamespaceAndSelector(existingPod, namespaces, selector) {
|
||||
@ -495,12 +840,18 @@ func getTPMapMatchingIncomingAffinityAntiAffinity(pod *v1.Pod, nodeInfoMap map[s
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(nodeTopologyPairsAffinityPodsMaps.topologyPairToPods) > 0 || len(nodeTopologyPairsAntiAffinityPodsMaps.topologyPairToPods) > 0 {
|
||||
appendResult(node.Name, nodeTopologyPairsAffinityPodsMaps, nodeTopologyPairsAntiAffinityPodsMaps)
|
||||
}
|
||||
}
|
||||
workqueue.ParallelizeUntil(ctx, 16, len(allNodeNames), processNode)
|
||||
return topologyPairsAffinityPodsMaps, topologyPairsAntiAffinityPodsMaps, firstError
|
||||
workqueue.ParallelizeUntil(ctx, 16, len(allNodes), processNode)
|
||||
|
||||
if err := errCh.ReceiveError(); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
return topologyPairsAffinityPodsMaps, topologyPairsAntiAffinityPodsMaps, nil
|
||||
}
|
||||
|
||||
// targetPodMatchesAffinityOfPod returns true if "targetPod" matches ALL affinity terms of
|
||||
|
664
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/predicates.go
generated
vendored
664
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/predicates.go
generated
vendored
File diff suppressed because it is too large
Load Diff
85
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/testing_helper.go
generated
vendored
85
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/testing_helper.go
generated
vendored
@ -1,85 +0,0 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package predicates
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
storagev1 "k8s.io/api/storage/v1"
|
||||
)
|
||||
|
||||
// FakePersistentVolumeClaimInfo declares a []v1.PersistentVolumeClaim type for testing.
|
||||
type FakePersistentVolumeClaimInfo []v1.PersistentVolumeClaim
|
||||
|
||||
// GetPersistentVolumeClaimInfo gets PVC matching the namespace and PVC ID.
|
||||
func (pvcs FakePersistentVolumeClaimInfo) GetPersistentVolumeClaimInfo(namespace string, pvcID string) (*v1.PersistentVolumeClaim, error) {
|
||||
for _, pvc := range pvcs {
|
||||
if pvc.Name == pvcID && pvc.Namespace == namespace {
|
||||
return &pvc, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("Unable to find persistent volume claim: %s/%s", namespace, pvcID)
|
||||
}
|
||||
|
||||
// FakeNodeInfo declares a v1.Node type for testing.
|
||||
type FakeNodeInfo v1.Node
|
||||
|
||||
// GetNodeInfo return a fake node info object.
|
||||
func (n FakeNodeInfo) GetNodeInfo(nodeName string) (*v1.Node, error) {
|
||||
node := v1.Node(n)
|
||||
return &node, nil
|
||||
}
|
||||
|
||||
// FakeNodeListInfo declares a []v1.Node type for testing.
|
||||
type FakeNodeListInfo []v1.Node
|
||||
|
||||
// GetNodeInfo returns a fake node object in the fake nodes.
|
||||
func (nodes FakeNodeListInfo) GetNodeInfo(nodeName string) (*v1.Node, error) {
|
||||
for _, node := range nodes {
|
||||
if node.Name == nodeName {
|
||||
return &node, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("Unable to find node: %s", nodeName)
|
||||
}
|
||||
|
||||
// FakePersistentVolumeInfo declares a []v1.PersistentVolume type for testing.
|
||||
type FakePersistentVolumeInfo []v1.PersistentVolume
|
||||
|
||||
// GetPersistentVolumeInfo returns a fake PV object in the fake PVs by PV ID.
|
||||
func (pvs FakePersistentVolumeInfo) GetPersistentVolumeInfo(pvID string) (*v1.PersistentVolume, error) {
|
||||
for _, pv := range pvs {
|
||||
if pv.Name == pvID {
|
||||
return &pv, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("Unable to find persistent volume: %s", pvID)
|
||||
}
|
||||
|
||||
// FakeStorageClassInfo declares a []storagev1.StorageClass type for testing.
|
||||
type FakeStorageClassInfo []storagev1.StorageClass
|
||||
|
||||
// GetStorageClassInfo returns a fake storage class object in the fake storage classes by name.
|
||||
func (classes FakeStorageClassInfo) GetStorageClassInfo(name string) (*storagev1.StorageClass, error) {
|
||||
for _, sc := range classes {
|
||||
if sc.Name == name {
|
||||
return &sc, nil
|
||||
}
|
||||
}
|
||||
return nil, fmt.Errorf("Unable to find storage class: %s", name)
|
||||
}
|
64
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/utils.go
generated
vendored
64
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/utils.go
generated
vendored
@ -17,8 +17,15 @@ limitations under the License.
|
||||
package predicates
|
||||
|
||||
import (
|
||||
"k8s.io/api/core/v1"
|
||||
"strings"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
storagev1 "k8s.io/api/storage/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
csilibplugins "k8s.io/csi-translation-lib/plugins"
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
|
||||
)
|
||||
|
||||
@ -60,7 +67,7 @@ func FilterPodsByNamespace(pods []*v1.Pod, ns string) []*v1.Pod {
|
||||
|
||||
// CreateSelectorFromLabels is used to define a selector that corresponds to the keys in a map.
|
||||
func CreateSelectorFromLabels(aL map[string]string) labels.Selector {
|
||||
if aL == nil || len(aL) == 0 {
|
||||
if len(aL) == 0 {
|
||||
return labels.Everything()
|
||||
}
|
||||
return labels.Set(aL).AsSelector()
|
||||
@ -87,3 +94,56 @@ func SetPredicatesOrderingDuringTest(value []string) func() {
|
||||
predicatesOrdering = origVal
|
||||
}
|
||||
}
|
||||
|
||||
// isCSIMigrationOn returns a boolean value indicating whether
|
||||
// the CSI migration has been enabled for a particular storage plugin.
|
||||
func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string) bool {
|
||||
if csiNode == nil || len(pluginName) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// In-tree storage to CSI driver migration feature should be enabled,
|
||||
// along with the plugin-specific one
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigration) {
|
||||
return false
|
||||
}
|
||||
|
||||
switch pluginName {
|
||||
case csilibplugins.AWSEBSInTreePluginName:
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationAWS) {
|
||||
return false
|
||||
}
|
||||
case csilibplugins.GCEPDInTreePluginName:
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationGCE) {
|
||||
return false
|
||||
}
|
||||
case csilibplugins.AzureDiskInTreePluginName:
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationAzureDisk) {
|
||||
return false
|
||||
}
|
||||
case csilibplugins.CinderInTreePluginName:
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationOpenStack) {
|
||||
return false
|
||||
}
|
||||
default:
|
||||
return false
|
||||
}
|
||||
|
||||
// The plugin name should be listed in the CSINode object annotation.
|
||||
// This indicates that the plugin has been migrated to a CSI driver in the node.
|
||||
csiNodeAnn := csiNode.GetAnnotations()
|
||||
if csiNodeAnn == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var mpaSet sets.String
|
||||
mpa := csiNodeAnn[v1.MigratedPluginsAnnotationKey]
|
||||
if len(mpa) == 0 {
|
||||
mpaSet = sets.NewString()
|
||||
} else {
|
||||
tok := strings.Split(mpa, ",")
|
||||
mpaSet = sets.NewString(tok...)
|
||||
}
|
||||
|
||||
return mpaSet.Has(pluginName)
|
||||
}
|
||||
|
68
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/priorities/util/non_zero.go
generated
vendored
68
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/priorities/util/non_zero.go
generated
vendored
@ -16,7 +16,10 @@ limitations under the License.
|
||||
|
||||
package util
|
||||
|
||||
import "k8s.io/api/core/v1"
|
||||
import (
|
||||
v1 "k8s.io/api/core/v1"
|
||||
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
||||
)
|
||||
|
||||
// For each of these resources, a pod that doesn't request the resource explicitly
|
||||
// will be treated as having requested the amount indicated below, for the purpose
|
||||
@ -26,27 +29,50 @@ import "k8s.io/api/core/v1"
|
||||
// consuming no resources whatsoever. We chose these values to be similar to the
|
||||
// resources that we give to cluster addon pods (#10653). But they are pretty arbitrary.
|
||||
// As described in #11713, we use request instead of limit to deal with resource requirements.
|
||||
const (
|
||||
// DefaultMilliCPURequest defines default milli cpu request number.
|
||||
DefaultMilliCPURequest int64 = 100 // 0.1 core
|
||||
// DefaultMemoryRequest defines default memory request size.
|
||||
DefaultMemoryRequest int64 = 200 * 1024 * 1024 // 200 MB
|
||||
)
|
||||
|
||||
// DefaultMilliCPURequest defines default milli cpu request number.
|
||||
const DefaultMilliCPURequest int64 = 100 // 0.1 core
|
||||
// DefaultMemoryRequest defines default memory request size.
|
||||
const DefaultMemoryRequest int64 = 200 * 1024 * 1024 // 200 MB
|
||||
|
||||
// GetNonzeroRequests returns the default resource request if none is found or
|
||||
// GetNonzeroRequests returns the default cpu and memory resource request if none is found or
|
||||
// what is provided on the request.
|
||||
func GetNonzeroRequests(requests *v1.ResourceList) (int64, int64) {
|
||||
var outMilliCPU, outMemory int64
|
||||
// Override if un-set, but not if explicitly set to zero
|
||||
if _, found := (*requests)[v1.ResourceCPU]; !found {
|
||||
outMilliCPU = DefaultMilliCPURequest
|
||||
} else {
|
||||
outMilliCPU = requests.Cpu().MilliValue()
|
||||
}
|
||||
// Override if un-set, but not if explicitly set to zero
|
||||
if _, found := (*requests)[v1.ResourceMemory]; !found {
|
||||
outMemory = DefaultMemoryRequest
|
||||
} else {
|
||||
outMemory = requests.Memory().Value()
|
||||
}
|
||||
return outMilliCPU, outMemory
|
||||
return GetNonzeroRequestForResource(v1.ResourceCPU, requests),
|
||||
GetNonzeroRequestForResource(v1.ResourceMemory, requests)
|
||||
}
|
||||
|
||||
// GetNonzeroRequestForResource returns the default resource request if none is found or
|
||||
// what is provided on the request.
|
||||
func GetNonzeroRequestForResource(resource v1.ResourceName, requests *v1.ResourceList) int64 {
|
||||
switch resource {
|
||||
case v1.ResourceCPU:
|
||||
// Override if un-set, but not if explicitly set to zero
|
||||
if _, found := (*requests)[v1.ResourceCPU]; !found {
|
||||
return DefaultMilliCPURequest
|
||||
}
|
||||
return requests.Cpu().MilliValue()
|
||||
case v1.ResourceMemory:
|
||||
// Override if un-set, but not if explicitly set to zero
|
||||
if _, found := (*requests)[v1.ResourceMemory]; !found {
|
||||
return DefaultMemoryRequest
|
||||
}
|
||||
return requests.Memory().Value()
|
||||
case v1.ResourceEphemeralStorage:
|
||||
quantity, found := (*requests)[v1.ResourceEphemeralStorage]
|
||||
if !found {
|
||||
return 0
|
||||
}
|
||||
return quantity.Value()
|
||||
default:
|
||||
if v1helper.IsScalarResourceName(resource) {
|
||||
quantity, found := (*requests)[resource]
|
||||
if !found {
|
||||
return 0
|
||||
}
|
||||
return quantity.Value()
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
12
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/scheduler_interface.go
generated
vendored
12
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/scheduler_interface.go
generated
vendored
@ -18,7 +18,7 @@ package algorithm
|
||||
|
||||
import (
|
||||
"k8s.io/api/core/v1"
|
||||
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
|
||||
extenderv1 "k8s.io/kubernetes/pkg/scheduler/apis/extender/v1"
|
||||
schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
|
||||
)
|
||||
|
||||
@ -34,12 +34,12 @@ type SchedulerExtender interface {
|
||||
// the list of failed nodes and failure reasons.
|
||||
Filter(pod *v1.Pod,
|
||||
nodes []*v1.Node, nodeNameToInfo map[string]*schedulernodeinfo.NodeInfo,
|
||||
) (filteredNodes []*v1.Node, failedNodesMap schedulerapi.FailedNodesMap, err error)
|
||||
) (filteredNodes []*v1.Node, failedNodesMap extenderv1.FailedNodesMap, err error)
|
||||
|
||||
// Prioritize based on extender-implemented priority functions. The returned scores & weight
|
||||
// are used to compute the weighted score for an extender. The weighted scores are added to
|
||||
// the scores computed by Kubernetes scheduler. The total scores are used to do the host selection.
|
||||
Prioritize(pod *v1.Pod, nodes []*v1.Node) (hostPriorities *schedulerapi.HostPriorityList, weight int, err error)
|
||||
// the scores computed by Kubernetes scheduler. The total scores are used to do the host selection.
|
||||
Prioritize(pod *v1.Pod, nodes []*v1.Node) (hostPriorities *extenderv1.HostPriorityList, weight int64, err error)
|
||||
|
||||
// Bind delegates the action of binding a pod to a node to the extender.
|
||||
Bind(binding *v1.Binding) error
|
||||
@ -61,9 +61,9 @@ type SchedulerExtender interface {
|
||||
// 2. A different set of victim pod for every given candidate node after preemption phase of extender.
|
||||
ProcessPreemption(
|
||||
pod *v1.Pod,
|
||||
nodeToVictims map[*v1.Node]*schedulerapi.Victims,
|
||||
nodeToVictims map[*v1.Node]*extenderv1.Victims,
|
||||
nodeNameToInfo map[string]*schedulernodeinfo.NodeInfo,
|
||||
) (map[*v1.Node]*schedulerapi.Victims, error)
|
||||
) (map[*v1.Node]*extenderv1.Victims, error)
|
||||
|
||||
// SupportsPreemption returns if the scheduler extender support preemption or not.
|
||||
SupportsPreemption() bool
|
||||
|
101
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/types.go
generated
vendored
101
vendor/k8s.io/kubernetes/pkg/scheduler/algorithm/types.go
generated
vendored
@ -17,68 +17,22 @@ limitations under the License.
|
||||
package algorithm
|
||||
|
||||
import (
|
||||
apps "k8s.io/api/apps/v1"
|
||||
"k8s.io/api/core/v1"
|
||||
policyv1beta1 "k8s.io/api/policy/v1beta1"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
|
||||
appslisters "k8s.io/client-go/listers/apps/v1"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
"k8s.io/kubernetes/pkg/apis/apps"
|
||||
api "k8s.io/kubernetes/pkg/apis/core"
|
||||
)
|
||||
|
||||
// NodeFieldSelectorKeys is a map that: the key are node field selector keys; the values are
|
||||
// NodeFieldSelectorKeys is a map that: the keys are node field selector keys; the values are
|
||||
// the functions to get the value of the node field.
|
||||
var NodeFieldSelectorKeys = map[string]func(*v1.Node) string{
|
||||
schedulerapi.NodeFieldSelectorKeyNodeName: func(n *v1.Node) string { return n.Name },
|
||||
api.ObjectNameField: func(n *v1.Node) string { return n.Name },
|
||||
}
|
||||
|
||||
// NodeLister interface represents anything that can list nodes for a scheduler.
|
||||
type NodeLister interface {
|
||||
// We explicitly return []*v1.Node, instead of v1.NodeList, to avoid
|
||||
// performing expensive copies that are unneeded.
|
||||
List() ([]*v1.Node, error)
|
||||
}
|
||||
|
||||
// PodFilter is a function to filter a pod. If pod passed return true else return false.
|
||||
type PodFilter func(*v1.Pod) bool
|
||||
|
||||
// PodLister interface represents anything that can list pods for a scheduler.
|
||||
type PodLister interface {
|
||||
// We explicitly return []*v1.Pod, instead of v1.PodList, to avoid
|
||||
// performing expensive copies that are unneeded.
|
||||
List(labels.Selector) ([]*v1.Pod, error)
|
||||
// This is similar to "List()", but the returned slice does not
|
||||
// contain pods that don't pass `podFilter`.
|
||||
FilteredList(podFilter PodFilter, selector labels.Selector) ([]*v1.Pod, error)
|
||||
}
|
||||
|
||||
// ServiceLister interface represents anything that can produce a list of services; the list is consumed by a scheduler.
|
||||
type ServiceLister interface {
|
||||
// Lists all the services
|
||||
List(labels.Selector) ([]*v1.Service, error)
|
||||
// Gets the services for the given pod
|
||||
GetPodServices(*v1.Pod) ([]*v1.Service, error)
|
||||
}
|
||||
|
||||
// ControllerLister interface represents anything that can produce a list of ReplicationController; the list is consumed by a scheduler.
|
||||
type ControllerLister interface {
|
||||
// Lists all the replication controllers
|
||||
List(labels.Selector) ([]*v1.ReplicationController, error)
|
||||
// Gets the services for the given pod
|
||||
GetPodControllers(*v1.Pod) ([]*v1.ReplicationController, error)
|
||||
}
|
||||
|
||||
// ReplicaSetLister interface represents anything that can produce a list of ReplicaSet; the list is consumed by a scheduler.
|
||||
type ReplicaSetLister interface {
|
||||
// Gets the replicasets for the given pod
|
||||
GetPodReplicaSets(*v1.Pod) ([]*apps.ReplicaSet, error)
|
||||
}
|
||||
|
||||
// PDBLister interface represents anything that can list PodDisruptionBudget objects.
|
||||
type PDBLister interface {
|
||||
// List() returns a list of PodDisruptionBudgets matching the selector.
|
||||
List(labels.Selector) ([]*policyv1beta1.PodDisruptionBudget, error)
|
||||
}
|
||||
|
||||
var _ ControllerLister = &EmptyControllerLister{}
|
||||
var _ corelisters.ReplicationControllerLister = &EmptyControllerLister{}
|
||||
|
||||
// EmptyControllerLister implements ControllerLister on []v1.ReplicationController returning empty data
|
||||
type EmptyControllerLister struct{}
|
||||
@ -93,28 +47,53 @@ func (f EmptyControllerLister) GetPodControllers(pod *v1.Pod) (controllers []*v1
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var _ ReplicaSetLister = &EmptyReplicaSetLister{}
|
||||
// ReplicationControllers returns nil
|
||||
func (f EmptyControllerLister) ReplicationControllers(namespace string) corelisters.ReplicationControllerNamespaceLister {
|
||||
return nil
|
||||
}
|
||||
|
||||
var _ appslisters.ReplicaSetLister = &EmptyReplicaSetLister{}
|
||||
|
||||
// EmptyReplicaSetLister implements ReplicaSetLister on []appsv1.ReplicaSet returning empty data
|
||||
type EmptyReplicaSetLister struct{}
|
||||
|
||||
// GetPodReplicaSets returns nil
|
||||
func (f EmptyReplicaSetLister) GetPodReplicaSets(pod *v1.Pod) (rss []*apps.ReplicaSet, err error) {
|
||||
// List returns nil
|
||||
func (f EmptyReplicaSetLister) List(labels.Selector) ([]*appsv1.ReplicaSet, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// GetPodReplicaSets returns nil
|
||||
func (f EmptyReplicaSetLister) GetPodReplicaSets(pod *v1.Pod) (rss []*appsv1.ReplicaSet, err error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// ReplicaSets returns nil
|
||||
func (f EmptyReplicaSetLister) ReplicaSets(namespace string) appslisters.ReplicaSetNamespaceLister {
|
||||
return nil
|
||||
}
|
||||
|
||||
// StatefulSetLister interface represents anything that can produce a list of StatefulSet; the list is consumed by a scheduler.
|
||||
type StatefulSetLister interface {
|
||||
// Gets the StatefulSet for the given pod.
|
||||
GetPodStatefulSets(*v1.Pod) ([]*apps.StatefulSet, error)
|
||||
}
|
||||
|
||||
var _ StatefulSetLister = &EmptyStatefulSetLister{}
|
||||
var _ appslisters.StatefulSetLister = &EmptyStatefulSetLister{}
|
||||
|
||||
// EmptyStatefulSetLister implements StatefulSetLister on []apps.StatefulSet returning empty data.
|
||||
type EmptyStatefulSetLister struct{}
|
||||
|
||||
// GetPodStatefulSets of EmptyStatefulSetLister returns nil.
|
||||
func (f EmptyStatefulSetLister) GetPodStatefulSets(pod *v1.Pod) (sss []*apps.StatefulSet, err error) {
|
||||
// List returns nil
|
||||
func (f EmptyStatefulSetLister) List(labels.Selector) ([]*appsv1.StatefulSet, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// GetPodStatefulSets of EmptyStatefulSetLister returns nil.
|
||||
func (f EmptyStatefulSetLister) GetPodStatefulSets(pod *v1.Pod) (sss []*appsv1.StatefulSet, err error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// StatefulSets returns nil
|
||||
func (f EmptyStatefulSetLister) StatefulSets(namespace string) appslisters.StatefulSetNamespaceLister {
|
||||
return nil
|
||||
}
|
||||
|
Reference in New Issue
Block a user