mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-01-22 12:49:29 +00:00
1854 lines
68 KiB
Go
1854 lines
68 KiB
Go
/*
|
|
Copyright 2016 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package kubelet
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"path"
|
|
"path/filepath"
|
|
"runtime"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/golang/glog"
|
|
"k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/api/errors"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/labels"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
"k8s.io/apimachinery/pkg/util/sets"
|
|
utilvalidation "k8s.io/apimachinery/pkg/util/validation"
|
|
"k8s.io/apimachinery/pkg/util/validation/field"
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
|
"k8s.io/client-go/tools/remotecommand"
|
|
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
|
|
"k8s.io/kubernetes/pkg/api/v1/resource"
|
|
podshelper "k8s.io/kubernetes/pkg/apis/core/pods"
|
|
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
|
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
|
|
"k8s.io/kubernetes/pkg/apis/core/v1/validation"
|
|
"k8s.io/kubernetes/pkg/features"
|
|
"k8s.io/kubernetes/pkg/fieldpath"
|
|
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
|
|
"k8s.io/kubernetes/pkg/kubelet/cm"
|
|
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
|
"k8s.io/kubernetes/pkg/kubelet/envvars"
|
|
"k8s.io/kubernetes/pkg/kubelet/eviction"
|
|
"k8s.io/kubernetes/pkg/kubelet/images"
|
|
"k8s.io/kubernetes/pkg/kubelet/server/portforward"
|
|
remotecommandserver "k8s.io/kubernetes/pkg/kubelet/server/remotecommand"
|
|
"k8s.io/kubernetes/pkg/kubelet/status"
|
|
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
|
|
"k8s.io/kubernetes/pkg/kubelet/util/format"
|
|
utilfile "k8s.io/kubernetes/pkg/util/file"
|
|
"k8s.io/kubernetes/pkg/volume"
|
|
volumeutil "k8s.io/kubernetes/pkg/volume/util"
|
|
"k8s.io/kubernetes/pkg/volume/util/volumehelper"
|
|
volumevalidation "k8s.io/kubernetes/pkg/volume/validation"
|
|
"k8s.io/kubernetes/third_party/forked/golang/expansion"
|
|
)
|
|
|
|
// Get a list of pods that have data directories.
|
|
func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
|
|
podInfos, err := ioutil.ReadDir(kl.getPodsDir())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
pods := []types.UID{}
|
|
for i := range podInfos {
|
|
if podInfos[i].IsDir() {
|
|
pods = append(pods, types.UID(podInfos[i].Name()))
|
|
}
|
|
}
|
|
return pods, nil
|
|
}
|
|
|
|
// GetActivePods returns non-terminal pods
|
|
func (kl *Kubelet) GetActivePods() []*v1.Pod {
|
|
allPods := kl.podManager.GetPods()
|
|
activePods := kl.filterOutTerminatedPods(allPods)
|
|
return activePods
|
|
}
|
|
|
|
// makeGPUDevices determines the devices for the given container.
|
|
// Experimental.
|
|
func (kl *Kubelet) makeGPUDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) {
|
|
if container.Resources.Limits.NvidiaGPU().IsZero() {
|
|
return nil, nil
|
|
}
|
|
|
|
nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var devices []kubecontainer.DeviceInfo
|
|
for _, path := range nvidiaGPUPaths {
|
|
// Devices have to be mapped one to one because of nvidia CUDA library requirements.
|
|
devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: path, Permissions: "mrw"})
|
|
}
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
// makeAbsolutePath turns a relative path into an absolute one for the given
// operating system name (a GOOS value).
//
// On anything other than "windows" a leading "/" is prepended. On windows:
//   - a path containing a colon is returned unchanged,
//   - a path starting with "/" or "\" gets the drive prefix "c:",
//   - any other path is prefixed with `c:\`.
func makeAbsolutePath(goos, path string) string {
	if goos != "windows" {
		return "/" + path
	}

	// Everything below is windows-specific.
	switch {
	case strings.Contains(path, ":"):
		// If there is a colon, give up.
		return path
	case strings.HasPrefix(path, "/"), strings.HasPrefix(path, "\\"):
		// Rooted path without a drive: add 'c:'.
		return "c:" + path
	default:
		// Otherwise, add 'c:\'.
		return "c:\\" + path
	}
}
|
|
|
|
// makeBlockVolumes maps the raw block devices specified in the path of the container
|
|
// Experimental
|
|
func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumeutil.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) {
|
|
var devices []kubecontainer.DeviceInfo
|
|
for _, device := range container.VolumeDevices {
|
|
// check path is absolute
|
|
if !filepath.IsAbs(device.DevicePath) {
|
|
return nil, fmt.Errorf("error DevicePath `%s` must be an absolute path", device.DevicePath)
|
|
}
|
|
vol, ok := podVolumes[device.Name]
|
|
if !ok || vol.BlockVolumeMapper == nil {
|
|
glog.Errorf("Block volume cannot be satisfied for container %q, because the volume is missing or the volume mapper is nil: %+v", container.Name, device)
|
|
return nil, fmt.Errorf("cannot find volume %q to pass into container %q", device.Name, container.Name)
|
|
}
|
|
// Get a symbolic link associated to a block device under pod device path
|
|
dirPath, volName := vol.BlockVolumeMapper.GetPodDeviceMapPath()
|
|
symlinkPath := path.Join(dirPath, volName)
|
|
if islinkExist, checkErr := blkutil.IsSymlinkExist(symlinkPath); checkErr != nil {
|
|
return nil, checkErr
|
|
} else if islinkExist {
|
|
// Check readOnly in PVCVolumeSource and set read only permission if it's true.
|
|
permission := "mrw"
|
|
if vol.ReadOnly {
|
|
permission = "r"
|
|
}
|
|
glog.V(4).Infof("Device will be attached to container %q. Path on host: %v", container.Name, symlinkPath)
|
|
devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: symlinkPath, PathInContainer: device.DevicePath, Permissions: permission})
|
|
}
|
|
}
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
// makeMounts determines the mount points for the given container.
|
|
func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain, podIP string, podVolumes kubecontainer.VolumeMap) ([]kubecontainer.Mount, error) {
|
|
// Kubernetes only mounts on /etc/hosts if:
|
|
// - container is not an infrastructure (pause) container
|
|
// - container is not already mounting on /etc/hosts
|
|
// - OS is not Windows
|
|
// Kubernetes will not mount /etc/hosts if:
|
|
// - when the Pod sandbox is being created, its IP is still unknown. Hence, PodIP will not have been set.
|
|
mountEtcHostsFile := len(podIP) > 0 && runtime.GOOS != "windows"
|
|
glog.V(3).Infof("container: %v/%v/%v podIP: %q creating hosts mount: %v", pod.Namespace, pod.Name, container.Name, podIP, mountEtcHostsFile)
|
|
mounts := []kubecontainer.Mount{}
|
|
for _, mount := range container.VolumeMounts {
|
|
// do not mount /etc/hosts if container is already mounting on the path
|
|
mountEtcHostsFile = mountEtcHostsFile && (mount.MountPath != etcHostsPath)
|
|
vol, ok := podVolumes[mount.Name]
|
|
if !ok || vol.Mounter == nil {
|
|
glog.Errorf("Mount cannot be satisfied for container %q, because the volume is missing or the volume mounter is nil: %+v", container.Name, mount)
|
|
return nil, fmt.Errorf("cannot find volume %q to mount into container %q", mount.Name, container.Name)
|
|
}
|
|
|
|
relabelVolume := false
|
|
// If the volume supports SELinux and it has not been
|
|
// relabeled already and it is not a read-only volume,
|
|
// relabel it and mark it as labeled
|
|
if vol.Mounter.GetAttributes().Managed && vol.Mounter.GetAttributes().SupportsSELinux && !vol.SELinuxLabeled {
|
|
vol.SELinuxLabeled = true
|
|
relabelVolume = true
|
|
}
|
|
hostPath, err := volume.GetPath(vol.Mounter)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if mount.SubPath != "" {
|
|
if filepath.IsAbs(mount.SubPath) {
|
|
return nil, fmt.Errorf("error SubPath `%s` must not be an absolute path", mount.SubPath)
|
|
}
|
|
|
|
err = volumevalidation.ValidatePathNoBacksteps(mount.SubPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to provision SubPath `%s`: %v", mount.SubPath, err)
|
|
}
|
|
|
|
fileinfo, err := os.Lstat(hostPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
perm := fileinfo.Mode()
|
|
|
|
hostPath = filepath.Join(hostPath, mount.SubPath)
|
|
|
|
if subPathExists, err := utilfile.FileOrSymlinkExists(hostPath); err != nil {
|
|
glog.Errorf("Could not determine if subPath %s exists; will not attempt to change its permissions", hostPath)
|
|
} else if !subPathExists {
|
|
// Create the sub path now because if it's auto-created later when referenced, it may have an
|
|
// incorrect ownership and mode. For example, the sub path directory must have at least g+rwx
|
|
// when the pod specifies an fsGroup, and if the directory is not created here, Docker will
|
|
// later auto-create it with the incorrect mode 0750
|
|
if err := os.MkdirAll(hostPath, perm); err != nil {
|
|
glog.Errorf("failed to mkdir:%s", hostPath)
|
|
return nil, err
|
|
}
|
|
|
|
// chmod the sub path because umask may have prevented us from making the sub path with the same
|
|
// permissions as the mounter path
|
|
if err := os.Chmod(hostPath, perm); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
}
|
|
|
|
// Docker Volume Mounts fail on Windows if it is not of the form C:/
|
|
containerPath := mount.MountPath
|
|
if runtime.GOOS == "windows" {
|
|
if (strings.HasPrefix(hostPath, "/") || strings.HasPrefix(hostPath, "\\")) && !strings.Contains(hostPath, ":") {
|
|
hostPath = "c:" + hostPath
|
|
}
|
|
}
|
|
if !filepath.IsAbs(containerPath) {
|
|
containerPath = makeAbsolutePath(runtime.GOOS, containerPath)
|
|
}
|
|
|
|
propagation, err := translateMountPropagation(mount.MountPropagation)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
glog.V(5).Infof("Pod %q container %q mount %q has propagation %q", format.Pod(pod), container.Name, mount.Name, propagation)
|
|
|
|
mounts = append(mounts, kubecontainer.Mount{
|
|
Name: mount.Name,
|
|
ContainerPath: containerPath,
|
|
HostPath: hostPath,
|
|
ReadOnly: mount.ReadOnly,
|
|
SELinuxRelabel: relabelVolume,
|
|
Propagation: propagation,
|
|
})
|
|
}
|
|
if mountEtcHostsFile {
|
|
hostAliases := pod.Spec.HostAliases
|
|
hostsMount, err := makeHostsMount(podDir, podIP, hostName, hostDomain, hostAliases, pod.Spec.HostNetwork)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
mounts = append(mounts, *hostsMount)
|
|
}
|
|
return mounts, nil
|
|
}
|
|
|
|
// translateMountPropagation transforms v1.MountPropagationMode to
|
|
// runtimeapi.MountPropagation.
|
|
func translateMountPropagation(mountMode *v1.MountPropagationMode) (runtimeapi.MountPropagation, error) {
|
|
if !utilfeature.DefaultFeatureGate.Enabled(features.MountPropagation) {
|
|
// mount propagation is disabled, use private as in the old versions
|
|
return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
|
|
}
|
|
switch {
|
|
case mountMode == nil:
|
|
// HostToContainer is the default
|
|
return runtimeapi.MountPropagation_PROPAGATION_HOST_TO_CONTAINER, nil
|
|
case *mountMode == v1.MountPropagationHostToContainer:
|
|
return runtimeapi.MountPropagation_PROPAGATION_HOST_TO_CONTAINER, nil
|
|
case *mountMode == v1.MountPropagationBidirectional:
|
|
return runtimeapi.MountPropagation_PROPAGATION_BIDIRECTIONAL, nil
|
|
default:
|
|
return 0, fmt.Errorf("invalid MountPropagation mode: %q", mountMode)
|
|
}
|
|
}
|
|
|
|
// makeHostsMount makes the mountpoint for the hosts file that the containers
|
|
// in a pod are injected with.
|
|
func makeHostsMount(podDir, podIP, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) (*kubecontainer.Mount, error) {
|
|
hostsFilePath := path.Join(podDir, "etc-hosts")
|
|
if err := ensureHostsFile(hostsFilePath, podIP, hostName, hostDomainName, hostAliases, useHostNetwork); err != nil {
|
|
return nil, err
|
|
}
|
|
return &kubecontainer.Mount{
|
|
Name: "k8s-managed-etc-hosts",
|
|
ContainerPath: etcHostsPath,
|
|
HostPath: hostsFilePath,
|
|
ReadOnly: false,
|
|
SELinuxRelabel: true,
|
|
}, nil
|
|
}
|
|
|
|
// ensureHostsFile ensures that the given host file has an up-to-date ip, host
|
|
// name, and domain name.
|
|
func ensureHostsFile(fileName, hostIP, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) error {
|
|
var hostsFileContent []byte
|
|
var err error
|
|
|
|
if useHostNetwork {
|
|
// if Pod is using host network, read hosts file from the node's filesystem.
|
|
// `etcHostsPath` references the location of the hosts file on the node.
|
|
// `/etc/hosts` for *nix systems.
|
|
hostsFileContent, err = nodeHostsFileContent(etcHostsPath, hostAliases)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
// if Pod is not using host network, create a managed hosts file with Pod IP and other information.
|
|
hostsFileContent = managedHostsFileContent(hostIP, hostName, hostDomainName, hostAliases)
|
|
}
|
|
|
|
return ioutil.WriteFile(fileName, hostsFileContent, 0644)
|
|
}
|
|
|
|
// nodeHostsFileContent reads the content of node's hosts file.
|
|
func nodeHostsFileContent(hostsFilePath string, hostAliases []v1.HostAlias) ([]byte, error) {
|
|
hostsFileContent, err := ioutil.ReadFile(hostsFilePath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
hostsFileContent = append(hostsFileContent, hostsEntriesFromHostAliases(hostAliases)...)
|
|
return hostsFileContent, nil
|
|
}
|
|
|
|
// managedHostsFileContent generates the content of the managed etc hosts based on Pod IP and other
|
|
// information.
|
|
func managedHostsFileContent(hostIP, hostName, hostDomainName string, hostAliases []v1.HostAlias) []byte {
|
|
var buffer bytes.Buffer
|
|
buffer.WriteString("# Kubernetes-managed hosts file.\n")
|
|
buffer.WriteString("127.0.0.1\tlocalhost\n") // ipv4 localhost
|
|
buffer.WriteString("::1\tlocalhost ip6-localhost ip6-loopback\n") // ipv6 localhost
|
|
buffer.WriteString("fe00::0\tip6-localnet\n")
|
|
buffer.WriteString("fe00::0\tip6-mcastprefix\n")
|
|
buffer.WriteString("fe00::1\tip6-allnodes\n")
|
|
buffer.WriteString("fe00::2\tip6-allrouters\n")
|
|
if len(hostDomainName) > 0 {
|
|
buffer.WriteString(fmt.Sprintf("%s\t%s.%s\t%s\n", hostIP, hostName, hostDomainName, hostName))
|
|
} else {
|
|
buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostIP, hostName))
|
|
}
|
|
hostsFileContent := buffer.Bytes()
|
|
hostsFileContent = append(hostsFileContent, hostsEntriesFromHostAliases(hostAliases)...)
|
|
return hostsFileContent
|
|
}
|
|
|
|
func hostsEntriesFromHostAliases(hostAliases []v1.HostAlias) []byte {
|
|
if len(hostAliases) == 0 {
|
|
return []byte{}
|
|
}
|
|
|
|
var buffer bytes.Buffer
|
|
buffer.WriteString("\n")
|
|
buffer.WriteString("# Entries added by HostAliases.\n")
|
|
// write each IP/hostname pair as an entry into hosts file
|
|
for _, hostAlias := range hostAliases {
|
|
for _, hostname := range hostAlias.Hostnames {
|
|
buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostAlias.IP, hostname))
|
|
}
|
|
}
|
|
return buffer.Bytes()
|
|
}
|
|
|
|
// truncatePodHostnameIfNeeded truncates the pod hostname if it's longer than 63 chars.
|
|
func truncatePodHostnameIfNeeded(podName, hostname string) (string, error) {
|
|
// Cap hostname at 63 chars (specification is 64bytes which is 63 chars and the null terminating char).
|
|
const hostnameMaxLen = 63
|
|
if len(hostname) <= hostnameMaxLen {
|
|
return hostname, nil
|
|
}
|
|
truncated := hostname[:hostnameMaxLen]
|
|
glog.Errorf("hostname for pod:%q was longer than %d. Truncated hostname to :%q", podName, hostnameMaxLen, truncated)
|
|
// hostname should not end with '-' or '.'
|
|
truncated = strings.TrimRight(truncated, "-.")
|
|
if len(truncated) == 0 {
|
|
// This should never happen.
|
|
return "", fmt.Errorf("hostname for pod %q was invalid: %q", podName, hostname)
|
|
}
|
|
return truncated, nil
|
|
}
|
|
|
|
// GeneratePodHostNameAndDomain creates a hostname and domain name for a pod,
|
|
// given that pod's spec and annotations or returns an error.
|
|
func (kl *Kubelet) GeneratePodHostNameAndDomain(pod *v1.Pod) (string, string, error) {
|
|
// TODO(vmarmol): Handle better.
|
|
clusterDomain := kl.dnsConfigurer.ClusterDomain
|
|
|
|
hostname := pod.Name
|
|
if len(pod.Spec.Hostname) > 0 {
|
|
if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Hostname); len(msgs) != 0 {
|
|
return "", "", fmt.Errorf("Pod Hostname %q is not a valid DNS label: %s", pod.Spec.Hostname, strings.Join(msgs, ";"))
|
|
}
|
|
hostname = pod.Spec.Hostname
|
|
}
|
|
|
|
hostname, err := truncatePodHostnameIfNeeded(pod.Name, hostname)
|
|
if err != nil {
|
|
return "", "", err
|
|
}
|
|
|
|
hostDomain := ""
|
|
if len(pod.Spec.Subdomain) > 0 {
|
|
if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Subdomain); len(msgs) != 0 {
|
|
return "", "", fmt.Errorf("Pod Subdomain %q is not a valid DNS label: %s", pod.Spec.Subdomain, strings.Join(msgs, ";"))
|
|
}
|
|
hostDomain = fmt.Sprintf("%s.%s.svc.%s", pod.Spec.Subdomain, pod.Namespace, clusterDomain)
|
|
}
|
|
|
|
return hostname, hostDomain, nil
|
|
}
|
|
|
|
// GetPodCgroupParent gets pod cgroup parent from container manager.
|
|
func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string {
|
|
pcm := kl.containerManager.NewPodContainerManager()
|
|
_, cgroupParent := pcm.GetPodContainerName(pod)
|
|
return cgroupParent
|
|
}
|
|
|
|
// GenerateRunContainerOptions generates the RunContainerOptions, which can be used by
|
|
// the container runtime to set parameters for launching a container.
|
|
func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Container, podIP string) (*kubecontainer.RunContainerOptions, error) {
|
|
opts, err := kl.containerManager.GetResources(pod, container)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
cgroupParent := kl.GetPodCgroupParent(pod)
|
|
opts.CgroupParent = cgroupParent
|
|
hostname, hostDomainName, err := kl.GeneratePodHostNameAndDomain(pod)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
opts.Hostname = hostname
|
|
podName := volumehelper.GetUniquePodName(pod)
|
|
volumes := kl.volumeManager.GetMountedVolumesForPod(podName)
|
|
|
|
opts.PortMappings = kubecontainer.MakePortMappings(container)
|
|
// TODO(random-liu): Move following convert functions into pkg/kubelet/container
|
|
devices, err := kl.makeGPUDevices(pod, container)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
opts.Devices = append(opts.Devices, devices...)
|
|
|
|
// TODO: remove feature gate check after no longer needed
|
|
if utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {
|
|
blkutil := volumeutil.NewBlockVolumePathHandler()
|
|
blkVolumes, err := kl.makeBlockVolumes(pod, container, volumes, blkutil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
opts.Devices = append(opts.Devices, blkVolumes...)
|
|
}
|
|
|
|
mounts, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
opts.Mounts = append(opts.Mounts, mounts...)
|
|
|
|
envs, err := kl.makeEnvironmentVariables(pod, container, podIP)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
opts.Envs = append(opts.Envs, envs...)
|
|
|
|
// Disabling adding TerminationMessagePath on Windows as these files would be mounted as docker volume and
|
|
// Docker for Windows has a bug where only directories can be mounted
|
|
if len(container.TerminationMessagePath) != 0 && runtime.GOOS != "windows" {
|
|
p := kl.getPodContainerDir(pod.UID, container.Name)
|
|
if err := os.MkdirAll(p, 0750); err != nil {
|
|
glog.Errorf("Error on creating %q: %v", p, err)
|
|
} else {
|
|
opts.PodContainerDir = p
|
|
}
|
|
}
|
|
|
|
// only do this check if the experimental behavior is enabled, otherwise allow it to default to false
|
|
if kl.experimentalHostUserNamespaceDefaulting {
|
|
opts.EnableHostUserNamespace = kl.enableHostUserNamespace(pod)
|
|
}
|
|
|
|
return opts, nil
|
|
}
|
|
|
|
// masterServices is the set of service names that getServiceEnvVarMap also
// exposes from the master service namespace to pods in other namespaces.
var masterServices = sets.NewString("kubernetes")
|
|
|
|
// getServiceEnvVarMap makes a map[string]string of env vars for services a
|
|
// pod in namespace ns should see.
|
|
func (kl *Kubelet) getServiceEnvVarMap(ns string) (map[string]string, error) {
|
|
var (
|
|
serviceMap = make(map[string]*v1.Service)
|
|
m = make(map[string]string)
|
|
)
|
|
|
|
// Get all service resources from the master (via a cache),
|
|
// and populate them into service environment variables.
|
|
if kl.serviceLister == nil {
|
|
// Kubelets without masters (e.g. plain GCE ContainerVM) don't set env vars.
|
|
return m, nil
|
|
}
|
|
services, err := kl.serviceLister.List(labels.Everything())
|
|
if err != nil {
|
|
return m, fmt.Errorf("failed to list services when setting up env vars")
|
|
}
|
|
|
|
// project the services in namespace ns onto the master services
|
|
for i := range services {
|
|
service := services[i]
|
|
// ignore services where ClusterIP is "None" or empty
|
|
if !v1helper.IsServiceIPSet(service) {
|
|
continue
|
|
}
|
|
serviceName := service.Name
|
|
|
|
switch service.Namespace {
|
|
// for the case whether the master service namespace is the namespace the pod
|
|
// is in, the pod should receive all the services in the namespace.
|
|
//
|
|
// ordering of the case clauses below enforces this
|
|
case ns:
|
|
serviceMap[serviceName] = service
|
|
case kl.masterServiceNamespace:
|
|
if masterServices.Has(serviceName) {
|
|
if _, exists := serviceMap[serviceName]; !exists {
|
|
serviceMap[serviceName] = service
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
mappedServices := []*v1.Service{}
|
|
for key := range serviceMap {
|
|
mappedServices = append(mappedServices, serviceMap[key])
|
|
}
|
|
|
|
for _, e := range envvars.FromServices(mappedServices) {
|
|
m[e.Name] = e.Value
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
// Make the environment variables for a pod in the given namespace.
|
|
func (kl *Kubelet) makeEnvironmentVariables(pod *v1.Pod, container *v1.Container, podIP string) ([]kubecontainer.EnvVar, error) {
|
|
var result []kubecontainer.EnvVar
|
|
// Note: These are added to the docker Config, but are not included in the checksum computed
|
|
// by kubecontainer.HashContainer(...). That way, we can still determine whether an
|
|
// v1.Container is already running by its hash. (We don't want to restart a container just
|
|
// because some service changed.)
|
|
//
|
|
// Note that there is a race between Kubelet seeing the pod and kubelet seeing the service.
|
|
// To avoid this users can: (1) wait between starting a service and starting; or (2) detect
|
|
// missing service env var and exit and be restarted; or (3) use DNS instead of env vars
|
|
// and keep trying to resolve the DNS name of the service (recommended).
|
|
serviceEnv, err := kl.getServiceEnvVarMap(pod.Namespace)
|
|
if err != nil {
|
|
return result, err
|
|
}
|
|
|
|
var (
|
|
configMaps = make(map[string]*v1.ConfigMap)
|
|
secrets = make(map[string]*v1.Secret)
|
|
tmpEnv = make(map[string]string)
|
|
)
|
|
|
|
// Env will override EnvFrom variables.
|
|
// Process EnvFrom first then allow Env to replace existing values.
|
|
for _, envFrom := range container.EnvFrom {
|
|
switch {
|
|
case envFrom.ConfigMapRef != nil:
|
|
cm := envFrom.ConfigMapRef
|
|
name := cm.Name
|
|
configMap, ok := configMaps[name]
|
|
if !ok {
|
|
if kl.kubeClient == nil {
|
|
return result, fmt.Errorf("Couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name)
|
|
}
|
|
optional := cm.Optional != nil && *cm.Optional
|
|
configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name)
|
|
if err != nil {
|
|
if errors.IsNotFound(err) && optional {
|
|
// ignore error when marked optional
|
|
continue
|
|
}
|
|
return result, err
|
|
}
|
|
configMaps[name] = configMap
|
|
}
|
|
|
|
invalidKeys := []string{}
|
|
for k, v := range configMap.Data {
|
|
if len(envFrom.Prefix) > 0 {
|
|
k = envFrom.Prefix + k
|
|
}
|
|
if errMsgs := utilvalidation.IsEnvVarName(k); len(errMsgs) != 0 {
|
|
invalidKeys = append(invalidKeys, k)
|
|
continue
|
|
}
|
|
tmpEnv[k] = v
|
|
}
|
|
if len(invalidKeys) > 0 {
|
|
sort.Strings(invalidKeys)
|
|
kl.recorder.Eventf(pod, v1.EventTypeWarning, "InvalidEnvironmentVariableNames", "Keys [%s] from the EnvFrom configMap %s/%s were skipped since they are considered invalid environment variable names.", strings.Join(invalidKeys, ", "), pod.Namespace, name)
|
|
}
|
|
case envFrom.SecretRef != nil:
|
|
s := envFrom.SecretRef
|
|
name := s.Name
|
|
secret, ok := secrets[name]
|
|
if !ok {
|
|
if kl.kubeClient == nil {
|
|
return result, fmt.Errorf("Couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name)
|
|
}
|
|
optional := s.Optional != nil && *s.Optional
|
|
secret, err = kl.secretManager.GetSecret(pod.Namespace, name)
|
|
if err != nil {
|
|
if errors.IsNotFound(err) && optional {
|
|
// ignore error when marked optional
|
|
continue
|
|
}
|
|
return result, err
|
|
}
|
|
secrets[name] = secret
|
|
}
|
|
|
|
invalidKeys := []string{}
|
|
for k, v := range secret.Data {
|
|
if len(envFrom.Prefix) > 0 {
|
|
k = envFrom.Prefix + k
|
|
}
|
|
if errMsgs := utilvalidation.IsEnvVarName(k); len(errMsgs) != 0 {
|
|
invalidKeys = append(invalidKeys, k)
|
|
continue
|
|
}
|
|
tmpEnv[k] = string(v)
|
|
}
|
|
if len(invalidKeys) > 0 {
|
|
sort.Strings(invalidKeys)
|
|
kl.recorder.Eventf(pod, v1.EventTypeWarning, "InvalidEnvironmentVariableNames", "Keys [%s] from the EnvFrom secret %s/%s were skipped since they are considered invalid environment variable names.", strings.Join(invalidKeys, ", "), pod.Namespace, name)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Determine the final values of variables:
|
|
//
|
|
// 1. Determine the final value of each variable:
|
|
// a. If the variable's Value is set, expand the `$(var)` references to other
|
|
// variables in the .Value field; the sources of variables are the declared
|
|
// variables of the container and the service environment variables
|
|
// b. If a source is defined for an environment variable, resolve the source
|
|
// 2. Create the container's environment in the order variables are declared
|
|
// 3. Add remaining service environment vars
|
|
var (
|
|
mappingFunc = expansion.MappingFuncFor(tmpEnv, serviceEnv)
|
|
)
|
|
for _, envVar := range container.Env {
|
|
runtimeVal := envVar.Value
|
|
if runtimeVal != "" {
|
|
// Step 1a: expand variable references
|
|
runtimeVal = expansion.Expand(runtimeVal, mappingFunc)
|
|
} else if envVar.ValueFrom != nil {
|
|
// Step 1b: resolve alternate env var sources
|
|
switch {
|
|
case envVar.ValueFrom.FieldRef != nil:
|
|
runtimeVal, err = kl.podFieldSelectorRuntimeValue(envVar.ValueFrom.FieldRef, pod, podIP)
|
|
if err != nil {
|
|
return result, err
|
|
}
|
|
case envVar.ValueFrom.ResourceFieldRef != nil:
|
|
defaultedPod, defaultedContainer, err := kl.defaultPodLimitsForDownwardAPI(pod, container)
|
|
if err != nil {
|
|
return result, err
|
|
}
|
|
runtimeVal, err = containerResourceRuntimeValue(envVar.ValueFrom.ResourceFieldRef, defaultedPod, defaultedContainer)
|
|
if err != nil {
|
|
return result, err
|
|
}
|
|
case envVar.ValueFrom.ConfigMapKeyRef != nil:
|
|
cm := envVar.ValueFrom.ConfigMapKeyRef
|
|
name := cm.Name
|
|
key := cm.Key
|
|
optional := cm.Optional != nil && *cm.Optional
|
|
configMap, ok := configMaps[name]
|
|
if !ok {
|
|
if kl.kubeClient == nil {
|
|
return result, fmt.Errorf("Couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name)
|
|
}
|
|
configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name)
|
|
if err != nil {
|
|
if errors.IsNotFound(err) && optional {
|
|
// ignore error when marked optional
|
|
continue
|
|
}
|
|
return result, err
|
|
}
|
|
configMaps[name] = configMap
|
|
}
|
|
runtimeVal, ok = configMap.Data[key]
|
|
if !ok {
|
|
if optional {
|
|
continue
|
|
}
|
|
return result, fmt.Errorf("Couldn't find key %v in ConfigMap %v/%v", key, pod.Namespace, name)
|
|
}
|
|
case envVar.ValueFrom.SecretKeyRef != nil:
|
|
s := envVar.ValueFrom.SecretKeyRef
|
|
name := s.Name
|
|
key := s.Key
|
|
optional := s.Optional != nil && *s.Optional
|
|
secret, ok := secrets[name]
|
|
if !ok {
|
|
if kl.kubeClient == nil {
|
|
return result, fmt.Errorf("Couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name)
|
|
}
|
|
secret, err = kl.secretManager.GetSecret(pod.Namespace, name)
|
|
if err != nil {
|
|
if errors.IsNotFound(err) && optional {
|
|
// ignore error when marked optional
|
|
continue
|
|
}
|
|
return result, err
|
|
}
|
|
secrets[name] = secret
|
|
}
|
|
runtimeValBytes, ok := secret.Data[key]
|
|
if !ok {
|
|
if optional {
|
|
continue
|
|
}
|
|
return result, fmt.Errorf("Couldn't find key %v in Secret %v/%v", key, pod.Namespace, name)
|
|
}
|
|
runtimeVal = string(runtimeValBytes)
|
|
}
|
|
}
|
|
// Accesses apiserver+Pods.
|
|
// So, the master may set service env vars, or kubelet may. In case both are doing
|
|
// it, we delete the key from the kubelet-generated ones so we don't have duplicate
|
|
// env vars.
|
|
// TODO: remove this next line once all platforms use apiserver+Pods.
|
|
delete(serviceEnv, envVar.Name)
|
|
|
|
tmpEnv[envVar.Name] = runtimeVal
|
|
}
|
|
|
|
// Append the env vars
|
|
for k, v := range tmpEnv {
|
|
result = append(result, kubecontainer.EnvVar{Name: k, Value: v})
|
|
}
|
|
|
|
// Append remaining service env vars.
|
|
for k, v := range serviceEnv {
|
|
// Accesses apiserver+Pods.
|
|
// So, the master may set service env vars, or kubelet may. In case both are doing
|
|
// it, we skip the key from the kubelet-generated ones so we don't have duplicate
|
|
// env vars.
|
|
// TODO: remove this next line once all platforms use apiserver+Pods.
|
|
if _, present := tmpEnv[k]; !present {
|
|
result = append(result, kubecontainer.EnvVar{Name: k, Value: v})
|
|
}
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
// podFieldSelectorRuntimeValue returns the runtime value of the given
|
|
// selector for a pod.
|
|
func (kl *Kubelet) podFieldSelectorRuntimeValue(fs *v1.ObjectFieldSelector, pod *v1.Pod, podIP string) (string, error) {
|
|
internalFieldPath, _, err := podshelper.ConvertDownwardAPIFieldLabel(fs.APIVersion, fs.FieldPath, "")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
switch internalFieldPath {
|
|
case "spec.nodeName":
|
|
return pod.Spec.NodeName, nil
|
|
case "spec.serviceAccountName":
|
|
return pod.Spec.ServiceAccountName, nil
|
|
case "status.hostIP":
|
|
hostIP, err := kl.getHostIPAnyWay()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return hostIP.String(), nil
|
|
case "status.podIP":
|
|
return podIP, nil
|
|
}
|
|
return fieldpath.ExtractFieldPathAsString(pod, internalFieldPath)
|
|
}
|
|
|
|
// containerResourceRuntimeValue returns the value of the provided container resource
|
|
func containerResourceRuntimeValue(fs *v1.ResourceFieldSelector, pod *v1.Pod, container *v1.Container) (string, error) {
|
|
containerName := fs.ContainerName
|
|
if len(containerName) == 0 {
|
|
return resource.ExtractContainerResourceValue(fs, container)
|
|
}
|
|
return resource.ExtractResourceValueByContainerName(fs, pod, containerName)
|
|
}
|
|
|
|
// One of the following arguments must be non-nil: runningPod, status.
|
|
// TODO: Modify containerRuntime.KillPod() to accept the right arguments.
|
|
func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus, gracePeriodOverride *int64) error {
|
|
var p kubecontainer.Pod
|
|
if runningPod != nil {
|
|
p = *runningPod
|
|
} else if status != nil {
|
|
p = kubecontainer.ConvertPodStatusToRunningPod(kl.GetRuntime().Type(), status)
|
|
} else {
|
|
return fmt.Errorf("one of the two arguments must be non-nil: runningPod, status")
|
|
}
|
|
|
|
// Call the container runtime KillPod method which stops all running containers of the pod
|
|
if err := kl.containerRuntime.KillPod(pod, p, gracePeriodOverride); err != nil {
|
|
return err
|
|
}
|
|
if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
|
|
glog.V(2).Infof("Failed to update QoS cgroups while killing pod: %v", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// makePodDataDirs creates the dirs for the pod datas.
|
|
func (kl *Kubelet) makePodDataDirs(pod *v1.Pod) error {
|
|
uid := pod.UID
|
|
if err := os.MkdirAll(kl.getPodDir(uid), 0750); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
if err := os.MkdirAll(kl.getPodVolumesDir(uid), 0750); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
if err := os.MkdirAll(kl.getPodPluginsDir(uid), 0750); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// getPullSecretsForPod inspects the Pod and retrieves the referenced pull
|
|
// secrets.
|
|
func (kl *Kubelet) getPullSecretsForPod(pod *v1.Pod) []v1.Secret {
|
|
pullSecrets := []v1.Secret{}
|
|
|
|
for _, secretRef := range pod.Spec.ImagePullSecrets {
|
|
secret, err := kl.secretManager.GetSecret(pod.Namespace, secretRef.Name)
|
|
if err != nil {
|
|
glog.Warningf("Unable to retrieve pull secret %s/%s for %s/%s due to %v. The image pull may not succeed.", pod.Namespace, secretRef.Name, pod.Namespace, pod.Name, err)
|
|
continue
|
|
}
|
|
|
|
pullSecrets = append(pullSecrets, *secret)
|
|
}
|
|
|
|
return pullSecrets
|
|
}
|
|
|
|
// podIsTerminated returns true if pod is in the terminated state ("Failed" or "Succeeded").
|
|
func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
|
|
// Check the cached pod status which was set after the last sync.
|
|
status, ok := kl.statusManager.GetPodStatus(pod.UID)
|
|
if !ok {
|
|
// If there is no cached status, use the status from the
|
|
// apiserver. This is useful if kubelet has recently been
|
|
// restarted.
|
|
status = pod.Status
|
|
}
|
|
return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
|
|
}
|
|
|
|
// IsPodTerminated returns trus if the pod with the provided UID is in a terminated state ("Failed" or "Succeeded")
|
|
// or if the pod has been deleted or removed
|
|
func (kl *Kubelet) IsPodTerminated(uid types.UID) bool {
|
|
pod, podFound := kl.podManager.GetPodByUID(uid)
|
|
if !podFound {
|
|
return true
|
|
}
|
|
return kl.podIsTerminated(pod)
|
|
}
|
|
|
|
// IsPodDeleted returns true if the pod is deleted. For the pod to be deleted, either:
|
|
// 1. The pod object is deleted
|
|
// 2. The pod's status is evicted
|
|
// 3. The pod's deletion timestamp is set, and containers are not running
|
|
func (kl *Kubelet) IsPodDeleted(uid types.UID) bool {
|
|
pod, podFound := kl.podManager.GetPodByUID(uid)
|
|
if !podFound {
|
|
return true
|
|
}
|
|
status, statusFound := kl.statusManager.GetPodStatus(pod.UID)
|
|
if !statusFound {
|
|
status = pod.Status
|
|
}
|
|
return eviction.PodIsEvicted(status) || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
|
|
}
|
|
|
|
// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
|
|
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
|
|
func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool {
|
|
if !notRunning(status.ContainerStatuses) {
|
|
// We shouldnt delete pods that still have running containers
|
|
glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod))
|
|
return false
|
|
}
|
|
// pod's containers should be deleted
|
|
runtimeStatus, err := kl.podCache.Get(pod.UID)
|
|
if err != nil {
|
|
glog.V(3).Infof("Pod %q is terminated, Error getting runtimeStatus from the podCache: %s", format.Pod(pod), err)
|
|
return false
|
|
}
|
|
if len(runtimeStatus.ContainerStatuses) > 0 {
|
|
glog.V(3).Infof("Pod %q is terminated, but some containers have not been cleaned up: %+v", format.Pod(pod), runtimeStatus.ContainerStatuses)
|
|
return false
|
|
}
|
|
if kl.podVolumesExist(pod.UID) && !kl.keepTerminatedPodVolumes {
|
|
// We shouldnt delete pods whose volumes have not been cleaned up if we are not keeping terminated pod volumes
|
|
glog.V(3).Infof("Pod %q is terminated, but some volumes have not been cleaned up", format.Pod(pod))
|
|
return false
|
|
}
|
|
if kl.kubeletConfiguration.CgroupsPerQOS {
|
|
pcm := kl.containerManager.NewPodContainerManager()
|
|
if pcm.Exists(pod) {
|
|
glog.V(3).Infof("Pod %q is terminated, but pod cgroup sandbox has not been cleaned up", format.Pod(pod))
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status.
|
|
func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
|
|
status, ok := kl.statusManager.GetPodStatus(pod.UID)
|
|
if !ok {
|
|
status = pod.Status
|
|
}
|
|
return kl.PodResourcesAreReclaimed(pod, status)
|
|
}
|
|
|
|
// notRunning returns true if every status is terminated or waiting, or the status list
|
|
// is empty.
|
|
func notRunning(statuses []v1.ContainerStatus) bool {
|
|
for _, status := range statuses {
|
|
if status.State.Terminated == nil && status.State.Waiting == nil {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// filterOutTerminatedPods returns the given pods which the status manager
|
|
// does not consider failed or succeeded.
|
|
func (kl *Kubelet) filterOutTerminatedPods(pods []*v1.Pod) []*v1.Pod {
|
|
var filteredPods []*v1.Pod
|
|
for _, p := range pods {
|
|
if kl.podIsTerminated(p) {
|
|
continue
|
|
}
|
|
filteredPods = append(filteredPods, p)
|
|
}
|
|
return filteredPods
|
|
}
|
|
|
|
// removeOrphanedPodStatuses removes obsolete entries in podStatus where
|
|
// the pod is no longer considered bound to this node.
|
|
func (kl *Kubelet) removeOrphanedPodStatuses(pods []*v1.Pod, mirrorPods []*v1.Pod) {
|
|
podUIDs := make(map[types.UID]bool)
|
|
for _, pod := range pods {
|
|
podUIDs[pod.UID] = true
|
|
}
|
|
for _, pod := range mirrorPods {
|
|
podUIDs[pod.UID] = true
|
|
}
|
|
kl.statusManager.RemoveOrphanedStatuses(podUIDs)
|
|
}
|
|
|
|
// HandlePodCleanups performs a series of cleanup work, including terminating
|
|
// pod workers, killing unwanted pods, and removing orphaned volumes/pod
|
|
// directories.
|
|
// NOTE: This function is executed by the main sync loop, so it
|
|
// should not contain any blocking calls.
|
|
func (kl *Kubelet) HandlePodCleanups() error {
|
|
// The kubelet lacks checkpointing, so we need to introspect the set of pods
|
|
// in the cgroup tree prior to inspecting the set of pods in our pod manager.
|
|
// this ensures our view of the cgroup tree does not mistakenly observe pods
|
|
// that are added after the fact...
|
|
var (
|
|
cgroupPods map[types.UID]cm.CgroupName
|
|
err error
|
|
)
|
|
if kl.cgroupsPerQOS {
|
|
pcm := kl.containerManager.NewPodContainerManager()
|
|
cgroupPods, err = pcm.GetAllPodsFromCgroups()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get list of pods that still exist on cgroup mounts: %v", err)
|
|
}
|
|
}
|
|
|
|
allPods, mirrorPods := kl.podManager.GetPodsAndMirrorPods()
|
|
// Pod phase progresses monotonically. Once a pod has reached a final state,
|
|
// it should never leave regardless of the restart policy. The statuses
|
|
// of such pods should not be changed, and there is no need to sync them.
|
|
// TODO: the logic here does not handle two cases:
|
|
// 1. If the containers were removed immediately after they died, kubelet
|
|
// may fail to generate correct statuses, let alone filtering correctly.
|
|
// 2. If kubelet restarted before writing the terminated status for a pod
|
|
// to the apiserver, it could still restart the terminated pod (even
|
|
// though the pod was not considered terminated by the apiserver).
|
|
// These two conditions could be alleviated by checkpointing kubelet.
|
|
activePods := kl.filterOutTerminatedPods(allPods)
|
|
|
|
desiredPods := make(map[types.UID]empty)
|
|
for _, pod := range activePods {
|
|
desiredPods[pod.UID] = empty{}
|
|
}
|
|
// Stop the workers for no-longer existing pods.
|
|
// TODO: is here the best place to forget pod workers?
|
|
kl.podWorkers.ForgetNonExistingPodWorkers(desiredPods)
|
|
kl.probeManager.CleanupPods(activePods)
|
|
|
|
runningPods, err := kl.runtimeCache.GetPods()
|
|
if err != nil {
|
|
glog.Errorf("Error listing containers: %#v", err)
|
|
return err
|
|
}
|
|
for _, pod := range runningPods {
|
|
if _, found := desiredPods[pod.ID]; !found {
|
|
kl.podKillingCh <- &kubecontainer.PodPair{APIPod: nil, RunningPod: pod}
|
|
}
|
|
}
|
|
|
|
kl.removeOrphanedPodStatuses(allPods, mirrorPods)
|
|
// Note that we just killed the unwanted pods. This may not have reflected
|
|
// in the cache. We need to bypass the cache to get the latest set of
|
|
// running pods to clean up the volumes.
|
|
// TODO: Evaluate the performance impact of bypassing the runtime cache.
|
|
runningPods, err = kl.containerRuntime.GetPods(false)
|
|
if err != nil {
|
|
glog.Errorf("Error listing containers: %#v", err)
|
|
return err
|
|
}
|
|
|
|
// Remove any orphaned volumes.
|
|
// Note that we pass all pods (including terminated pods) to the function,
|
|
// so that we don't remove volumes associated with terminated but not yet
|
|
// deleted pods.
|
|
err = kl.cleanupOrphanedPodDirs(allPods, runningPods)
|
|
if err != nil {
|
|
// We want all cleanup tasks to be run even if one of them failed. So
|
|
// we just log an error here and continue other cleanup tasks.
|
|
// This also applies to the other clean up tasks.
|
|
glog.Errorf("Failed cleaning up orphaned pod directories: %v", err)
|
|
}
|
|
|
|
// Remove any orphaned mirror pods.
|
|
kl.podManager.DeleteOrphanedMirrorPods()
|
|
|
|
// Remove any cgroups in the hierarchy for pods that are no longer running.
|
|
if kl.cgroupsPerQOS {
|
|
kl.cleanupOrphanedPodCgroups(cgroupPods, activePods)
|
|
}
|
|
|
|
kl.backOff.GC()
|
|
return nil
|
|
}
|
|
|
|
// podKiller launches a goroutine to kill a pod received from the channel if
|
|
// another goroutine isn't already in action.
|
|
func (kl *Kubelet) podKiller() {
|
|
killing := sets.NewString()
|
|
// guard for the killing set
|
|
lock := sync.Mutex{}
|
|
for {
|
|
select {
|
|
case podPair, ok := <-kl.podKillingCh:
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
runningPod := podPair.RunningPod
|
|
apiPod := podPair.APIPod
|
|
|
|
lock.Lock()
|
|
exists := killing.Has(string(runningPod.ID))
|
|
if !exists {
|
|
killing.Insert(string(runningPod.ID))
|
|
}
|
|
lock.Unlock()
|
|
|
|
if !exists {
|
|
go func(apiPod *v1.Pod, runningPod *kubecontainer.Pod) {
|
|
glog.V(2).Infof("Killing unwanted pod %q", runningPod.Name)
|
|
err := kl.killPod(apiPod, runningPod, nil, nil)
|
|
if err != nil {
|
|
glog.Errorf("Failed killing the pod %q: %v", runningPod.Name, err)
|
|
}
|
|
lock.Lock()
|
|
killing.Delete(string(runningPod.ID))
|
|
lock.Unlock()
|
|
}(apiPod, runningPod)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// hasHostPortConflicts detects pods with conflicted host ports.
|
|
func hasHostPortConflicts(pods []*v1.Pod) bool {
|
|
ports := sets.String{}
|
|
for _, pod := range pods {
|
|
if errs := validation.AccumulateUniqueHostPorts(pod.Spec.Containers, &ports, field.NewPath("spec", "containers")); len(errs) > 0 {
|
|
glog.Errorf("Pod %q: HostPort is already allocated, ignoring: %v", format.Pod(pod), errs)
|
|
return true
|
|
}
|
|
if errs := validation.AccumulateUniqueHostPorts(pod.Spec.InitContainers, &ports, field.NewPath("spec", "initContainers")); len(errs) > 0 {
|
|
glog.Errorf("Pod %q: HostPort is already allocated, ignoring: %v", format.Pod(pod), errs)
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// validateContainerLogStatus returns the container ID for the desired container to retrieve logs for, based on the state
|
|
// of the container. The previous flag will only return the logs for the last terminated container, otherwise, the current
|
|
// running container is preferred over a previous termination. If info about the container is not available then a specific
|
|
// error is returned to the end user.
|
|
func (kl *Kubelet) validateContainerLogStatus(podName string, podStatus *v1.PodStatus, containerName string, previous bool) (containerID kubecontainer.ContainerID, err error) {
|
|
var cID string
|
|
|
|
cStatus, found := podutil.GetContainerStatus(podStatus.ContainerStatuses, containerName)
|
|
// if not found, check the init containers
|
|
if !found {
|
|
cStatus, found = podutil.GetContainerStatus(podStatus.InitContainerStatuses, containerName)
|
|
}
|
|
if !found {
|
|
return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is not available", containerName, podName)
|
|
}
|
|
lastState := cStatus.LastTerminationState
|
|
waiting, running, terminated := cStatus.State.Waiting, cStatus.State.Running, cStatus.State.Terminated
|
|
|
|
switch {
|
|
case previous:
|
|
if lastState.Terminated == nil {
|
|
return kubecontainer.ContainerID{}, fmt.Errorf("previous terminated container %q in pod %q not found", containerName, podName)
|
|
}
|
|
cID = lastState.Terminated.ContainerID
|
|
|
|
case running != nil:
|
|
cID = cStatus.ContainerID
|
|
|
|
case terminated != nil:
|
|
cID = terminated.ContainerID
|
|
|
|
case lastState.Terminated != nil:
|
|
cID = lastState.Terminated.ContainerID
|
|
|
|
case waiting != nil:
|
|
// output some info for the most common pending failures
|
|
switch reason := waiting.Reason; reason {
|
|
case images.ErrImagePull.Error():
|
|
return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: image can't be pulled", containerName, podName)
|
|
case images.ErrImagePullBackOff.Error():
|
|
return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: trying and failing to pull image", containerName, podName)
|
|
default:
|
|
return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: %v", containerName, podName, reason)
|
|
}
|
|
default:
|
|
// unrecognized state
|
|
return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start - no logs yet", containerName, podName)
|
|
}
|
|
|
|
return kubecontainer.ParseContainerID(cID), nil
|
|
}
|
|
|
|
// GetKubeletContainerLogs returns logs from the container
|
|
// TODO: this method is returning logs of random container attempts, when it should be returning the most recent attempt
|
|
// or all of them.
|
|
func (kl *Kubelet) GetKubeletContainerLogs(podFullName, containerName string, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) error {
|
|
// Pod workers periodically write status to statusManager. If status is not
|
|
// cached there, something is wrong (or kubelet just restarted and hasn't
|
|
// caught up yet). Just assume the pod is not ready yet.
|
|
name, namespace, err := kubecontainer.ParsePodFullName(podFullName)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to parse pod full name %q: %v", podFullName, err)
|
|
}
|
|
|
|
pod, ok := kl.GetPodByName(namespace, name)
|
|
if !ok {
|
|
return fmt.Errorf("pod %q cannot be found - no logs available", name)
|
|
}
|
|
|
|
podUID := pod.UID
|
|
if mirrorPod, ok := kl.podManager.GetMirrorPodByPod(pod); ok {
|
|
podUID = mirrorPod.UID
|
|
}
|
|
podStatus, found := kl.statusManager.GetPodStatus(podUID)
|
|
if !found {
|
|
// If there is no cached status, use the status from the
|
|
// apiserver. This is useful if kubelet has recently been
|
|
// restarted.
|
|
podStatus = pod.Status
|
|
}
|
|
|
|
// TODO: Consolidate the logic here with kuberuntime.GetContainerLogs, here we convert container name to containerID,
|
|
// but inside kuberuntime we convert container id back to container name and restart count.
|
|
// TODO: After separate container log lifecycle management, we should get log based on the existing log files
|
|
// instead of container status.
|
|
containerID, err := kl.validateContainerLogStatus(pod.Name, &podStatus, containerName, logOptions.Previous)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Do a zero-byte write to stdout before handing off to the container runtime.
|
|
// This ensures at least one Write call is made to the writer when copying starts,
|
|
// even if we then block waiting for log output from the container.
|
|
if _, err := stdout.Write([]byte{}); err != nil {
|
|
return err
|
|
}
|
|
|
|
if kl.dockerLegacyService != nil {
|
|
// dockerLegacyService should only be non-nil when we actually need it, so
|
|
// inject it into the runtimeService.
|
|
// TODO(random-liu): Remove this hack after deprecating unsupported log driver.
|
|
return kl.dockerLegacyService.GetContainerLogs(pod, containerID, logOptions, stdout, stderr)
|
|
}
|
|
return kl.containerRuntime.GetContainerLogs(pod, containerID, logOptions, stdout, stderr)
|
|
}
|
|
|
|
// GetPhase returns the phase of a pod given its container info.
|
|
// This func is exported to simplify integration with 3rd party kubelet
|
|
// integrations like kubernetes-mesos.
|
|
func GetPhase(spec *v1.PodSpec, info []v1.ContainerStatus) v1.PodPhase {
|
|
initialized := 0
|
|
pendingInitialization := 0
|
|
failedInitialization := 0
|
|
for _, container := range spec.InitContainers {
|
|
containerStatus, ok := podutil.GetContainerStatus(info, container.Name)
|
|
if !ok {
|
|
pendingInitialization++
|
|
continue
|
|
}
|
|
|
|
switch {
|
|
case containerStatus.State.Running != nil:
|
|
pendingInitialization++
|
|
case containerStatus.State.Terminated != nil:
|
|
if containerStatus.State.Terminated.ExitCode == 0 {
|
|
initialized++
|
|
} else {
|
|
failedInitialization++
|
|
}
|
|
case containerStatus.State.Waiting != nil:
|
|
if containerStatus.LastTerminationState.Terminated != nil {
|
|
if containerStatus.LastTerminationState.Terminated.ExitCode == 0 {
|
|
initialized++
|
|
} else {
|
|
failedInitialization++
|
|
}
|
|
} else {
|
|
pendingInitialization++
|
|
}
|
|
default:
|
|
pendingInitialization++
|
|
}
|
|
}
|
|
|
|
unknown := 0
|
|
running := 0
|
|
waiting := 0
|
|
stopped := 0
|
|
failed := 0
|
|
succeeded := 0
|
|
for _, container := range spec.Containers {
|
|
containerStatus, ok := podutil.GetContainerStatus(info, container.Name)
|
|
if !ok {
|
|
unknown++
|
|
continue
|
|
}
|
|
|
|
switch {
|
|
case containerStatus.State.Running != nil:
|
|
running++
|
|
case containerStatus.State.Terminated != nil:
|
|
stopped++
|
|
if containerStatus.State.Terminated.ExitCode == 0 {
|
|
succeeded++
|
|
} else {
|
|
failed++
|
|
}
|
|
case containerStatus.State.Waiting != nil:
|
|
if containerStatus.LastTerminationState.Terminated != nil {
|
|
stopped++
|
|
} else {
|
|
waiting++
|
|
}
|
|
default:
|
|
unknown++
|
|
}
|
|
}
|
|
|
|
if failedInitialization > 0 && spec.RestartPolicy == v1.RestartPolicyNever {
|
|
return v1.PodFailed
|
|
}
|
|
|
|
switch {
|
|
case pendingInitialization > 0:
|
|
fallthrough
|
|
case waiting > 0:
|
|
glog.V(5).Infof("pod waiting > 0, pending")
|
|
// One or more containers has not been started
|
|
return v1.PodPending
|
|
case running > 0 && unknown == 0:
|
|
// All containers have been started, and at least
|
|
// one container is running
|
|
return v1.PodRunning
|
|
case running == 0 && stopped > 0 && unknown == 0:
|
|
// All containers are terminated
|
|
if spec.RestartPolicy == v1.RestartPolicyAlways {
|
|
// All containers are in the process of restarting
|
|
return v1.PodRunning
|
|
}
|
|
if stopped == succeeded {
|
|
// RestartPolicy is not Always, and all
|
|
// containers are terminated in success
|
|
return v1.PodSucceeded
|
|
}
|
|
if spec.RestartPolicy == v1.RestartPolicyNever {
|
|
// RestartPolicy is Never, and all containers are
|
|
// terminated with at least one in failure
|
|
return v1.PodFailed
|
|
}
|
|
// RestartPolicy is OnFailure, and at least one in failure
|
|
// and in the process of restarting
|
|
return v1.PodRunning
|
|
default:
|
|
glog.V(5).Infof("pod default case, pending")
|
|
return v1.PodPending
|
|
}
|
|
}
|
|
|
|
// generateAPIPodStatus creates the final API pod status for a pod, given the
|
|
// internal pod status.
|
|
func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodStatus {
|
|
glog.V(3).Infof("Generating status for %q", format.Pod(pod))
|
|
|
|
// check if an internal module has requested the pod is evicted.
|
|
for _, podSyncHandler := range kl.PodSyncHandlers {
|
|
if result := podSyncHandler.ShouldEvict(pod); result.Evict {
|
|
return v1.PodStatus{
|
|
Phase: v1.PodFailed,
|
|
Reason: result.Reason,
|
|
Message: result.Message,
|
|
}
|
|
}
|
|
}
|
|
|
|
s := kl.convertStatusToAPIStatus(pod, podStatus)
|
|
|
|
// Assume info is ready to process
|
|
spec := &pod.Spec
|
|
allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...)
|
|
s.Phase = GetPhase(spec, allStatus)
|
|
kl.probeManager.UpdatePodStatus(pod.UID, s)
|
|
s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(spec, s.InitContainerStatuses, s.Phase))
|
|
s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(spec, s.ContainerStatuses, s.Phase))
|
|
// s (the PodStatus we are creating) will not have a PodScheduled condition yet, because converStatusToAPIStatus()
|
|
// does not create one. If the existing PodStatus has a PodScheduled condition, then copy it into s and make sure
|
|
// it is set to true. If the existing PodStatus does not have a PodScheduled condition, then create one that is set to true.
|
|
if _, oldPodScheduled := podutil.GetPodCondition(&pod.Status, v1.PodScheduled); oldPodScheduled != nil {
|
|
s.Conditions = append(s.Conditions, *oldPodScheduled)
|
|
}
|
|
podutil.UpdatePodCondition(&pod.Status, &v1.PodCondition{
|
|
Type: v1.PodScheduled,
|
|
Status: v1.ConditionTrue,
|
|
})
|
|
|
|
if kl.kubeClient != nil {
|
|
hostIP, err := kl.getHostIPAnyWay()
|
|
if err != nil {
|
|
glog.V(4).Infof("Cannot get host IP: %v", err)
|
|
} else {
|
|
s.HostIP = hostIP.String()
|
|
if kubecontainer.IsHostNetworkPod(pod) && s.PodIP == "" {
|
|
s.PodIP = hostIP.String()
|
|
}
|
|
}
|
|
}
|
|
|
|
return *s
|
|
}
|
|
|
|
// convertStatusToAPIStatus creates an api PodStatus for the given pod from
|
|
// the given internal pod status. It is purely transformative and does not
|
|
// alter the kubelet state at all.
|
|
func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *v1.PodStatus {
|
|
var apiPodStatus v1.PodStatus
|
|
apiPodStatus.PodIP = podStatus.IP
|
|
// set status for Pods created on versions of kube older than 1.6
|
|
apiPodStatus.QOSClass = v1qos.GetPodQOS(pod)
|
|
|
|
oldPodStatus, found := kl.statusManager.GetPodStatus(pod.UID)
|
|
if !found {
|
|
oldPodStatus = pod.Status
|
|
}
|
|
|
|
apiPodStatus.ContainerStatuses = kl.convertToAPIContainerStatuses(
|
|
pod, podStatus,
|
|
oldPodStatus.ContainerStatuses,
|
|
pod.Spec.Containers,
|
|
len(pod.Spec.InitContainers) > 0,
|
|
false,
|
|
)
|
|
apiPodStatus.InitContainerStatuses = kl.convertToAPIContainerStatuses(
|
|
pod, podStatus,
|
|
oldPodStatus.InitContainerStatuses,
|
|
pod.Spec.InitContainers,
|
|
len(pod.Spec.InitContainers) > 0,
|
|
true,
|
|
)
|
|
|
|
return &apiPodStatus
|
|
}
|
|
|
|
// convertToAPIContainerStatuses converts the given internal container
|
|
// statuses into API container statuses.
|
|
func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecontainer.PodStatus, previousStatus []v1.ContainerStatus, containers []v1.Container, hasInitContainers, isInitContainer bool) []v1.ContainerStatus {
|
|
convertContainerStatus := func(cs *kubecontainer.ContainerStatus) *v1.ContainerStatus {
|
|
cid := cs.ID.String()
|
|
status := &v1.ContainerStatus{
|
|
Name: cs.Name,
|
|
RestartCount: int32(cs.RestartCount),
|
|
Image: cs.Image,
|
|
ImageID: cs.ImageID,
|
|
ContainerID: cid,
|
|
}
|
|
switch cs.State {
|
|
case kubecontainer.ContainerStateRunning:
|
|
status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)}
|
|
case kubecontainer.ContainerStateCreated:
|
|
// Treat containers in the "created" state as if they are exited.
|
|
// The pod workers are supposed start all containers it creates in
|
|
// one sync (syncPod) iteration. There should not be any normal
|
|
// "created" containers when the pod worker generates the status at
|
|
// the beginning of a sync iteration.
|
|
fallthrough
|
|
case kubecontainer.ContainerStateExited:
|
|
status.State.Terminated = &v1.ContainerStateTerminated{
|
|
ExitCode: int32(cs.ExitCode),
|
|
Reason: cs.Reason,
|
|
Message: cs.Message,
|
|
StartedAt: metav1.NewTime(cs.StartedAt),
|
|
FinishedAt: metav1.NewTime(cs.FinishedAt),
|
|
ContainerID: cid,
|
|
}
|
|
default:
|
|
status.State.Waiting = &v1.ContainerStateWaiting{}
|
|
}
|
|
return status
|
|
}
|
|
|
|
// Fetch old containers statuses from old pod status.
|
|
oldStatuses := make(map[string]v1.ContainerStatus, len(containers))
|
|
for _, status := range previousStatus {
|
|
oldStatuses[status.Name] = status
|
|
}
|
|
|
|
// Set all container statuses to default waiting state
|
|
statuses := make(map[string]*v1.ContainerStatus, len(containers))
|
|
defaultWaitingState := v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: "ContainerCreating"}}
|
|
if hasInitContainers {
|
|
defaultWaitingState = v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: "PodInitializing"}}
|
|
}
|
|
|
|
for _, container := range containers {
|
|
status := &v1.ContainerStatus{
|
|
Name: container.Name,
|
|
Image: container.Image,
|
|
State: defaultWaitingState,
|
|
}
|
|
oldStatus, found := oldStatuses[container.Name]
|
|
if found {
|
|
if oldStatus.State.Terminated != nil {
|
|
// Do not update status on terminated init containers as
|
|
// they be removed at any time.
|
|
status = &oldStatus
|
|
} else {
|
|
// Apply some values from the old statuses as the default values.
|
|
status.RestartCount = oldStatus.RestartCount
|
|
status.LastTerminationState = oldStatus.LastTerminationState
|
|
}
|
|
}
|
|
statuses[container.Name] = status
|
|
}
|
|
|
|
// Make the latest container status comes first.
|
|
sort.Sort(sort.Reverse(kubecontainer.SortContainerStatusesByCreationTime(podStatus.ContainerStatuses)))
|
|
// Set container statuses according to the statuses seen in pod status
|
|
containerSeen := map[string]int{}
|
|
for _, cStatus := range podStatus.ContainerStatuses {
|
|
cName := cStatus.Name
|
|
if _, ok := statuses[cName]; !ok {
|
|
// This would also ignore the infra container.
|
|
continue
|
|
}
|
|
if containerSeen[cName] >= 2 {
|
|
continue
|
|
}
|
|
status := convertContainerStatus(cStatus)
|
|
if containerSeen[cName] == 0 {
|
|
statuses[cName] = status
|
|
} else {
|
|
statuses[cName].LastTerminationState = status.State
|
|
}
|
|
containerSeen[cName] = containerSeen[cName] + 1
|
|
}
|
|
|
|
// Handle the containers failed to be started, which should be in Waiting state.
|
|
for _, container := range containers {
|
|
if isInitContainer {
|
|
// If the init container is terminated with exit code 0, it won't be restarted.
|
|
// TODO(random-liu): Handle this in a cleaner way.
|
|
s := podStatus.FindContainerStatusByName(container.Name)
|
|
if s != nil && s.State == kubecontainer.ContainerStateExited && s.ExitCode == 0 {
|
|
continue
|
|
}
|
|
}
|
|
// If a container should be restarted in next syncpod, it is *Waiting*.
|
|
if !kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
|
|
continue
|
|
}
|
|
status := statuses[container.Name]
|
|
reason, ok := kl.reasonCache.Get(pod.UID, container.Name)
|
|
if !ok {
|
|
// In fact, we could also apply Waiting state here, but it is less informative,
|
|
// and the container will be restarted soon, so we prefer the original state here.
|
|
// Note that with the current implementation of ShouldContainerBeRestarted the original state here
|
|
// could be:
|
|
// * Waiting: There is no associated historical container and start failure reason record.
|
|
// * Terminated: The container is terminated.
|
|
continue
|
|
}
|
|
if status.State.Terminated != nil {
|
|
status.LastTerminationState = status.State
|
|
}
|
|
status.State = v1.ContainerState{
|
|
Waiting: &v1.ContainerStateWaiting{
|
|
Reason: reason.Err.Error(),
|
|
Message: reason.Message,
|
|
},
|
|
}
|
|
statuses[container.Name] = status
|
|
}
|
|
|
|
var containerStatuses []v1.ContainerStatus
|
|
for _, status := range statuses {
|
|
containerStatuses = append(containerStatuses, *status)
|
|
}
|
|
|
|
// Sort the container statuses since clients of this interface expect the list
|
|
// of containers in a pod has a deterministic order.
|
|
if isInitContainer {
|
|
kubetypes.SortInitContainerStatuses(pod, containerStatuses)
|
|
} else {
|
|
sort.Sort(kubetypes.SortedContainerStatuses(containerStatuses))
|
|
}
|
|
return containerStatuses
|
|
}
|
|
|
|
// ServeLogs returns logs of current machine.
|
|
func (kl *Kubelet) ServeLogs(w http.ResponseWriter, req *http.Request) {
|
|
// TODO: whitelist logs we are willing to serve
|
|
kl.logServer.ServeHTTP(w, req)
|
|
}
|
|
|
|
// findContainer finds and returns the container with the given pod ID, full name, and container name.
|
|
// It returns nil if not found.
|
|
func (kl *Kubelet) findContainer(podFullName string, podUID types.UID, containerName string) (*kubecontainer.Container, error) {
|
|
pods, err := kl.containerRuntime.GetPods(false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// Resolve and type convert back again.
|
|
// We need the static pod UID but the kubecontainer API works with types.UID.
|
|
podUID = types.UID(kl.podManager.TranslatePodUID(podUID))
|
|
pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID)
|
|
return pod.FindContainerByName(containerName), nil
|
|
}
|
|
|
|
// RunInContainer runs a command in a container, returns the combined stdout, stderr as an array of bytes
|
|
func (kl *Kubelet) RunInContainer(podFullName string, podUID types.UID, containerName string, cmd []string) ([]byte, error) {
|
|
container, err := kl.findContainer(podFullName, podUID, containerName)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if container == nil {
|
|
return nil, fmt.Errorf("container not found (%q)", containerName)
|
|
}
|
|
// TODO(tallclair): Pass a proper timeout value.
|
|
return kl.runner.RunInContainer(container.ID, cmd, 0)
|
|
}
|
|
|
|
// ExecInContainer executes a command in a container, connecting the supplied
|
|
// stdin/stdout/stderr to the command's IO streams.
|
|
func (kl *Kubelet) ExecInContainer(podFullName string, podUID types.UID, containerName string, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error {
|
|
streamingRuntime, ok := kl.containerRuntime.(kubecontainer.DirectStreamingRuntime)
|
|
if !ok {
|
|
return fmt.Errorf("streaming methods not supported by runtime")
|
|
}
|
|
|
|
container, err := kl.findContainer(podFullName, podUID, containerName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if container == nil {
|
|
return fmt.Errorf("container not found (%q)", containerName)
|
|
}
|
|
return streamingRuntime.ExecInContainer(container.ID, cmd, stdin, stdout, stderr, tty, resize, timeout)
|
|
}
|
|
|
|
// AttachContainer uses the container runtime to attach the given streams to
|
|
// the given container.
|
|
func (kl *Kubelet) AttachContainer(podFullName string, podUID types.UID, containerName string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize) error {
|
|
streamingRuntime, ok := kl.containerRuntime.(kubecontainer.DirectStreamingRuntime)
|
|
if !ok {
|
|
return fmt.Errorf("streaming methods not supported by runtime")
|
|
}
|
|
|
|
container, err := kl.findContainer(podFullName, podUID, containerName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if container == nil {
|
|
return fmt.Errorf("container not found (%q)", containerName)
|
|
}
|
|
return streamingRuntime.AttachContainer(container.ID, stdin, stdout, stderr, tty, resize)
|
|
}
|
|
|
|
// PortForward connects to the pod's port and copies data between the port
|
|
// and the stream.
|
|
func (kl *Kubelet) PortForward(podFullName string, podUID types.UID, port int32, stream io.ReadWriteCloser) error {
|
|
streamingRuntime, ok := kl.containerRuntime.(kubecontainer.DirectStreamingRuntime)
|
|
if !ok {
|
|
return fmt.Errorf("streaming methods not supported by runtime")
|
|
}
|
|
|
|
pods, err := kl.containerRuntime.GetPods(false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Resolve and type convert back again.
|
|
// We need the static pod UID but the kubecontainer API works with types.UID.
|
|
podUID = types.UID(kl.podManager.TranslatePodUID(podUID))
|
|
pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID)
|
|
if pod.IsEmpty() {
|
|
return fmt.Errorf("pod not found (%q)", podFullName)
|
|
}
|
|
return streamingRuntime.PortForward(&pod, port, stream)
|
|
}
|
|
|
|
// GetExec gets the URL the exec will be served from, or nil if the Kubelet will serve it.
|
|
func (kl *Kubelet) GetExec(podFullName string, podUID types.UID, containerName string, cmd []string, streamOpts remotecommandserver.Options) (*url.URL, error) {
|
|
switch streamingRuntime := kl.containerRuntime.(type) {
|
|
case kubecontainer.DirectStreamingRuntime:
|
|
// Kubelet will serve the exec directly.
|
|
return nil, nil
|
|
case kubecontainer.IndirectStreamingRuntime:
|
|
container, err := kl.findContainer(podFullName, podUID, containerName)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if container == nil {
|
|
return nil, fmt.Errorf("container not found (%q)", containerName)
|
|
}
|
|
return streamingRuntime.GetExec(container.ID, cmd, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, streamOpts.TTY)
|
|
default:
|
|
return nil, fmt.Errorf("container runtime does not support exec")
|
|
}
|
|
}
|
|
|
|
// GetAttach gets the URL the attach will be served from, or nil if the Kubelet will serve it.
|
|
func (kl *Kubelet) GetAttach(podFullName string, podUID types.UID, containerName string, streamOpts remotecommandserver.Options) (*url.URL, error) {
|
|
switch streamingRuntime := kl.containerRuntime.(type) {
|
|
case kubecontainer.DirectStreamingRuntime:
|
|
// Kubelet will serve the attach directly.
|
|
return nil, nil
|
|
case kubecontainer.IndirectStreamingRuntime:
|
|
container, err := kl.findContainer(podFullName, podUID, containerName)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if container == nil {
|
|
return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName)
|
|
}
|
|
|
|
// The TTY setting for attach must match the TTY setting in the initial container configuration,
|
|
// since whether the process is running in a TTY cannot be changed after it has started. We
|
|
// need the api.Pod to get the TTY status.
|
|
pod, found := kl.GetPodByFullName(podFullName)
|
|
if !found || (string(podUID) != "" && pod.UID != podUID) {
|
|
return nil, fmt.Errorf("pod %s not found", podFullName)
|
|
}
|
|
containerSpec := kubecontainer.GetContainerSpec(pod, containerName)
|
|
if containerSpec == nil {
|
|
return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName)
|
|
}
|
|
tty := containerSpec.TTY
|
|
|
|
return streamingRuntime.GetAttach(container.ID, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, tty)
|
|
default:
|
|
return nil, fmt.Errorf("container runtime does not support attach")
|
|
}
|
|
}
|
|
|
|
// GetPortForward gets the URL the port-forward will be served from, or nil if the Kubelet will serve it.
|
|
func (kl *Kubelet) GetPortForward(podName, podNamespace string, podUID types.UID, portForwardOpts portforward.V4Options) (*url.URL, error) {
|
|
switch streamingRuntime := kl.containerRuntime.(type) {
|
|
case kubecontainer.DirectStreamingRuntime:
|
|
// Kubelet will serve the attach directly.
|
|
return nil, nil
|
|
case kubecontainer.IndirectStreamingRuntime:
|
|
pods, err := kl.containerRuntime.GetPods(false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// Resolve and type convert back again.
|
|
// We need the static pod UID but the kubecontainer API works with types.UID.
|
|
podUID = types.UID(kl.podManager.TranslatePodUID(podUID))
|
|
podFullName := kubecontainer.BuildPodFullName(podName, podNamespace)
|
|
pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID)
|
|
if pod.IsEmpty() {
|
|
return nil, fmt.Errorf("pod not found (%q)", podFullName)
|
|
}
|
|
|
|
return streamingRuntime.GetPortForward(podName, podNamespace, podUID, portForwardOpts.Ports)
|
|
default:
|
|
return nil, fmt.Errorf("container runtime does not support port-forward")
|
|
}
|
|
}
|
|
|
|
// cleanupOrphanedPodCgroups removes cgroups that should no longer exist.
|
|
// it reconciles the cached state of cgroupPods with the specified list of runningPods
|
|
func (kl *Kubelet) cleanupOrphanedPodCgroups(cgroupPods map[types.UID]cm.CgroupName, activePods []*v1.Pod) {
|
|
// Add all running pods to the set that we want to preserve
|
|
podSet := sets.NewString()
|
|
for _, pod := range activePods {
|
|
podSet.Insert(string(pod.UID))
|
|
}
|
|
pcm := kl.containerManager.NewPodContainerManager()
|
|
|
|
// Iterate over all the found pods to verify if they should be running
|
|
for uid, val := range cgroupPods {
|
|
// if the pod is in the running set, its not a candidate for cleanup
|
|
if podSet.Has(string(uid)) {
|
|
continue
|
|
}
|
|
|
|
// If volumes have not been unmounted/detached, do not delete the cgroup
|
|
// so any memory backed volumes don't have their charges propagated to the
|
|
// parent croup. If the volumes still exist, reduce the cpu shares for any
|
|
// process in the cgroup to the minimum value while we wait. if the kubelet
|
|
// is configured to keep terminated volumes, we will delete the cgroup and not block.
|
|
if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist && !kl.keepTerminatedPodVolumes {
|
|
glog.V(3).Infof("Orphaned pod %q found, but volumes not yet removed. Reducing cpu to minimum", uid)
|
|
if err := pcm.ReduceCPULimits(val); err != nil {
|
|
glog.Warningf("Failed to reduce cpu time for pod %q pending volume cleanup due to %v", uid, err)
|
|
}
|
|
continue
|
|
}
|
|
glog.V(3).Infof("Orphaned pod %q found, removing pod cgroups", uid)
|
|
// Destroy all cgroups of pod that should not be running,
|
|
// by first killing all the attached processes to these cgroups.
|
|
// We ignore errors thrown by the method, as the housekeeping loop would
|
|
// again try to delete these unwanted pod cgroups
|
|
go pcm.Destroy(val)
|
|
}
|
|
}
|
|
|
|
// enableHostUserNamespace determines if the host user namespace should be used by the container runtime.
|
|
// Returns true if the pod is using a host pid, pic, or network namespace, the pod is using a non-namespaced
|
|
// capability, the pod contains a privileged container, or the pod has a host path volume.
|
|
//
|
|
// NOTE: when if a container shares any namespace with another container it must also share the user namespace
|
|
// or it will not have the correct capabilities in the namespace. This means that host user namespace
|
|
// is enabled per pod, not per container.
|
|
func (kl *Kubelet) enableHostUserNamespace(pod *v1.Pod) bool {
|
|
if kubecontainer.HasPrivilegedContainer(pod) || hasHostNamespace(pod) ||
|
|
hasHostVolume(pod) || hasNonNamespacedCapability(pod) || kl.hasHostMountPVC(pod) {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// hasNonNamespacedCapability returns true if MKNOD, SYS_TIME, or SYS_MODULE is requested for any container.
|
|
func hasNonNamespacedCapability(pod *v1.Pod) bool {
|
|
for _, c := range pod.Spec.Containers {
|
|
if c.SecurityContext != nil && c.SecurityContext.Capabilities != nil {
|
|
for _, cap := range c.SecurityContext.Capabilities.Add {
|
|
if cap == "MKNOD" || cap == "SYS_TIME" || cap == "SYS_MODULE" {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// hasHostVolume returns true if the pod spec has a HostPath volume.
|
|
func hasHostVolume(pod *v1.Pod) bool {
|
|
for _, v := range pod.Spec.Volumes {
|
|
if v.HostPath != nil {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// hasHostNamespace returns true if hostIPC, hostNetwork, or hostPID are set to true.
|
|
func hasHostNamespace(pod *v1.Pod) bool {
|
|
if pod.Spec.SecurityContext == nil {
|
|
return false
|
|
}
|
|
return pod.Spec.HostIPC || pod.Spec.HostNetwork || pod.Spec.HostPID
|
|
}
|
|
|
|
// hasHostMountPVC returns true if a PVC is referencing a HostPath volume.
|
|
func (kl *Kubelet) hasHostMountPVC(pod *v1.Pod) bool {
|
|
for _, volume := range pod.Spec.Volumes {
|
|
if volume.PersistentVolumeClaim != nil {
|
|
pvc, err := kl.kubeClient.CoreV1().PersistentVolumeClaims(pod.Namespace).Get(volume.PersistentVolumeClaim.ClaimName, metav1.GetOptions{})
|
|
if err != nil {
|
|
glog.Warningf("unable to retrieve pvc %s:%s - %v", pod.Namespace, volume.PersistentVolumeClaim.ClaimName, err)
|
|
continue
|
|
}
|
|
if pvc != nil {
|
|
referencedVolume, err := kl.kubeClient.CoreV1().PersistentVolumes().Get(pvc.Spec.VolumeName, metav1.GetOptions{})
|
|
if err != nil {
|
|
glog.Warningf("unable to retrieve pv %s - %v", pvc.Spec.VolumeName, err)
|
|
continue
|
|
}
|
|
if referencedVolume != nil && referencedVolume.Spec.HostPath != nil {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|