/* Copyright 2016 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package kubelet import ( "bytes" "fmt" "io" "io/ioutil" "net/http" "net/url" "os" "path" "path/filepath" "runtime" "sort" "strings" "sync" "time" "github.com/golang/glog" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" utilvalidation "k8s.io/apimachinery/pkg/util/validation" utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/tools/remotecommand" podutil "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/pkg/api/v1/resource" podshelper "k8s.io/kubernetes/pkg/apis/core/pods" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/fieldpath" runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2" "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/envvars" "k8s.io/kubernetes/pkg/kubelet/eviction" "k8s.io/kubernetes/pkg/kubelet/images" "k8s.io/kubernetes/pkg/kubelet/server/portforward" remotecommandserver "k8s.io/kubernetes/pkg/kubelet/server/remotecommand" "k8s.io/kubernetes/pkg/kubelet/status" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util/format" utilfile "k8s.io/kubernetes/pkg/util/file" volumeutil "k8s.io/kubernetes/pkg/volume/util" "k8s.io/kubernetes/pkg/volume/util/volumepathhandler" volumevalidation "k8s.io/kubernetes/pkg/volume/validation" "k8s.io/kubernetes/third_party/forked/golang/expansion" ) // Get a list of pods that have data directories. func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) { podInfos, err := ioutil.ReadDir(kl.getPodsDir()) if err != nil { return nil, err } pods := []types.UID{} for i := range podInfos { if podInfos[i].IsDir() { pods = append(pods, types.UID(podInfos[i].Name())) } } return pods, nil } // GetActivePods returns non-terminal pods func (kl *Kubelet) GetActivePods() []*v1.Pod { allPods := kl.podManager.GetPods() activePods := kl.filterOutTerminatedPods(allPods) return activePods } // makeGPUDevices determines the devices for the given container. // Experimental. func (kl *Kubelet) makeGPUDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) { if container.Resources.Limits.NvidiaGPU().IsZero() { return nil, nil } nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container) if err != nil { return nil, err } var devices []kubecontainer.DeviceInfo for _, path := range nvidiaGPUPaths { // Devices have to be mapped one to one because of nvidia CUDA library requirements. devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: path, Permissions: "mrw"}) } return devices, nil } func makeAbsolutePath(goos, path string) string { if goos != "windows" { return "/" + path } // These are all for windows // If there is a colon, give up. if strings.Contains(path, ":") { return path } // If there is a slash, but no drive, add 'c:' if strings.HasPrefix(path, "/") || strings.HasPrefix(path, "\\") { return "c:" + path } // Otherwise, add 'c:\' return "c:\\" + path } // makeBlockVolumes maps the raw block devices specified in the path of the container // Experimental func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) { var devices []kubecontainer.DeviceInfo for _, device := range container.VolumeDevices { // check path is absolute if !filepath.IsAbs(device.DevicePath) { return nil, fmt.Errorf("error DevicePath `%s` must be an absolute path", device.DevicePath) } vol, ok := podVolumes[device.Name] if !ok || vol.BlockVolumeMapper == nil { glog.Errorf("Block volume cannot be satisfied for container %q, because the volume is missing or the volume mapper is nil: %+v", container.Name, device) return nil, fmt.Errorf("cannot find volume %q to pass into container %q", device.Name, container.Name) } // Get a symbolic link associated to a block device under pod device path dirPath, volName := vol.BlockVolumeMapper.GetPodDeviceMapPath() symlinkPath := path.Join(dirPath, volName) if islinkExist, checkErr := blkutil.IsSymlinkExist(symlinkPath); checkErr != nil { return nil, checkErr } else if islinkExist { // Check readOnly in PVCVolumeSource and set read only permission if it's true. permission := "mrw" if vol.ReadOnly { permission = "r" } glog.V(4).Infof("Device will be attached to container %q. Path on host: %v", container.Name, symlinkPath) devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: symlinkPath, PathInContainer: device.DevicePath, Permissions: permission}) } } return devices, nil } // makeMounts determines the mount points for the given container. func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain, podIP string, podVolumes kubecontainer.VolumeMap) ([]kubecontainer.Mount, error) { // Kubernetes only mounts on /etc/hosts if: // - container is not an infrastructure (pause) container // - container is not already mounting on /etc/hosts // - OS is not Windows // Kubernetes will not mount /etc/hosts if: // - when the Pod sandbox is being created, its IP is still unknown. Hence, PodIP will not have been set. mountEtcHostsFile := len(podIP) > 0 && runtime.GOOS != "windows" glog.V(3).Infof("container: %v/%v/%v podIP: %q creating hosts mount: %v", pod.Namespace, pod.Name, container.Name, podIP, mountEtcHostsFile) mounts := []kubecontainer.Mount{} for _, mount := range container.VolumeMounts { // do not mount /etc/hosts if container is already mounting on the path mountEtcHostsFile = mountEtcHostsFile && (mount.MountPath != etcHostsPath) vol, ok := podVolumes[mount.Name] if !ok || vol.Mounter == nil { glog.Errorf("Mount cannot be satisfied for container %q, because the volume is missing or the volume mounter is nil: %+v", container.Name, mount) return nil, fmt.Errorf("cannot find volume %q to mount into container %q", mount.Name, container.Name) } relabelVolume := false // If the volume supports SELinux and it has not been // relabeled already and it is not a read-only volume, // relabel it and mark it as labeled if vol.Mounter.GetAttributes().Managed && vol.Mounter.GetAttributes().SupportsSELinux && !vol.SELinuxLabeled { vol.SELinuxLabeled = true relabelVolume = true } hostPath, err := volumeutil.GetPath(vol.Mounter) if err != nil { return nil, err } if mount.SubPath != "" { if filepath.IsAbs(mount.SubPath) { return nil, fmt.Errorf("error SubPath `%s` must not be an absolute path", mount.SubPath) } err = volumevalidation.ValidatePathNoBacksteps(mount.SubPath) if err != nil { return nil, fmt.Errorf("unable to provision SubPath `%s`: %v", mount.SubPath, err) } fileinfo, err := os.Lstat(hostPath) if err != nil { return nil, err } perm := fileinfo.Mode() hostPath = filepath.Join(hostPath, mount.SubPath) if subPathExists, err := utilfile.FileOrSymlinkExists(hostPath); err != nil { glog.Errorf("Could not determine if subPath %s exists; will not attempt to change its permissions", hostPath) } else if !subPathExists { // Create the sub path now because if it's auto-created later when referenced, it may have an // incorrect ownership and mode. For example, the sub path directory must have at least g+rwx // when the pod specifies an fsGroup, and if the directory is not created here, Docker will // later auto-create it with the incorrect mode 0750 if err := os.MkdirAll(hostPath, perm); err != nil { glog.Errorf("failed to mkdir:%s", hostPath) return nil, err } // chmod the sub path because umask may have prevented us from making the sub path with the same // permissions as the mounter path if err := os.Chmod(hostPath, perm); err != nil { return nil, err } } } // Docker Volume Mounts fail on Windows if it is not of the form C:/ containerPath := mount.MountPath if runtime.GOOS == "windows" { if (strings.HasPrefix(hostPath, "/") || strings.HasPrefix(hostPath, "\\")) && !strings.Contains(hostPath, ":") { hostPath = "c:" + hostPath } } if !filepath.IsAbs(containerPath) { containerPath = makeAbsolutePath(runtime.GOOS, containerPath) } propagation, err := translateMountPropagation(mount.MountPropagation) if err != nil { return nil, err } glog.V(5).Infof("Pod %q container %q mount %q has propagation %q", format.Pod(pod), container.Name, mount.Name, propagation) mustMountRO := vol.Mounter.GetAttributes().ReadOnly && utilfeature.DefaultFeatureGate.Enabled(features.ReadOnlyAPIDataVolumes) mounts = append(mounts, kubecontainer.Mount{ Name: mount.Name, ContainerPath: containerPath, HostPath: hostPath, ReadOnly: mount.ReadOnly || mustMountRO, SELinuxRelabel: relabelVolume, Propagation: propagation, }) } if mountEtcHostsFile { hostAliases := pod.Spec.HostAliases hostsMount, err := makeHostsMount(podDir, podIP, hostName, hostDomain, hostAliases, pod.Spec.HostNetwork) if err != nil { return nil, err } mounts = append(mounts, *hostsMount) } return mounts, nil } // translateMountPropagation transforms v1.MountPropagationMode to // runtimeapi.MountPropagation. func translateMountPropagation(mountMode *v1.MountPropagationMode) (runtimeapi.MountPropagation, error) { if runtime.GOOS == "windows" { // Windows containers doesn't support mount propagation, use private for it. // Refer https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation. return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil } if !utilfeature.DefaultFeatureGate.Enabled(features.MountPropagation) { // mount propagation is disabled, use private as in the old versions return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil } switch { case mountMode == nil: // HostToContainer is the default return runtimeapi.MountPropagation_PROPAGATION_HOST_TO_CONTAINER, nil case *mountMode == v1.MountPropagationHostToContainer: return runtimeapi.MountPropagation_PROPAGATION_HOST_TO_CONTAINER, nil case *mountMode == v1.MountPropagationBidirectional: return runtimeapi.MountPropagation_PROPAGATION_BIDIRECTIONAL, nil default: return 0, fmt.Errorf("invalid MountPropagation mode: %q", mountMode) } } // makeHostsMount makes the mountpoint for the hosts file that the containers // in a pod are injected with. func makeHostsMount(podDir, podIP, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) (*kubecontainer.Mount, error) { hostsFilePath := path.Join(podDir, "etc-hosts") if err := ensureHostsFile(hostsFilePath, podIP, hostName, hostDomainName, hostAliases, useHostNetwork); err != nil { return nil, err } return &kubecontainer.Mount{ Name: "k8s-managed-etc-hosts", ContainerPath: etcHostsPath, HostPath: hostsFilePath, ReadOnly: false, SELinuxRelabel: true, }, nil } // ensureHostsFile ensures that the given host file has an up-to-date ip, host // name, and domain name. func ensureHostsFile(fileName, hostIP, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) error { var hostsFileContent []byte var err error if useHostNetwork { // if Pod is using host network, read hosts file from the node's filesystem. // `etcHostsPath` references the location of the hosts file on the node. // `/etc/hosts` for *nix systems. hostsFileContent, err = nodeHostsFileContent(etcHostsPath, hostAliases) if err != nil { return err } } else { // if Pod is not using host network, create a managed hosts file with Pod IP and other information. hostsFileContent = managedHostsFileContent(hostIP, hostName, hostDomainName, hostAliases) } return ioutil.WriteFile(fileName, hostsFileContent, 0644) } // nodeHostsFileContent reads the content of node's hosts file. func nodeHostsFileContent(hostsFilePath string, hostAliases []v1.HostAlias) ([]byte, error) { hostsFileContent, err := ioutil.ReadFile(hostsFilePath) if err != nil { return nil, err } hostsFileContent = append(hostsFileContent, hostsEntriesFromHostAliases(hostAliases)...) return hostsFileContent, nil } // managedHostsFileContent generates the content of the managed etc hosts based on Pod IP and other // information. func managedHostsFileContent(hostIP, hostName, hostDomainName string, hostAliases []v1.HostAlias) []byte { var buffer bytes.Buffer buffer.WriteString("# Kubernetes-managed hosts file.\n") buffer.WriteString("127.0.0.1\tlocalhost\n") // ipv4 localhost buffer.WriteString("::1\tlocalhost ip6-localhost ip6-loopback\n") // ipv6 localhost buffer.WriteString("fe00::0\tip6-localnet\n") buffer.WriteString("fe00::0\tip6-mcastprefix\n") buffer.WriteString("fe00::1\tip6-allnodes\n") buffer.WriteString("fe00::2\tip6-allrouters\n") if len(hostDomainName) > 0 { buffer.WriteString(fmt.Sprintf("%s\t%s.%s\t%s\n", hostIP, hostName, hostDomainName, hostName)) } else { buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostIP, hostName)) } hostsFileContent := buffer.Bytes() hostsFileContent = append(hostsFileContent, hostsEntriesFromHostAliases(hostAliases)...) return hostsFileContent } func hostsEntriesFromHostAliases(hostAliases []v1.HostAlias) []byte { if len(hostAliases) == 0 { return []byte{} } var buffer bytes.Buffer buffer.WriteString("\n") buffer.WriteString("# Entries added by HostAliases.\n") // write each IP/hostname pair as an entry into hosts file for _, hostAlias := range hostAliases { for _, hostname := range hostAlias.Hostnames { buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostAlias.IP, hostname)) } } return buffer.Bytes() } // truncatePodHostnameIfNeeded truncates the pod hostname if it's longer than 63 chars. func truncatePodHostnameIfNeeded(podName, hostname string) (string, error) { // Cap hostname at 63 chars (specification is 64bytes which is 63 chars and the null terminating char). const hostnameMaxLen = 63 if len(hostname) <= hostnameMaxLen { return hostname, nil } truncated := hostname[:hostnameMaxLen] glog.Errorf("hostname for pod:%q was longer than %d. Truncated hostname to :%q", podName, hostnameMaxLen, truncated) // hostname should not end with '-' or '.' truncated = strings.TrimRight(truncated, "-.") if len(truncated) == 0 { // This should never happen. return "", fmt.Errorf("hostname for pod %q was invalid: %q", podName, hostname) } return truncated, nil } // GeneratePodHostNameAndDomain creates a hostname and domain name for a pod, // given that pod's spec and annotations or returns an error. func (kl *Kubelet) GeneratePodHostNameAndDomain(pod *v1.Pod) (string, string, error) { // TODO(vmarmol): Handle better. clusterDomain := kl.dnsConfigurer.ClusterDomain hostname := pod.Name if len(pod.Spec.Hostname) > 0 { if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Hostname); len(msgs) != 0 { return "", "", fmt.Errorf("Pod Hostname %q is not a valid DNS label: %s", pod.Spec.Hostname, strings.Join(msgs, ";")) } hostname = pod.Spec.Hostname } hostname, err := truncatePodHostnameIfNeeded(pod.Name, hostname) if err != nil { return "", "", err } hostDomain := "" if len(pod.Spec.Subdomain) > 0 { if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Subdomain); len(msgs) != 0 { return "", "", fmt.Errorf("Pod Subdomain %q is not a valid DNS label: %s", pod.Spec.Subdomain, strings.Join(msgs, ";")) } hostDomain = fmt.Sprintf("%s.%s.svc.%s", pod.Spec.Subdomain, pod.Namespace, clusterDomain) } return hostname, hostDomain, nil } // GetPodCgroupParent gets pod cgroup parent from container manager. func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string { pcm := kl.containerManager.NewPodContainerManager() _, cgroupParent := pcm.GetPodContainerName(pod) return cgroupParent } // GenerateRunContainerOptions generates the RunContainerOptions, which can be used by // the container runtime to set parameters for launching a container. func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Container, podIP string) (*kubecontainer.RunContainerOptions, error) { opts, err := kl.containerManager.GetResources(pod, container) if err != nil { return nil, err } cgroupParent := kl.GetPodCgroupParent(pod) opts.CgroupParent = cgroupParent hostname, hostDomainName, err := kl.GeneratePodHostNameAndDomain(pod) if err != nil { return nil, err } opts.Hostname = hostname podName := volumeutil.GetUniquePodName(pod) volumes := kl.volumeManager.GetMountedVolumesForPod(podName) opts.PortMappings = kubecontainer.MakePortMappings(container) // TODO(random-liu): Move following convert functions into pkg/kubelet/container devices, err := kl.makeGPUDevices(pod, container) if err != nil { return nil, err } opts.Devices = append(opts.Devices, devices...) // TODO: remove feature gate check after no longer needed if utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) { blkutil := volumepathhandler.NewBlockVolumePathHandler() blkVolumes, err := kl.makeBlockVolumes(pod, container, volumes, blkutil) if err != nil { return nil, err } opts.Devices = append(opts.Devices, blkVolumes...) } mounts, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes) if err != nil { return nil, err } opts.Mounts = append(opts.Mounts, mounts...) envs, err := kl.makeEnvironmentVariables(pod, container, podIP) if err != nil { return nil, err } opts.Envs = append(opts.Envs, envs...) // Disabling adding TerminationMessagePath on Windows as these files would be mounted as docker volume and // Docker for Windows has a bug where only directories can be mounted if len(container.TerminationMessagePath) != 0 && runtime.GOOS != "windows" { p := kl.getPodContainerDir(pod.UID, container.Name) if err := os.MkdirAll(p, 0750); err != nil { glog.Errorf("Error on creating %q: %v", p, err) } else { opts.PodContainerDir = p } } // only do this check if the experimental behavior is enabled, otherwise allow it to default to false if kl.experimentalHostUserNamespaceDefaulting { opts.EnableHostUserNamespace = kl.enableHostUserNamespace(pod) } return opts, nil } var masterServices = sets.NewString("kubernetes") // getServiceEnvVarMap makes a map[string]string of env vars for services a // pod in namespace ns should see. func (kl *Kubelet) getServiceEnvVarMap(ns string) (map[string]string, error) { var ( serviceMap = make(map[string]*v1.Service) m = make(map[string]string) ) // Get all service resources from the master (via a cache), // and populate them into service environment variables. if kl.serviceLister == nil { // Kubelets without masters (e.g. plain GCE ContainerVM) don't set env vars. return m, nil } services, err := kl.serviceLister.List(labels.Everything()) if err != nil { return m, fmt.Errorf("failed to list services when setting up env vars") } // project the services in namespace ns onto the master services for i := range services { service := services[i] // ignore services where ClusterIP is "None" or empty if !v1helper.IsServiceIPSet(service) { continue } serviceName := service.Name switch service.Namespace { // for the case whether the master service namespace is the namespace the pod // is in, the pod should receive all the services in the namespace. // // ordering of the case clauses below enforces this case ns: serviceMap[serviceName] = service case kl.masterServiceNamespace: if masterServices.Has(serviceName) { if _, exists := serviceMap[serviceName]; !exists { serviceMap[serviceName] = service } } } } mappedServices := []*v1.Service{} for key := range serviceMap { mappedServices = append(mappedServices, serviceMap[key]) } for _, e := range envvars.FromServices(mappedServices) { m[e.Name] = e.Value } return m, nil } // Make the environment variables for a pod in the given namespace. func (kl *Kubelet) makeEnvironmentVariables(pod *v1.Pod, container *v1.Container, podIP string) ([]kubecontainer.EnvVar, error) { var result []kubecontainer.EnvVar // Note: These are added to the docker Config, but are not included in the checksum computed // by kubecontainer.HashContainer(...). That way, we can still determine whether an // v1.Container is already running by its hash. (We don't want to restart a container just // because some service changed.) // // Note that there is a race between Kubelet seeing the pod and kubelet seeing the service. // To avoid this users can: (1) wait between starting a service and starting; or (2) detect // missing service env var and exit and be restarted; or (3) use DNS instead of env vars // and keep trying to resolve the DNS name of the service (recommended). serviceEnv, err := kl.getServiceEnvVarMap(pod.Namespace) if err != nil { return result, err } var ( configMaps = make(map[string]*v1.ConfigMap) secrets = make(map[string]*v1.Secret) tmpEnv = make(map[string]string) ) // Env will override EnvFrom variables. // Process EnvFrom first then allow Env to replace existing values. for _, envFrom := range container.EnvFrom { switch { case envFrom.ConfigMapRef != nil: cm := envFrom.ConfigMapRef name := cm.Name configMap, ok := configMaps[name] if !ok { if kl.kubeClient == nil { return result, fmt.Errorf("Couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name) } optional := cm.Optional != nil && *cm.Optional configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name) if err != nil { if errors.IsNotFound(err) && optional { // ignore error when marked optional continue } return result, err } configMaps[name] = configMap } invalidKeys := []string{} for k, v := range configMap.Data { if len(envFrom.Prefix) > 0 { k = envFrom.Prefix + k } if errMsgs := utilvalidation.IsEnvVarName(k); len(errMsgs) != 0 { invalidKeys = append(invalidKeys, k) continue } tmpEnv[k] = v } if len(invalidKeys) > 0 { sort.Strings(invalidKeys) kl.recorder.Eventf(pod, v1.EventTypeWarning, "InvalidEnvironmentVariableNames", "Keys [%s] from the EnvFrom configMap %s/%s were skipped since they are considered invalid environment variable names.", strings.Join(invalidKeys, ", "), pod.Namespace, name) } case envFrom.SecretRef != nil: s := envFrom.SecretRef name := s.Name secret, ok := secrets[name] if !ok { if kl.kubeClient == nil { return result, fmt.Errorf("Couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name) } optional := s.Optional != nil && *s.Optional secret, err = kl.secretManager.GetSecret(pod.Namespace, name) if err != nil { if errors.IsNotFound(err) && optional { // ignore error when marked optional continue } return result, err } secrets[name] = secret } invalidKeys := []string{} for k, v := range secret.Data { if len(envFrom.Prefix) > 0 { k = envFrom.Prefix + k } if errMsgs := utilvalidation.IsEnvVarName(k); len(errMsgs) != 0 { invalidKeys = append(invalidKeys, k) continue } tmpEnv[k] = string(v) } if len(invalidKeys) > 0 { sort.Strings(invalidKeys) kl.recorder.Eventf(pod, v1.EventTypeWarning, "InvalidEnvironmentVariableNames", "Keys [%s] from the EnvFrom secret %s/%s were skipped since they are considered invalid environment variable names.", strings.Join(invalidKeys, ", "), pod.Namespace, name) } } } // Determine the final values of variables: // // 1. Determine the final value of each variable: // a. If the variable's Value is set, expand the `$(var)` references to other // variables in the .Value field; the sources of variables are the declared // variables of the container and the service environment variables // b. If a source is defined for an environment variable, resolve the source // 2. Create the container's environment in the order variables are declared // 3. Add remaining service environment vars var ( mappingFunc = expansion.MappingFuncFor(tmpEnv, serviceEnv) ) for _, envVar := range container.Env { runtimeVal := envVar.Value if runtimeVal != "" { // Step 1a: expand variable references runtimeVal = expansion.Expand(runtimeVal, mappingFunc) } else if envVar.ValueFrom != nil { // Step 1b: resolve alternate env var sources switch { case envVar.ValueFrom.FieldRef != nil: runtimeVal, err = kl.podFieldSelectorRuntimeValue(envVar.ValueFrom.FieldRef, pod, podIP) if err != nil { return result, err } case envVar.ValueFrom.ResourceFieldRef != nil: defaultedPod, defaultedContainer, err := kl.defaultPodLimitsForDownwardAPI(pod, container) if err != nil { return result, err } runtimeVal, err = containerResourceRuntimeValue(envVar.ValueFrom.ResourceFieldRef, defaultedPod, defaultedContainer) if err != nil { return result, err } case envVar.ValueFrom.ConfigMapKeyRef != nil: cm := envVar.ValueFrom.ConfigMapKeyRef name := cm.Name key := cm.Key optional := cm.Optional != nil && *cm.Optional configMap, ok := configMaps[name] if !ok { if kl.kubeClient == nil { return result, fmt.Errorf("Couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name) } configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name) if err != nil { if errors.IsNotFound(err) && optional { // ignore error when marked optional continue } return result, err } configMaps[name] = configMap } runtimeVal, ok = configMap.Data[key] if !ok { if optional { continue } return result, fmt.Errorf("Couldn't find key %v in ConfigMap %v/%v", key, pod.Namespace, name) } case envVar.ValueFrom.SecretKeyRef != nil: s := envVar.ValueFrom.SecretKeyRef name := s.Name key := s.Key optional := s.Optional != nil && *s.Optional secret, ok := secrets[name] if !ok { if kl.kubeClient == nil { return result, fmt.Errorf("Couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name) } secret, err = kl.secretManager.GetSecret(pod.Namespace, name) if err != nil { if errors.IsNotFound(err) && optional { // ignore error when marked optional continue } return result, err } secrets[name] = secret } runtimeValBytes, ok := secret.Data[key] if !ok { if optional { continue } return result, fmt.Errorf("Couldn't find key %v in Secret %v/%v", key, pod.Namespace, name) } runtimeVal = string(runtimeValBytes) } } // Accesses apiserver+Pods. // So, the master may set service env vars, or kubelet may. In case both are doing // it, we delete the key from the kubelet-generated ones so we don't have duplicate // env vars. // TODO: remove this next line once all platforms use apiserver+Pods. delete(serviceEnv, envVar.Name) tmpEnv[envVar.Name] = runtimeVal } // Append the env vars for k, v := range tmpEnv { result = append(result, kubecontainer.EnvVar{Name: k, Value: v}) } // Append remaining service env vars. for k, v := range serviceEnv { // Accesses apiserver+Pods. // So, the master may set service env vars, or kubelet may. In case both are doing // it, we skip the key from the kubelet-generated ones so we don't have duplicate // env vars. // TODO: remove this next line once all platforms use apiserver+Pods. if _, present := tmpEnv[k]; !present { result = append(result, kubecontainer.EnvVar{Name: k, Value: v}) } } return result, nil } // podFieldSelectorRuntimeValue returns the runtime value of the given // selector for a pod. func (kl *Kubelet) podFieldSelectorRuntimeValue(fs *v1.ObjectFieldSelector, pod *v1.Pod, podIP string) (string, error) { internalFieldPath, _, err := podshelper.ConvertDownwardAPIFieldLabel(fs.APIVersion, fs.FieldPath, "") if err != nil { return "", err } switch internalFieldPath { case "spec.nodeName": return pod.Spec.NodeName, nil case "spec.serviceAccountName": return pod.Spec.ServiceAccountName, nil case "status.hostIP": hostIP, err := kl.getHostIPAnyWay() if err != nil { return "", err } return hostIP.String(), nil case "status.podIP": return podIP, nil } return fieldpath.ExtractFieldPathAsString(pod, internalFieldPath) } // containerResourceRuntimeValue returns the value of the provided container resource func containerResourceRuntimeValue(fs *v1.ResourceFieldSelector, pod *v1.Pod, container *v1.Container) (string, error) { containerName := fs.ContainerName if len(containerName) == 0 { return resource.ExtractContainerResourceValue(fs, container) } return resource.ExtractResourceValueByContainerName(fs, pod, containerName) } // One of the following arguments must be non-nil: runningPod, status. // TODO: Modify containerRuntime.KillPod() to accept the right arguments. func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus, gracePeriodOverride *int64) error { var p kubecontainer.Pod if runningPod != nil { p = *runningPod } else if status != nil { p = kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), status) } else { return fmt.Errorf("one of the two arguments must be non-nil: runningPod, status") } // Call the container runtime KillPod method which stops all running containers of the pod if err := kl.containerRuntime.KillPod(pod, p, gracePeriodOverride); err != nil { return err } if err := kl.containerManager.UpdateQOSCgroups(); err != nil { glog.V(2).Infof("Failed to update QoS cgroups while killing pod: %v", err) } return nil } // makePodDataDirs creates the dirs for the pod datas. func (kl *Kubelet) makePodDataDirs(pod *v1.Pod) error { uid := pod.UID if err := os.MkdirAll(kl.getPodDir(uid), 0750); err != nil && !os.IsExist(err) { return err } if err := os.MkdirAll(kl.getPodVolumesDir(uid), 0750); err != nil && !os.IsExist(err) { return err } if err := os.MkdirAll(kl.getPodPluginsDir(uid), 0750); err != nil && !os.IsExist(err) { return err } return nil } // getPullSecretsForPod inspects the Pod and retrieves the referenced pull // secrets. func (kl *Kubelet) getPullSecretsForPod(pod *v1.Pod) []v1.Secret { pullSecrets := []v1.Secret{} for _, secretRef := range pod.Spec.ImagePullSecrets { secret, err := kl.secretManager.GetSecret(pod.Namespace, secretRef.Name) if err != nil { glog.Warningf("Unable to retrieve pull secret %s/%s for %s/%s due to %v. The image pull may not succeed.", pod.Namespace, secretRef.Name, pod.Namespace, pod.Name, err) continue } pullSecrets = append(pullSecrets, *secret) } return pullSecrets } // podIsTerminated returns true if pod is in the terminated state ("Failed" or "Succeeded"). func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool { // Check the cached pod status which was set after the last sync. status, ok := kl.statusManager.GetPodStatus(pod.UID) if !ok { // If there is no cached status, use the status from the // apiserver. This is useful if kubelet has recently been // restarted. status = pod.Status } return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses)) } // IsPodTerminated returns trus if the pod with the provided UID is in a terminated state ("Failed" or "Succeeded") // or if the pod has been deleted or removed func (kl *Kubelet) IsPodTerminated(uid types.UID) bool { pod, podFound := kl.podManager.GetPodByUID(uid) if !podFound { return true } return kl.podIsTerminated(pod) } // IsPodDeleted returns true if the pod is deleted. For the pod to be deleted, either: // 1. The pod object is deleted // 2. The pod's status is evicted // 3. The pod's deletion timestamp is set, and containers are not running func (kl *Kubelet) IsPodDeleted(uid types.UID) bool { pod, podFound := kl.podManager.GetPodByUID(uid) if !podFound { return true } status, statusFound := kl.statusManager.GetPodStatus(pod.UID) if !statusFound { status = pod.Status } return eviction.PodIsEvicted(status) || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses)) } // PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have // been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server. func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool { if !notRunning(status.ContainerStatuses) { // We shouldnt delete pods that still have running containers glog.V(3).Infof("Pod %q is terminated, but some containers are still running", format.Pod(pod)) return false } // pod's containers should be deleted runtimeStatus, err := kl.podCache.Get(pod.UID) if err != nil { glog.V(3).Infof("Pod %q is terminated, Error getting runtimeStatus from the podCache: %s", format.Pod(pod), err) return false } if len(runtimeStatus.ContainerStatuses) > 0 { glog.V(3).Infof("Pod %q is terminated, but some containers have not been cleaned up: %+v", format.Pod(pod), runtimeStatus.ContainerStatuses) return false } if kl.podVolumesExist(pod.UID) && !kl.keepTerminatedPodVolumes { // We shouldnt delete pods whose volumes have not been cleaned up if we are not keeping terminated pod volumes glog.V(3).Infof("Pod %q is terminated, but some volumes have not been cleaned up", format.Pod(pod)) return false } if kl.kubeletConfiguration.CgroupsPerQOS { pcm := kl.containerManager.NewPodContainerManager() if pcm.Exists(pod) { glog.V(3).Infof("Pod %q is terminated, but pod cgroup sandbox has not been cleaned up", format.Pod(pod)) return false } } return true } // podResourcesAreReclaimed simply calls PodResourcesAreReclaimed with the most up-to-date status. func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool { status, ok := kl.statusManager.GetPodStatus(pod.UID) if !ok { status = pod.Status } return kl.PodResourcesAreReclaimed(pod, status) } // notRunning returns true if every status is terminated or waiting, or the status list // is empty. func notRunning(statuses []v1.ContainerStatus) bool { for _, status := range statuses { if status.State.Terminated == nil && status.State.Waiting == nil { return false } } return true } // filterOutTerminatedPods returns the given pods which the status manager // does not consider failed or succeeded. func (kl *Kubelet) filterOutTerminatedPods(pods []*v1.Pod) []*v1.Pod { var filteredPods []*v1.Pod for _, p := range pods { if kl.podIsTerminated(p) { continue } filteredPods = append(filteredPods, p) } return filteredPods } // removeOrphanedPodStatuses removes obsolete entries in podStatus where // the pod is no longer considered bound to this node. func (kl *Kubelet) removeOrphanedPodStatuses(pods []*v1.Pod, mirrorPods []*v1.Pod) { podUIDs := make(map[types.UID]bool) for _, pod := range pods { podUIDs[pod.UID] = true } for _, pod := range mirrorPods { podUIDs[pod.UID] = true } kl.statusManager.RemoveOrphanedStatuses(podUIDs) } // HandlePodCleanups performs a series of cleanup work, including terminating // pod workers, killing unwanted pods, and removing orphaned volumes/pod // directories. // NOTE: This function is executed by the main sync loop, so it // should not contain any blocking calls. func (kl *Kubelet) HandlePodCleanups() error { // The kubelet lacks checkpointing, so we need to introspect the set of pods // in the cgroup tree prior to inspecting the set of pods in our pod manager. // this ensures our view of the cgroup tree does not mistakenly observe pods // that are added after the fact... var ( cgroupPods map[types.UID]cm.CgroupName err error ) if kl.cgroupsPerQOS { pcm := kl.containerManager.NewPodContainerManager() cgroupPods, err = pcm.GetAllPodsFromCgroups() if err != nil { return fmt.Errorf("failed to get list of pods that still exist on cgroup mounts: %v", err) } } allPods, mirrorPods := kl.podManager.GetPodsAndMirrorPods() // Pod phase progresses monotonically. Once a pod has reached a final state, // it should never leave regardless of the restart policy. The statuses // of such pods should not be changed, and there is no need to sync them. // TODO: the logic here does not handle two cases: // 1. If the containers were removed immediately after they died, kubelet // may fail to generate correct statuses, let alone filtering correctly. // 2. If kubelet restarted before writing the terminated status for a pod // to the apiserver, it could still restart the terminated pod (even // though the pod was not considered terminated by the apiserver). // These two conditions could be alleviated by checkpointing kubelet. activePods := kl.filterOutTerminatedPods(allPods) desiredPods := make(map[types.UID]empty) for _, pod := range activePods { desiredPods[pod.UID] = empty{} } // Stop the workers for no-longer existing pods. // TODO: is here the best place to forget pod workers? kl.podWorkers.ForgetNonExistingPodWorkers(desiredPods) kl.probeManager.CleanupPods(activePods) runningPods, err := kl.runtimeCache.GetPods() if err != nil { glog.Errorf("Error listing containers: %#v", err) return err } for _, pod := range runningPods { if _, found := desiredPods[pod.ID]; !found { kl.podKillingCh <- &kubecontainer.PodPair{APIPod: nil, RunningPod: pod} } } kl.removeOrphanedPodStatuses(allPods, mirrorPods) // Note that we just killed the unwanted pods. This may not have reflected // in the cache. We need to bypass the cache to get the latest set of // running pods to clean up the volumes. // TODO: Evaluate the performance impact of bypassing the runtime cache. runningPods, err = kl.containerRuntime.GetPods(false) if err != nil { glog.Errorf("Error listing containers: %#v", err) return err } // Remove any orphaned volumes. // Note that we pass all pods (including terminated pods) to the function, // so that we don't remove volumes associated with terminated but not yet // deleted pods. err = kl.cleanupOrphanedPodDirs(allPods, runningPods) if err != nil { // We want all cleanup tasks to be run even if one of them failed. So // we just log an error here and continue other cleanup tasks. // This also applies to the other clean up tasks. glog.Errorf("Failed cleaning up orphaned pod directories: %v", err) } // Remove any orphaned mirror pods. kl.podManager.DeleteOrphanedMirrorPods() // Remove any cgroups in the hierarchy for pods that are no longer running. if kl.cgroupsPerQOS { kl.cleanupOrphanedPodCgroups(cgroupPods, activePods) } kl.backOff.GC() return nil } // podKiller launches a goroutine to kill a pod received from the channel if // another goroutine isn't already in action. func (kl *Kubelet) podKiller() { killing := sets.NewString() // guard for the killing set lock := sync.Mutex{} for { select { case podPair, ok := <-kl.podKillingCh: if !ok { return } runningPod := podPair.RunningPod apiPod := podPair.APIPod lock.Lock() exists := killing.Has(string(runningPod.ID)) if !exists { killing.Insert(string(runningPod.ID)) } lock.Unlock() if !exists { go func(apiPod *v1.Pod, runningPod *kubecontainer.Pod) { glog.V(2).Infof("Killing unwanted pod %q", runningPod.Name) err := kl.killPod(apiPod, runningPod, nil, nil) if err != nil { glog.Errorf("Failed killing the pod %q: %v", runningPod.Name, err) } lock.Lock() killing.Delete(string(runningPod.ID)) lock.Unlock() }(apiPod, runningPod) } } } } // validateContainerLogStatus returns the container ID for the desired container to retrieve logs for, based on the state // of the container. The previous flag will only return the logs for the last terminated container, otherwise, the current // running container is preferred over a previous termination. If info about the container is not available then a specific // error is returned to the end user. func (kl *Kubelet) validateContainerLogStatus(podName string, podStatus *v1.PodStatus, containerName string, previous bool) (containerID kubecontainer.ContainerID, err error) { var cID string cStatus, found := podutil.GetContainerStatus(podStatus.ContainerStatuses, containerName) // if not found, check the init containers if !found { cStatus, found = podutil.GetContainerStatus(podStatus.InitContainerStatuses, containerName) } if !found { return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is not available", containerName, podName) } lastState := cStatus.LastTerminationState waiting, running, terminated := cStatus.State.Waiting, cStatus.State.Running, cStatus.State.Terminated switch { case previous: if lastState.Terminated == nil || lastState.Terminated.ContainerID == "" { return kubecontainer.ContainerID{}, fmt.Errorf("previous terminated container %q in pod %q not found", containerName, podName) } cID = lastState.Terminated.ContainerID case running != nil: cID = cStatus.ContainerID case terminated != nil: // in cases where the next container didn't start, terminated.ContainerID will be empty, so get logs from the lastState.Terminated. if terminated.ContainerID == "" { if lastState.Terminated != nil && lastState.Terminated.ContainerID != "" { cID = lastState.Terminated.ContainerID } else { return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is terminated", containerName, podName) } } else { cID = terminated.ContainerID } case lastState.Terminated != nil: if lastState.Terminated.ContainerID == "" { return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is terminated", containerName, podName) } cID = lastState.Terminated.ContainerID case waiting != nil: // output some info for the most common pending failures switch reason := waiting.Reason; reason { case images.ErrImagePull.Error(): return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: image can't be pulled", containerName, podName) case images.ErrImagePullBackOff.Error(): return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: trying and failing to pull image", containerName, podName) default: return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: %v", containerName, podName, reason) } default: // unrecognized state return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start - no logs yet", containerName, podName) } return kubecontainer.ParseContainerID(cID), nil } // GetKubeletContainerLogs returns logs from the container // TODO: this method is returning logs of random container attempts, when it should be returning the most recent attempt // or all of them. func (kl *Kubelet) GetKubeletContainerLogs(podFullName, containerName string, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) error { // Pod workers periodically write status to statusManager. If status is not // cached there, something is wrong (or kubelet just restarted and hasn't // caught up yet). Just assume the pod is not ready yet. name, namespace, err := kubecontainer.ParsePodFullName(podFullName) if err != nil { return fmt.Errorf("unable to parse pod full name %q: %v", podFullName, err) } pod, ok := kl.GetPodByName(namespace, name) if !ok { return fmt.Errorf("pod %q cannot be found - no logs available", name) } podUID := pod.UID if mirrorPod, ok := kl.podManager.GetMirrorPodByPod(pod); ok { podUID = mirrorPod.UID } podStatus, found := kl.statusManager.GetPodStatus(podUID) if !found { // If there is no cached status, use the status from the // apiserver. This is useful if kubelet has recently been // restarted. podStatus = pod.Status } // TODO: Consolidate the logic here with kuberuntime.GetContainerLogs, here we convert container name to containerID, // but inside kuberuntime we convert container id back to container name and restart count. // TODO: After separate container log lifecycle management, we should get log based on the existing log files // instead of container status. containerID, err := kl.validateContainerLogStatus(pod.Name, &podStatus, containerName, logOptions.Previous) if err != nil { return err } // Do a zero-byte write to stdout before handing off to the container runtime. // This ensures at least one Write call is made to the writer when copying starts, // even if we then block waiting for log output from the container. if _, err := stdout.Write([]byte{}); err != nil { return err } if kl.dockerLegacyService != nil { // dockerLegacyService should only be non-nil when we actually need it, so // inject it into the runtimeService. // TODO(random-liu): Remove this hack after deprecating unsupported log driver. return kl.dockerLegacyService.GetContainerLogs(pod, containerID, logOptions, stdout, stderr) } return kl.containerRuntime.GetContainerLogs(pod, containerID, logOptions, stdout, stderr) } // getPhase returns the phase of a pod given its container info. func getPhase(spec *v1.PodSpec, info []v1.ContainerStatus) v1.PodPhase { initialized := 0 pendingInitialization := 0 failedInitialization := 0 for _, container := range spec.InitContainers { containerStatus, ok := podutil.GetContainerStatus(info, container.Name) if !ok { pendingInitialization++ continue } switch { case containerStatus.State.Running != nil: pendingInitialization++ case containerStatus.State.Terminated != nil: if containerStatus.State.Terminated.ExitCode == 0 { initialized++ } else { failedInitialization++ } case containerStatus.State.Waiting != nil: if containerStatus.LastTerminationState.Terminated != nil { if containerStatus.LastTerminationState.Terminated.ExitCode == 0 { initialized++ } else { failedInitialization++ } } else { pendingInitialization++ } default: pendingInitialization++ } } unknown := 0 running := 0 waiting := 0 stopped := 0 failed := 0 succeeded := 0 for _, container := range spec.Containers { containerStatus, ok := podutil.GetContainerStatus(info, container.Name) if !ok { unknown++ continue } switch { case containerStatus.State.Running != nil: running++ case containerStatus.State.Terminated != nil: stopped++ if containerStatus.State.Terminated.ExitCode == 0 { succeeded++ } else { failed++ } case containerStatus.State.Waiting != nil: if containerStatus.LastTerminationState.Terminated != nil { stopped++ } else { waiting++ } default: unknown++ } } if failedInitialization > 0 && spec.RestartPolicy == v1.RestartPolicyNever { return v1.PodFailed } switch { case pendingInitialization > 0: fallthrough case waiting > 0: glog.V(5).Infof("pod waiting > 0, pending") // One or more containers has not been started return v1.PodPending case running > 0 && unknown == 0: // All containers have been started, and at least // one container is running return v1.PodRunning case running == 0 && stopped > 0 && unknown == 0: // All containers are terminated if spec.RestartPolicy == v1.RestartPolicyAlways { // All containers are in the process of restarting return v1.PodRunning } if stopped == succeeded { // RestartPolicy is not Always, and all // containers are terminated in success return v1.PodSucceeded } if spec.RestartPolicy == v1.RestartPolicyNever { // RestartPolicy is Never, and all containers are // terminated with at least one in failure return v1.PodFailed } // RestartPolicy is OnFailure, and at least one in failure // and in the process of restarting return v1.PodRunning default: glog.V(5).Infof("pod default case, pending") return v1.PodPending } } // generateAPIPodStatus creates the final API pod status for a pod, given the // internal pod status. func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodStatus { glog.V(3).Infof("Generating status for %q", format.Pod(pod)) // check if an internal module has requested the pod is evicted. for _, podSyncHandler := range kl.PodSyncHandlers { if result := podSyncHandler.ShouldEvict(pod); result.Evict { return v1.PodStatus{ Phase: v1.PodFailed, Reason: result.Reason, Message: result.Message, } } } s := kl.convertStatusToAPIStatus(pod, podStatus) // Assume info is ready to process spec := &pod.Spec allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...) s.Phase = getPhase(spec, allStatus) // Check for illegal phase transition if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded { // API server shows terminal phase; transitions are not allowed if s.Phase != pod.Status.Phase { glog.Errorf("Pod attempted illegal phase transition from %s to %s: %v", pod.Status.Phase, s.Phase, s) // Force back to phase from the API server s.Phase = pod.Status.Phase } } kl.probeManager.UpdatePodStatus(pod.UID, s) s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(spec, s.InitContainerStatuses, s.Phase)) s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(spec, s.ContainerStatuses, s.Phase)) // s (the PodStatus we are creating) will not have a PodScheduled condition yet, because converStatusToAPIStatus() // does not create one. If the existing PodStatus has a PodScheduled condition, then copy it into s and make sure // it is set to true. If the existing PodStatus does not have a PodScheduled condition, then create one that is set to true. if _, oldPodScheduled := podutil.GetPodCondition(&pod.Status, v1.PodScheduled); oldPodScheduled != nil { s.Conditions = append(s.Conditions, *oldPodScheduled) } podutil.UpdatePodCondition(s, &v1.PodCondition{ Type: v1.PodScheduled, Status: v1.ConditionTrue, }) if kl.kubeClient != nil { hostIP, err := kl.getHostIPAnyWay() if err != nil { glog.V(4).Infof("Cannot get host IP: %v", err) } else { s.HostIP = hostIP.String() if kubecontainer.IsHostNetworkPod(pod) && s.PodIP == "" { s.PodIP = hostIP.String() } } } return *s } // convertStatusToAPIStatus creates an api PodStatus for the given pod from // the given internal pod status. It is purely transformative and does not // alter the kubelet state at all. func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *v1.PodStatus { var apiPodStatus v1.PodStatus apiPodStatus.PodIP = podStatus.IP // set status for Pods created on versions of kube older than 1.6 apiPodStatus.QOSClass = v1qos.GetPodQOS(pod) oldPodStatus, found := kl.statusManager.GetPodStatus(pod.UID) if !found { oldPodStatus = pod.Status } apiPodStatus.ContainerStatuses = kl.convertToAPIContainerStatuses( pod, podStatus, oldPodStatus.ContainerStatuses, pod.Spec.Containers, len(pod.Spec.InitContainers) > 0, false, ) apiPodStatus.InitContainerStatuses = kl.convertToAPIContainerStatuses( pod, podStatus, oldPodStatus.InitContainerStatuses, pod.Spec.InitContainers, len(pod.Spec.InitContainers) > 0, true, ) return &apiPodStatus } // convertToAPIContainerStatuses converts the given internal container // statuses into API container statuses. func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecontainer.PodStatus, previousStatus []v1.ContainerStatus, containers []v1.Container, hasInitContainers, isInitContainer bool) []v1.ContainerStatus { convertContainerStatus := func(cs *kubecontainer.ContainerStatus) *v1.ContainerStatus { cid := cs.ID.String() status := &v1.ContainerStatus{ Name: cs.Name, RestartCount: int32(cs.RestartCount), Image: cs.Image, ImageID: cs.ImageID, ContainerID: cid, } switch cs.State { case kubecontainer.ContainerStateRunning: status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)} case kubecontainer.ContainerStateCreated: // Treat containers in the "created" state as if they are exited. // The pod workers are supposed start all containers it creates in // one sync (syncPod) iteration. There should not be any normal // "created" containers when the pod worker generates the status at // the beginning of a sync iteration. fallthrough case kubecontainer.ContainerStateExited: status.State.Terminated = &v1.ContainerStateTerminated{ ExitCode: int32(cs.ExitCode), Reason: cs.Reason, Message: cs.Message, StartedAt: metav1.NewTime(cs.StartedAt), FinishedAt: metav1.NewTime(cs.FinishedAt), ContainerID: cid, } default: status.State.Waiting = &v1.ContainerStateWaiting{} } return status } // Fetch old containers statuses from old pod status. oldStatuses := make(map[string]v1.ContainerStatus, len(containers)) for _, status := range previousStatus { oldStatuses[status.Name] = status } // Set all container statuses to default waiting state statuses := make(map[string]*v1.ContainerStatus, len(containers)) defaultWaitingState := v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: "ContainerCreating"}} if hasInitContainers { defaultWaitingState = v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: "PodInitializing"}} } for _, container := range containers { status := &v1.ContainerStatus{ Name: container.Name, Image: container.Image, State: defaultWaitingState, } oldStatus, found := oldStatuses[container.Name] if found { if oldStatus.State.Terminated != nil { // Do not update status on terminated init containers as // they be removed at any time. status = &oldStatus } else { // Apply some values from the old statuses as the default values. status.RestartCount = oldStatus.RestartCount status.LastTerminationState = oldStatus.LastTerminationState } } statuses[container.Name] = status } // Make the latest container status comes first. sort.Sort(sort.Reverse(kubecontainer.SortContainerStatusesByCreationTime(podStatus.ContainerStatuses))) // Set container statuses according to the statuses seen in pod status containerSeen := map[string]int{} for _, cStatus := range podStatus.ContainerStatuses { cName := cStatus.Name if _, ok := statuses[cName]; !ok { // This would also ignore the infra container. continue } if containerSeen[cName] >= 2 { continue } status := convertContainerStatus(cStatus) if containerSeen[cName] == 0 { statuses[cName] = status } else { statuses[cName].LastTerminationState = status.State } containerSeen[cName] = containerSeen[cName] + 1 } // Handle the containers failed to be started, which should be in Waiting state. for _, container := range containers { if isInitContainer { // If the init container is terminated with exit code 0, it won't be restarted. // TODO(random-liu): Handle this in a cleaner way. s := podStatus.FindContainerStatusByName(container.Name) if s != nil && s.State == kubecontainer.ContainerStateExited && s.ExitCode == 0 { continue } } // If a container should be restarted in next syncpod, it is *Waiting*. if !kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) { continue } status := statuses[container.Name] reason, ok := kl.reasonCache.Get(pod.UID, container.Name) if !ok { // In fact, we could also apply Waiting state here, but it is less informative, // and the container will be restarted soon, so we prefer the original state here. // Note that with the current implementation of ShouldContainerBeRestarted the original state here // could be: // * Waiting: There is no associated historical container and start failure reason record. // * Terminated: The container is terminated. continue } if status.State.Terminated != nil { status.LastTerminationState = status.State } status.State = v1.ContainerState{ Waiting: &v1.ContainerStateWaiting{ Reason: reason.Err.Error(), Message: reason.Message, }, } statuses[container.Name] = status } var containerStatuses []v1.ContainerStatus for _, status := range statuses { containerStatuses = append(containerStatuses, *status) } // Sort the container statuses since clients of this interface expect the list // of containers in a pod has a deterministic order. if isInitContainer { kubetypes.SortInitContainerStatuses(pod, containerStatuses) } else { sort.Sort(kubetypes.SortedContainerStatuses(containerStatuses)) } return containerStatuses } // ServeLogs returns logs of current machine. func (kl *Kubelet) ServeLogs(w http.ResponseWriter, req *http.Request) { // TODO: whitelist logs we are willing to serve kl.logServer.ServeHTTP(w, req) } // findContainer finds and returns the container with the given pod ID, full name, and container name. // It returns nil if not found. func (kl *Kubelet) findContainer(podFullName string, podUID types.UID, containerName string) (*kubecontainer.Container, error) { pods, err := kl.containerRuntime.GetPods(false) if err != nil { return nil, err } // Resolve and type convert back again. // We need the static pod UID but the kubecontainer API works with types.UID. podUID = types.UID(kl.podManager.TranslatePodUID(podUID)) pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID) return pod.FindContainerByName(containerName), nil } // RunInContainer runs a command in a container, returns the combined stdout, stderr as an array of bytes func (kl *Kubelet) RunInContainer(podFullName string, podUID types.UID, containerName string, cmd []string) ([]byte, error) { container, err := kl.findContainer(podFullName, podUID, containerName) if err != nil { return nil, err } if container == nil { return nil, fmt.Errorf("container not found (%q)", containerName) } // TODO(tallclair): Pass a proper timeout value. return kl.runner.RunInContainer(container.ID, cmd, 0) } // ExecInContainer executes a command in a container, connecting the supplied // stdin/stdout/stderr to the command's IO streams. func (kl *Kubelet) ExecInContainer(podFullName string, podUID types.UID, containerName string, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error { streamingRuntime, ok := kl.containerRuntime.(kubecontainer.DirectStreamingRuntime) if !ok { return fmt.Errorf("streaming methods not supported by runtime") } container, err := kl.findContainer(podFullName, podUID, containerName) if err != nil { return err } if container == nil { return fmt.Errorf("container not found (%q)", containerName) } return streamingRuntime.ExecInContainer(container.ID, cmd, stdin, stdout, stderr, tty, resize, timeout) } // AttachContainer uses the container runtime to attach the given streams to // the given container. func (kl *Kubelet) AttachContainer(podFullName string, podUID types.UID, containerName string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize) error { streamingRuntime, ok := kl.containerRuntime.(kubecontainer.DirectStreamingRuntime) if !ok { return fmt.Errorf("streaming methods not supported by runtime") } container, err := kl.findContainer(podFullName, podUID, containerName) if err != nil { return err } if container == nil { return fmt.Errorf("container not found (%q)", containerName) } return streamingRuntime.AttachContainer(container.ID, stdin, stdout, stderr, tty, resize) } // PortForward connects to the pod's port and copies data between the port // and the stream. func (kl *Kubelet) PortForward(podFullName string, podUID types.UID, port int32, stream io.ReadWriteCloser) error { streamingRuntime, ok := kl.containerRuntime.(kubecontainer.DirectStreamingRuntime) if !ok { return fmt.Errorf("streaming methods not supported by runtime") } pods, err := kl.containerRuntime.GetPods(false) if err != nil { return err } // Resolve and type convert back again. // We need the static pod UID but the kubecontainer API works with types.UID. podUID = types.UID(kl.podManager.TranslatePodUID(podUID)) pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID) if pod.IsEmpty() { return fmt.Errorf("pod not found (%q)", podFullName) } return streamingRuntime.PortForward(&pod, port, stream) } // GetExec gets the URL the exec will be served from, or nil if the Kubelet will serve it. func (kl *Kubelet) GetExec(podFullName string, podUID types.UID, containerName string, cmd []string, streamOpts remotecommandserver.Options) (*url.URL, error) { switch streamingRuntime := kl.containerRuntime.(type) { case kubecontainer.DirectStreamingRuntime: // Kubelet will serve the exec directly. return nil, nil case kubecontainer.IndirectStreamingRuntime: container, err := kl.findContainer(podFullName, podUID, containerName) if err != nil { return nil, err } if container == nil { return nil, fmt.Errorf("container not found (%q)", containerName) } return streamingRuntime.GetExec(container.ID, cmd, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, streamOpts.TTY) default: return nil, fmt.Errorf("container runtime does not support exec") } } // GetAttach gets the URL the attach will be served from, or nil if the Kubelet will serve it. func (kl *Kubelet) GetAttach(podFullName string, podUID types.UID, containerName string, streamOpts remotecommandserver.Options) (*url.URL, error) { switch streamingRuntime := kl.containerRuntime.(type) { case kubecontainer.DirectStreamingRuntime: // Kubelet will serve the attach directly. return nil, nil case kubecontainer.IndirectStreamingRuntime: container, err := kl.findContainer(podFullName, podUID, containerName) if err != nil { return nil, err } if container == nil { return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName) } // The TTY setting for attach must match the TTY setting in the initial container configuration, // since whether the process is running in a TTY cannot be changed after it has started. We // need the api.Pod to get the TTY status. pod, found := kl.GetPodByFullName(podFullName) if !found || (string(podUID) != "" && pod.UID != podUID) { return nil, fmt.Errorf("pod %s not found", podFullName) } containerSpec := kubecontainer.GetContainerSpec(pod, containerName) if containerSpec == nil { return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName) } tty := containerSpec.TTY return streamingRuntime.GetAttach(container.ID, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, tty) default: return nil, fmt.Errorf("container runtime does not support attach") } } // GetPortForward gets the URL the port-forward will be served from, or nil if the Kubelet will serve it. func (kl *Kubelet) GetPortForward(podName, podNamespace string, podUID types.UID, portForwardOpts portforward.V4Options) (*url.URL, error) { switch streamingRuntime := kl.containerRuntime.(type) { case kubecontainer.DirectStreamingRuntime: // Kubelet will serve the attach directly. return nil, nil case kubecontainer.IndirectStreamingRuntime: pods, err := kl.containerRuntime.GetPods(false) if err != nil { return nil, err } // Resolve and type convert back again. // We need the static pod UID but the kubecontainer API works with types.UID. podUID = types.UID(kl.podManager.TranslatePodUID(podUID)) podFullName := kubecontainer.BuildPodFullName(podName, podNamespace) pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID) if pod.IsEmpty() { return nil, fmt.Errorf("pod not found (%q)", podFullName) } return streamingRuntime.GetPortForward(podName, podNamespace, podUID, portForwardOpts.Ports) default: return nil, fmt.Errorf("container runtime does not support port-forward") } } // cleanupOrphanedPodCgroups removes cgroups that should no longer exist. // it reconciles the cached state of cgroupPods with the specified list of runningPods func (kl *Kubelet) cleanupOrphanedPodCgroups(cgroupPods map[types.UID]cm.CgroupName, activePods []*v1.Pod) { // Add all running pods to the set that we want to preserve podSet := sets.NewString() for _, pod := range activePods { podSet.Insert(string(pod.UID)) } pcm := kl.containerManager.NewPodContainerManager() // Iterate over all the found pods to verify if they should be running for uid, val := range cgroupPods { // if the pod is in the running set, its not a candidate for cleanup if podSet.Has(string(uid)) { continue } // If volumes have not been unmounted/detached, do not delete the cgroup // so any memory backed volumes don't have their charges propagated to the // parent croup. If the volumes still exist, reduce the cpu shares for any // process in the cgroup to the minimum value while we wait. if the kubelet // is configured to keep terminated volumes, we will delete the cgroup and not block. if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist && !kl.keepTerminatedPodVolumes { glog.V(3).Infof("Orphaned pod %q found, but volumes not yet removed. Reducing cpu to minimum", uid) if err := pcm.ReduceCPULimits(val); err != nil { glog.Warningf("Failed to reduce cpu time for pod %q pending volume cleanup due to %v", uid, err) } continue } glog.V(3).Infof("Orphaned pod %q found, removing pod cgroups", uid) // Destroy all cgroups of pod that should not be running, // by first killing all the attached processes to these cgroups. // We ignore errors thrown by the method, as the housekeeping loop would // again try to delete these unwanted pod cgroups go pcm.Destroy(val) } } // enableHostUserNamespace determines if the host user namespace should be used by the container runtime. // Returns true if the pod is using a host pid, pic, or network namespace, the pod is using a non-namespaced // capability, the pod contains a privileged container, or the pod has a host path volume. // // NOTE: when if a container shares any namespace with another container it must also share the user namespace // or it will not have the correct capabilities in the namespace. This means that host user namespace // is enabled per pod, not per container. func (kl *Kubelet) enableHostUserNamespace(pod *v1.Pod) bool { if kubecontainer.HasPrivilegedContainer(pod) || hasHostNamespace(pod) || hasHostVolume(pod) || hasNonNamespacedCapability(pod) || kl.hasHostMountPVC(pod) { return true } return false } // hasNonNamespacedCapability returns true if MKNOD, SYS_TIME, or SYS_MODULE is requested for any container. func hasNonNamespacedCapability(pod *v1.Pod) bool { for _, c := range pod.Spec.Containers { if c.SecurityContext != nil && c.SecurityContext.Capabilities != nil { for _, cap := range c.SecurityContext.Capabilities.Add { if cap == "MKNOD" || cap == "SYS_TIME" || cap == "SYS_MODULE" { return true } } } } return false } // hasHostVolume returns true if the pod spec has a HostPath volume. func hasHostVolume(pod *v1.Pod) bool { for _, v := range pod.Spec.Volumes { if v.HostPath != nil { return true } } return false } // hasHostNamespace returns true if hostIPC, hostNetwork, or hostPID are set to true. func hasHostNamespace(pod *v1.Pod) bool { if pod.Spec.SecurityContext == nil { return false } return pod.Spec.HostIPC || pod.Spec.HostNetwork || pod.Spec.HostPID } // hasHostMountPVC returns true if a PVC is referencing a HostPath volume. func (kl *Kubelet) hasHostMountPVC(pod *v1.Pod) bool { for _, volume := range pod.Spec.Volumes { if volume.PersistentVolumeClaim != nil { pvc, err := kl.kubeClient.CoreV1().PersistentVolumeClaims(pod.Namespace).Get(volume.PersistentVolumeClaim.ClaimName, metav1.GetOptions{}) if err != nil { glog.Warningf("unable to retrieve pvc %s:%s - %v", pod.Namespace, volume.PersistentVolumeClaim.ClaimName, err) continue } if pvc != nil { referencedVolume, err := kl.kubeClient.CoreV1().PersistentVolumes().Get(pvc.Spec.VolumeName, metav1.GetOptions{}) if err != nil { glog.Warningf("unable to retrieve pv %s - %v", pvc.Spec.VolumeName, err) continue } if referencedVolume != nil && referencedVolume.Spec.HostPath != nil { return true } } } } return false }