/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rkt

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	appcschema "github.com/appc/spec/schema"
	appctypes "github.com/appc/spec/schema/types"
	"github.com/coreos/go-systemd/unit"
	rktapi "github.com/coreos/rkt/api/v1alpha"
	"github.com/golang/glog"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubetypes "k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/uuid"
	utilwait "k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/tools/remotecommand"
	"k8s.io/client-go/util/flowcontrol"
	"k8s.io/kubernetes/pkg/credentialprovider"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/kubelet/images"
	"k8s.io/kubernetes/pkg/kubelet/leaky"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/network"
	"k8s.io/kubernetes/pkg/kubelet/network/hairpin"
	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
	"k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/util/format"
	"k8s.io/kubernetes/pkg/securitycontext"
	"k8s.io/kubernetes/pkg/util/selinux"
	utilstrings "k8s.io/kubernetes/pkg/util/strings"
	"k8s.io/kubernetes/pkg/util/term"
	utilexec "k8s.io/utils/exec"
)

const (
	// RktType is the string that names this runtime.
	// DefaultRktAPIServiceEndpoint carries the same value as
	// defaultRktAPIServiceAddr below; presumably it is the default address of
	// the rkt API service for callers outside this package — confirm at the
	// call sites.
	RktType                      = "rkt"
	DefaultRktAPIServiceEndpoint = "localhost:15441"

	// Minimum component versions. NOTE(review): where these are enforced is
	// not visible in this part of the file.
	minimumRktBinVersion  = "1.13.0"
	minimumRktApiVersion  = "1.0.0-alpha"
	minimumSystemdVersion = "219"

	// systemdServiceDir is the directory serviceFilePath joins unit file
	// names onto.
	systemdServiceDir = "/run/systemd/system"
	rktDataDir        = "/var/lib/rkt"
	rktLocalConfigDir = "/etc/rkt"

	// kubernetesUnitPrefix is prepended to the rkt pod UUID to form the unit
	// file name (see makePodServiceFileName). The unit* names are presumably
	// the section and keys of the X-Kubernetes directive mirrored by
	// podServiceDirective — confirm where the unit file is written.
	kubernetesUnitPrefix    = "k8s_"
	unitKubernetesSection   = "X-Kubernetes"
	unitPodUID              = "PodUID"
	unitPodName             = "PodName"
	unitPodNamespace        = "PodNamespace"
	unitPodHostNetwork      = "PodHostNetwork"
	unitPodNetworkNamespace = "PodNetworkNamespace"

	// Annotation keys set on rkt pod/app manifests, plus the pod annotation
	// that selects the systemd LimitNOFILE option.
	k8sRktKubeletAnno                = "rkt.kubernetes.io/managed-by-kubelet"
	k8sRktKubeletAnnoValue           = "true"
	k8sRktContainerHashAnno          = "rkt.kubernetes.io/container-hash"
	k8sRktRestartCountAnno           = "rkt.kubernetes.io/restart-count"
	k8sRktTerminationMessagePathAnno = "rkt.kubernetes.io/termination-message-path"
	k8sRktLimitNoFileAnno            = "systemd-unit-option.rkt.kubernetes.io/LimitNOFILE"

	// TODO(euank): This has significant security concerns as a stage1 image is
	// effectively root.
	// Furthermore, this (using an annotation) is a hack to pass an extra
	// non-portable argument in. It should not be relied on to be stable.
	// In the future, this might be subsumed by a first-class api object, or by a
	// kitchen-sink params object (#17064).
	// See discussion in #23944
	// Also, do we want more granularity than path-at-the-kubelet-level and
	// image/name-at-the-pod-level?
	k8sRktStage1NameAnno = "rkt.alpha.kubernetes.io/stage1-name-override"

	dockerPrefix = "docker://"

	authDir            = "auth.d"
	dockerAuthTemplate = `{"rktKind":"dockerAuth","rktVersion":"v1","registries":[%q],"credentials":{"user":%q,"password":%q}}`

	defaultRktAPIServiceAddr = "localhost:15441"

	// ndots specifies the minimum number of dots that a domain name must contain
	// for the resolver to consider it a FQDN (fully-qualified domain name).
	// We want SRV lookup names like _dns._udp.kube-dns.default.svc to be
	// considered relative, hence ndots is set to 5.
	// TODO(yifan): Move this and dockertools.ndotsDNSOption to a common package.
	defaultDNSOption = "ndots:5"

	// Annotations for the ENTRYPOINT and CMD for an ACI that's converted from Docker image.
	// Taken from https://github.com/appc/docker2aci/blob/v0.12.3/lib/common/common.go#L33
	appcDockerEntrypoint  = "appc.io/docker/entrypoint"
	appcDockerCmd         = "appc.io/docker/cmd"
	appcDockerRegistryURL = "appc.io/docker/registryurl"
	appcDockerRepository  = "appc.io/docker/repository"

	// TODO(yifan): Reuse this const with Docker runtime.
	minimumGracePeriodInSeconds = 2

	// The network name of the network when no-op plugin is being used.
	// TODO(yifan): This is not ideal since today we cannot make the rkt's 'net.d' dir point to the
	// CNI directory specified by kubelet. Once that is fixed, we can just use the network config
	// under the CNI directory directly.
	// See https://github.com/coreos/rkt/pull/2312#issuecomment-200068370.
	defaultNetworkName = "rkt.kubernetes.io"

	// defaultRequestTimeout is the default timeout of rkt requests.
	// Value is slightly offset from 2 minutes to make timeouts due to this
	// constant recognizable.
	defaultRequestTimeout = 2*time.Minute - 1*time.Second

	// Container paths that makeHostNetworkMount copies from the host and
	// mounts into host-network pods.
	etcHostsPath      = "/etc/hosts"
	etcResolvConfPath = "/etc/resolv.conf"
)

// Runtime implements the container runtime interface for rkt. The
// implementation uses systemd, so in order to run this runtime, systemd must
// be installed on the machine.
type Runtime struct {
	// cli runs rkt command lines; New points it at the Runtime itself.
	cli     cliInterface
	systemd systemdInterface
	// The grpc client for rkt api-service.
	apisvcConn *grpc.ClientConn
	apisvc     rktapi.PublicAPIClient
	// config is the rkt configuration used when RunCommand is given a nil config.
	config *Config
	// TODO(yifan): Refactor this to be generic keyring.
	dockerKeyring credentialprovider.DockerKeyring

	containerRefManager *kubecontainer.RefManager
	podDeletionProvider podDeletionProvider
	runtimeHelper       kubecontainer.RuntimeHelper
	recorder            record.EventRecorder
	livenessManager     proberesults.Manager
	imagePuller         images.ImageManager
	runner              kubecontainer.HandlerRunner
	execer              utilexec.Interface
	// os abstracts the OS calls made by this runtime (Stat, Create and
	// Hostname are the ones used in this part of the file).
	os kubecontainer.OSInterface

	// Network plugin manager.
	network *network.PluginManager

	// If true, the "hairpin mode" flag is set on container interfaces.
	// A false value means the kubelet just backs off from setting it,
	// it might already be true.
	configureHairpinMode bool

	// used for a systemd Exec, which requires the full path.
	touchPath   string
	nsenterPath string

	versions versions

	// requestTimeout is the timeout of rkt requests.
	requestTimeout time.Duration

	// unitGetter reads pod information back from systemd unit files; New
	// points it at the Runtime itself.
	unitGetter unitServiceGetter
}

// podServiceDirective holds the fields of the X-Kubernetes directive of a
// systemd service file.
type podServiceDirective struct {
	id               string
	name             string
	namespace        string
	hostNetwork      bool
	networkNamespace kubecontainer.ContainerID
}

// Compile-time checks that *Runtime satisfies the kubelet runtime interfaces.
var _ kubecontainer.Runtime = &Runtime{}
var _ kubecontainer.DirectStreamingRuntime = &Runtime{}

// podDeletionProvider can determine if a pod is deleted.
type podDeletionProvider interface {
	IsPodDeleted(kubetypes.UID) bool
}

// cliInterface wraps the command line calls for testing purposes.
type cliInterface interface {
	// RunCommand creates an rkt command and runs it with the given config.
	// If the config is nil, it will use the one inferred from rkt API service.
	RunCommand(config *Config, args ...string) (result []string, err error)
}

// unitServiceGetter wraps the reading of systemd unit files for testing
// purposes.
type unitServiceGetter interface {
	getKubernetesDirective(string) (podServiceDirective, error)
	getNetworkNamespace(kubetypes.UID, *rktapi.Pod) (kubecontainer.ContainerID, error)
}

// New creates the rkt container runtime which implements the container runtime interface.
// It will test if the rkt binary is in the $PATH, and whether we can get the
// version of it. If so, creates the rkt container runtime, otherwise returns an error.
func New( apiEndpoint string, config *Config, runtimeHelper kubecontainer.RuntimeHelper, recorder record.EventRecorder, containerRefManager *kubecontainer.RefManager, podDeletionProvider podDeletionProvider, livenessManager proberesults.Manager, httpClient types.HttpGetter, networkPlugin network.NetworkPlugin, hairpinMode bool, execer utilexec.Interface, os kubecontainer.OSInterface, imageBackOff *flowcontrol.Backoff, serializeImagePulls bool, imagePullQPS float32, imagePullBurst int, requestTimeout time.Duration, ) (*Runtime, error) { // Create dbus connection. systemd, err := newSystemd() if err != nil { return nil, fmt.Errorf("rkt: cannot create systemd interface: %v", err) } // TODO(yifan): Use secure connection. apisvcConn, err := grpc.Dial(apiEndpoint, grpc.WithInsecure()) if err != nil { return nil, fmt.Errorf("rkt: cannot connect to rkt api service: %v", err) } // TODO(yifan): Get the rkt path from API service. if config.Path == "" { // No default rkt path was set, so try to find one in $PATH. 
var err error config.Path, err = execer.LookPath("rkt") if err != nil { return nil, fmt.Errorf("cannot find rkt binary: %v", err) } } touchPath, err := execer.LookPath("touch") if err != nil { return nil, fmt.Errorf("cannot find touch binary: %v", err) } nsenterPath, err := execer.LookPath("nsenter") if err != nil { return nil, fmt.Errorf("cannot find nsenter binary: %v", err) } if requestTimeout == 0 { requestTimeout = defaultRequestTimeout } rkt := &Runtime{ os: kubecontainer.RealOS{}, systemd: systemd, apisvcConn: apisvcConn, apisvc: rktapi.NewPublicAPIClient(apisvcConn), config: config, dockerKeyring: credentialprovider.NewDockerKeyring(), containerRefManager: containerRefManager, podDeletionProvider: podDeletionProvider, runtimeHelper: runtimeHelper, recorder: recorder, livenessManager: livenessManager, network: network.NewPluginManager(networkPlugin), execer: execer, touchPath: touchPath, nsenterPath: nsenterPath, requestTimeout: requestTimeout, } rkt.config, err = rkt.getConfig(rkt.config) if err != nil { return nil, fmt.Errorf("rkt: cannot get config from rkt api service: %v", err) } cmdRunner := kubecontainer.DirectStreamingRunner(rkt) rkt.runner = lifecycle.NewHandlerRunner(httpClient, cmdRunner, rkt) rkt.imagePuller = images.NewImageManager(recorder, rkt, imageBackOff, serializeImagePulls, imagePullQPS, imagePullBurst) if err := rkt.getVersions(); err != nil { return nil, fmt.Errorf("rkt: error getting version info: %v", err) } rkt.cli = rkt rkt.unitGetter = rkt return rkt, nil } func buildCommand(config *Config, args ...string) *exec.Cmd { cmd := exec.Command(config.Path) cmd.Args = append(cmd.Args, config.buildGlobalOptions()...) cmd.Args = append(cmd.Args, args...) return cmd } // convertToACName converts a string into ACName. func convertToACName(name string) appctypes.ACName { // Note that as the 'name' already matches 'DNS_LABEL' // defined in pkg/api/types.go, there shouldn't be error or panic. 
acname, _ := appctypes.SanitizeACName(name) return *appctypes.MustACName(acname) } // RunCommand invokes rkt binary with arguments and returns the result // from stdout in a list of strings. Each string in the list is a line. // If config is non-nil, it will use the given config instead of the config // inferred from rkt API service. func (r *Runtime) RunCommand(config *Config, args ...string) ([]string, error) { if config == nil { config = r.config } glog.V(4).Infof("rkt: Run command: %q with config: %#v", args, config) var stdout, stderr bytes.Buffer cmd := buildCommand(config, args...) cmd.Stdout, cmd.Stderr = &stdout, &stderr if err := cmd.Run(); err != nil { return nil, fmt.Errorf("failed to run %v: %v\nstdout: %v\nstderr: %v", args, err, stdout.String(), stderr.String()) } return strings.Split(strings.TrimSpace(stdout.String()), "\n"), nil } // makePodServiceFileName constructs the unit file name for a pod using its rkt pod uuid. func makePodServiceFileName(uuid string) string { // TODO(yifan): Add name for readability? We need to consider the // limit of the length. return fmt.Sprintf("%s%s.service", kubernetesUnitPrefix, uuid) } func getRktUUIDFromServiceFileName(filename string) string { return strings.TrimPrefix(strings.TrimSuffix(filename, path.Ext(filename)), kubernetesUnitPrefix) } // setIsolators sets the apps' isolators according to the security context and resource spec. func setIsolators(app *appctypes.App, c *v1.Container, ctx *v1.SecurityContext) error { var isolators []appctypes.Isolator // Capabilities isolators. if ctx != nil { var addCaps, dropCaps []string if ctx.Capabilities != nil { addCaps, dropCaps = kubecontainer.MakeCapabilities(ctx.Capabilities.Add, ctx.Capabilities.Drop) } if ctx.Privileged != nil && *ctx.Privileged { addCaps, dropCaps = allCapabilities(), []string{} } if len(addCaps) > 0 { set, err := appctypes.NewLinuxCapabilitiesRetainSet(addCaps...) 
if err != nil { return err } isolator, err := set.AsIsolator() if err != nil { return err } isolators = append(isolators, *isolator) } if len(dropCaps) > 0 { set, err := appctypes.NewLinuxCapabilitiesRevokeSet(dropCaps...) if err != nil { return err } isolator, err := set.AsIsolator() if err != nil { return err } isolators = append(isolators, *isolator) } } // Resources isolators. type resource struct { limit string request string } // If limit is empty, populate it with request and vice versa. resources := make(map[v1.ResourceName]*resource) for name, quantity := range c.Resources.Limits { resources[name] = &resource{limit: quantity.String(), request: quantity.String()} } for name, quantity := range c.Resources.Requests { r, ok := resources[name] if ok { r.request = quantity.String() continue } resources[name] = &resource{limit: quantity.String(), request: quantity.String()} } for name, res := range resources { switch name { case v1.ResourceCPU: cpu, err := appctypes.NewResourceCPUIsolator(res.request, res.limit) if err != nil { return err } isolators = append(isolators, cpu.AsIsolator()) case v1.ResourceMemory: memory, err := appctypes.NewResourceMemoryIsolator(res.request, res.limit) if err != nil { return err } isolators = append(isolators, memory.AsIsolator()) default: return fmt.Errorf("resource type not supported: %v", name) } } if ok := securitycontext.AddNoNewPrivileges(ctx); ok { isolator, err := newNoNewPrivilegesIsolator(true) if err != nil { return err } isolators = append(isolators, *isolator) } mergeIsolators(app, isolators) return nil } // mergeIsolators replaces the app.Isolators with isolators. func mergeIsolators(app *appctypes.App, isolators []appctypes.Isolator) { for _, is := range isolators { found := false for j, js := range app.Isolators { if is.Name.Equals(js.Name) { switch is.Name { case appctypes.LinuxCapabilitiesRetainSetName: // TODO(yifan): More fine grain merge for capability set instead of override. 
fallthrough case appctypes.LinuxCapabilitiesRevokeSetName: fallthrough case appctypes.ResourceCPUName: fallthrough case appctypes.ResourceMemoryName: app.Isolators[j] = is default: panic(fmt.Sprintf("unexpected isolator name: %v", is.Name)) } found = true break } } if !found { app.Isolators = append(app.Isolators, is) } } } // mergeEnv merges the optEnv with the image's environments. // The environments defined in the image will be overridden by // the ones with the same name in optEnv. func mergeEnv(app *appctypes.App, optEnv []kubecontainer.EnvVar) { envMap := make(map[string]string) for _, e := range app.Environment { envMap[e.Name] = e.Value } for _, e := range optEnv { envMap[e.Name] = e.Value } app.Environment = nil for name, value := range envMap { app.Environment = append(app.Environment, appctypes.EnvironmentVariable{ Name: name, Value: value, }) } } // mergeMounts merges the mountPoints with the image's mount points. // The mount points defined in the image will be overridden by the ones // with the same container path. func mergeMounts(app *appctypes.App, mountPoints []appctypes.MountPoint) { mountMap := make(map[string]appctypes.MountPoint) for _, m := range app.MountPoints { mountMap[m.Path] = m } for _, m := range mountPoints { mountMap[m.Path] = m } app.MountPoints = nil for _, mount := range mountMap { app.MountPoints = append(app.MountPoints, mount) } } // mergePortMappings merges the containerPorts with the image's container ports. // The port mappings defined in the image will be overridden by the ones // with the same name in optPortMappings. 
func mergePortMappings(app *appctypes.App, containerPorts []appctypes.Port) { portMap := make(map[appctypes.ACName]appctypes.Port) for _, p := range app.Ports { portMap[p.Name] = p } for _, p := range containerPorts { portMap[p.Name] = p } app.Ports = nil for _, port := range portMap { app.Ports = append(app.Ports, port) } } func verifyNonRoot(app *appctypes.App, ctx *v1.SecurityContext) error { if ctx != nil && ctx.RunAsNonRoot != nil && *ctx.RunAsNonRoot { if ctx.RunAsUser != nil && *ctx.RunAsUser == 0 { return fmt.Errorf("container's runAsUser breaks non-root policy") } if ctx.RunAsUser == nil && app.User == "0" { return fmt.Errorf("container has no runAsUser and image will run as root") } } return nil } func setSupplementalGIDs(app *appctypes.App, podCtx *v1.PodSecurityContext, supplementalGids []int64) { if podCtx != nil || len(supplementalGids) != 0 { app.SupplementaryGIDs = app.SupplementaryGIDs[:0] } if podCtx != nil { for _, v := range podCtx.SupplementalGroups { app.SupplementaryGIDs = append(app.SupplementaryGIDs, int(v)) } if podCtx.FSGroup != nil { app.SupplementaryGIDs = append(app.SupplementaryGIDs, int(*podCtx.FSGroup)) } } for _, v := range supplementalGids { app.SupplementaryGIDs = append(app.SupplementaryGIDs, int(v)) } } // setApp merges the container spec with the image's manifest. func setApp(imgManifest *appcschema.ImageManifest, c *v1.Container, mountPoints []appctypes.MountPoint, containerPorts []appctypes.Port, envs []kubecontainer.EnvVar, ctx *v1.SecurityContext, podCtx *v1.PodSecurityContext, supplementalGids []int64) error { app := imgManifest.App // Set up Exec. 
var command, args []string cmd, ok := imgManifest.Annotations.Get(appcDockerEntrypoint) if ok { err := json.Unmarshal([]byte(cmd), &command) if err != nil { return fmt.Errorf("cannot unmarshal ENTRYPOINT %q: %v", cmd, err) } } ag, ok := imgManifest.Annotations.Get(appcDockerCmd) if ok { err := json.Unmarshal([]byte(ag), &args) if err != nil { return fmt.Errorf("cannot unmarshal CMD %q: %v", ag, err) } } userCommand, userArgs := kubecontainer.ExpandContainerCommandAndArgs(c, envs) if len(userCommand) > 0 { command = userCommand args = nil // If 'command' is specified, then drop the default args. } if len(userArgs) > 0 { args = userArgs } exec := append(command, args...) if len(exec) > 0 { app.Exec = exec } // Set UID and GIDs. if err := verifyNonRoot(app, ctx); err != nil { return err } if ctx != nil && ctx.RunAsUser != nil { app.User = strconv.Itoa(int(*ctx.RunAsUser)) } setSupplementalGIDs(app, podCtx, supplementalGids) // If 'User' or 'Group' are still empty at this point, // then apply the root UID and GID. // TODO(yifan): If only the GID is empty, rkt should be able to determine the GID // using the /etc/passwd file in the image. // See https://github.com/appc/docker2aci/issues/175. // Maybe we can remove this check in the future. if app.User == "" { app.User = "0" app.Group = "0" } if app.Group == "" { return fmt.Errorf("cannot determine the GID of the app %q", imgManifest.Name) } // Set working directory. if len(c.WorkingDir) > 0 { app.WorkingDirectory = c.WorkingDir } // Notes that we don't create Mounts section in the pod manifest here, // as Mounts will be automatically generated by rkt. mergeMounts(app, mountPoints) mergeEnv(app, envs) mergePortMappings(app, containerPorts) return setIsolators(app, c, ctx) } // makePodManifest transforms a kubelet pod spec to the rkt pod manifest. 
func (r *Runtime) makePodManifest(pod *v1.Pod, podIP string, pullSecrets []v1.Secret) (*appcschema.PodManifest, error) { manifest := appcschema.BlankPodManifest() ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout) defer cancel() listResp, err := r.apisvc.ListPods(ctx, &rktapi.ListPodsRequest{ Detail: true, Filters: kubernetesPodFilters(pod.UID), }) if err != nil { return nil, fmt.Errorf("couldn't list pods: %v", err) } restartCount := 0 for _, pod := range listResp.Pods { manifest := &appcschema.PodManifest{} err = json.Unmarshal(pod.Manifest, manifest) if err != nil { glog.Warningf("rkt: error unmatshaling pod manifest: %v", err) continue } if countString, ok := manifest.Annotations.Get(k8sRktRestartCountAnno); ok { num, err := strconv.Atoi(countString) if err != nil { glog.Warningf("rkt: error reading restart count on pod: %v", err) continue } if num+1 > restartCount { restartCount = num + 1 } } } requiresPrivileged := false manifest.Annotations.Set(*appctypes.MustACIdentifier(k8sRktKubeletAnno), k8sRktKubeletAnnoValue) manifest.Annotations.Set(*appctypes.MustACIdentifier(types.KubernetesPodUIDLabel), string(pod.UID)) manifest.Annotations.Set(*appctypes.MustACIdentifier(types.KubernetesPodNameLabel), pod.Name) manifest.Annotations.Set(*appctypes.MustACIdentifier(types.KubernetesPodNamespaceLabel), pod.Namespace) manifest.Annotations.Set(*appctypes.MustACIdentifier(types.KubernetesContainerNameLabel), leaky.PodInfraContainerName) manifest.Annotations.Set(*appctypes.MustACIdentifier(k8sRktRestartCountAnno), strconv.Itoa(restartCount)) if stage1Name, ok := pod.Annotations[k8sRktStage1NameAnno]; ok { requiresPrivileged = true manifest.Annotations.Set(*appctypes.MustACIdentifier(k8sRktStage1NameAnno), stage1Name) } for _, c := range pod.Spec.Containers { err := r.newAppcRuntimeApp(pod, podIP, c, requiresPrivileged, pullSecrets, manifest) if err != nil { return nil, err } } // TODO(yifan): Set pod-level isolators once it's supported in 
kubernetes. return manifest, nil } func copyfile(src, dst string) error { data, err := ioutil.ReadFile(src) if err != nil { return err } return ioutil.WriteFile(dst, data, 0644) } // TODO(yifan): Can make rkt handle this when '--net=host'. See https://github.com/coreos/rkt/issues/2430. func makeHostNetworkMount(opts *kubecontainer.RunContainerOptions) (*kubecontainer.Mount, *kubecontainer.Mount, error) { mountHosts, mountResolvConf := true, true for _, mnt := range opts.Mounts { switch mnt.ContainerPath { case etcHostsPath: mountHosts = false case etcResolvConfPath: mountResolvConf = false } } var hostsMount, resolvMount kubecontainer.Mount if mountHosts { hostsPath := filepath.Join(opts.PodContainerDir, "etc-hosts") if err := copyfile(etcHostsPath, hostsPath); err != nil { return nil, nil, err } hostsMount = kubecontainer.Mount{ Name: "kubernetes-hostnetwork-hosts-conf", ContainerPath: etcHostsPath, HostPath: hostsPath, } opts.Mounts = append(opts.Mounts, hostsMount) } if mountResolvConf { resolvPath := filepath.Join(opts.PodContainerDir, "etc-resolv-conf") if err := copyfile(etcResolvConfPath, resolvPath); err != nil { return nil, nil, err } resolvMount = kubecontainer.Mount{ Name: "kubernetes-hostnetwork-resolv-conf", ContainerPath: etcResolvConfPath, HostPath: resolvPath, } opts.Mounts = append(opts.Mounts, resolvMount) } return &hostsMount, &resolvMount, nil } // podFinishedMarkerPath returns the path to a file which should be used to // indicate the pod exiting, and the time thereof. // If the file at the path does not exist, the pod should not be exited. If it // does exist, then the ctime of the file should indicate the time the pod // exited. func podFinishedMarkerPath(podDir string, rktUID string) string { return filepath.Join(podDir, "finished-"+rktUID) } func podFinishedMarkCommand(touchPath, podDir, rktUID string) string { // TODO, if the path has a `'` character in it, this breaks. 
return touchPath + " " + podFinishedMarkerPath(podDir, rktUID) } // podFinishedAt returns the time that a pod exited, or a zero time if it has // not. func (r *Runtime) podFinishedAt(podUID kubetypes.UID, rktUID string) time.Time { markerFile := podFinishedMarkerPath(r.runtimeHelper.GetPodDir(podUID), rktUID) stat, err := r.os.Stat(markerFile) if err != nil { if !os.IsNotExist(err) { glog.Warningf("rkt: unexpected fs error checking pod finished marker: %v", err) } return time.Time{} } return stat.ModTime() } func (r *Runtime) makeContainerLogMount(opts *kubecontainer.RunContainerOptions, container *v1.Container) (*kubecontainer.Mount, error) { if opts.PodContainerDir == "" || container.TerminationMessagePath == "" { return nil, nil } // In docker runtime, the container log path contains the container ID. // However, for rkt runtime, we cannot get the container ID before the // the container is launched, so here we generate a random uuid to enable // us to map a container's termination message path to a unique log file // on the disk. randomUID := uuid.NewUUID() containerLogPath := path.Join(opts.PodContainerDir, string(randomUID)) fs, err := r.os.Create(containerLogPath) if err != nil { return nil, err } if err := fs.Close(); err != nil { return nil, err } mnt := kubecontainer.Mount{ // Use a random name for the termination message mount, so that // when a container restarts, it will not overwrite the old termination // message. 
Name: fmt.Sprintf("termination-message-%s", randomUID), ContainerPath: container.TerminationMessagePath, HostPath: containerLogPath, ReadOnly: false, } opts.Mounts = append(opts.Mounts, mnt) return &mnt, nil } func (r *Runtime) newAppcRuntimeApp(pod *v1.Pod, podIP string, c v1.Container, requiresPrivileged bool, pullSecrets []v1.Secret, manifest *appcschema.PodManifest) error { var annotations appctypes.Annotations = []appctypes.Annotation{ { Name: *appctypes.MustACIdentifier(k8sRktContainerHashAnno), Value: strconv.FormatUint(kubecontainer.HashContainerLegacy(&c), 10), }, { Name: *appctypes.MustACIdentifier(types.KubernetesContainerNameLabel), Value: c.Name, }, } if requiresPrivileged && !securitycontext.HasPrivilegedRequest(&c) { return fmt.Errorf("cannot make %q: running a custom stage1 requires a privileged security context", format.Pod(pod)) } imageRef, _, err := r.imagePuller.EnsureImageExists(pod, &c, pullSecrets) if err != nil { return err } imgManifest, err := r.getImageManifest(c.Image) if err != nil { return err } if imgManifest.App == nil { imgManifest.App = new(appctypes.App) } hash, err := appctypes.NewHash(imageRef) if err != nil { return err } // TODO: determine how this should be handled for rkt opts, err := r.runtimeHelper.GenerateRunContainerOptions(pod, &c, podIP) if err != nil { return err } // Create additional mount for termination message path. mount, err := r.makeContainerLogMount(opts, &c) if err != nil { return err } mounts := append(opts.Mounts, *mount) annotations = append(annotations, appctypes.Annotation{ Name: *appctypes.MustACIdentifier(k8sRktTerminationMessagePathAnno), Value: mount.HostPath, }) // If run in 'hostnetwork' mode, then copy the host's /etc/resolv.conf and /etc/hosts, // and add mounts. 
if kubecontainer.IsHostNetworkPod(pod) { hostsMount, resolvMount, err := makeHostNetworkMount(opts) if err != nil { return err } mounts = append(mounts, *hostsMount, *resolvMount) } supplementalGids := r.runtimeHelper.GetExtraSupplementalGroupsForPod(pod) ctx := securitycontext.DetermineEffectiveSecurityContext(pod, &c) volumes, mountPoints := convertKubeMounts(mounts) containerPorts, hostPorts := convertKubePortMappings(opts.PortMappings) if err := setApp(imgManifest, &c, mountPoints, containerPorts, opts.Envs, ctx, pod.Spec.SecurityContext, supplementalGids); err != nil { return err } ra := appcschema.RuntimeApp{ Name: convertToACName(c.Name), Image: appcschema.RuntimeImage{ID: *hash}, App: imgManifest.App, Annotations: annotations, } if c.SecurityContext != nil && c.SecurityContext.ReadOnlyRootFilesystem != nil { ra.ReadOnlyRootFS = *c.SecurityContext.ReadOnlyRootFilesystem } manifest.Apps = append(manifest.Apps, ra) manifest.Volumes = append(manifest.Volumes, volumes...) manifest.Ports = append(manifest.Ports, hostPorts...) 
return nil } func runningKubernetesPodFilters(uid kubetypes.UID) []*rktapi.PodFilter { return []*rktapi.PodFilter{ { States: []rktapi.PodState{ rktapi.PodState_POD_STATE_RUNNING, }, Annotations: []*rktapi.KeyValue{ { Key: k8sRktKubeletAnno, Value: k8sRktKubeletAnnoValue, }, { Key: types.KubernetesPodUIDLabel, Value: string(uid), }, }, }, } } func kubernetesPodFilters(uid kubetypes.UID) []*rktapi.PodFilter { return []*rktapi.PodFilter{ { Annotations: []*rktapi.KeyValue{ { Key: k8sRktKubeletAnno, Value: k8sRktKubeletAnnoValue, }, { Key: types.KubernetesPodUIDLabel, Value: string(uid), }, }, }, } } func kubernetesPodsFilters() []*rktapi.PodFilter { return []*rktapi.PodFilter{ { Annotations: []*rktapi.KeyValue{ { Key: k8sRktKubeletAnno, Value: k8sRktKubeletAnnoValue, }, }, }, } } func newUnitOption(section, name, value string) *unit.UnitOption { return &unit.UnitOption{Section: section, Name: name, Value: value} } // apiPodToruntimePod converts an v1.Pod to kubelet/container.Pod. func apiPodToruntimePod(uuid string, pod *v1.Pod) *kubecontainer.Pod { p := &kubecontainer.Pod{ ID: pod.UID, Name: pod.Name, Namespace: pod.Namespace, } for i := range pod.Spec.Containers { c := &pod.Spec.Containers[i] p.Containers = append(p.Containers, &kubecontainer.Container{ ID: buildContainerID(&containerID{uuid, c.Name}), Name: c.Name, Image: c.Image, Hash: kubecontainer.HashContainerLegacy(c), }) } return p } // serviceFilePath returns the absolute path of the service file. func serviceFilePath(serviceName string) string { return path.Join(systemdServiceDir, serviceName) } // shouldCreateNetns returns true if: // The pod does not run in host network. And // The pod runs inside a netns created outside of rkt. func (r *Runtime) shouldCreateNetns(pod *v1.Pod) bool { return !kubecontainer.IsHostNetworkPod(pod) && r.network.PluginName() != network.DefaultPluginName } // usesRktHostNetwork returns true if: // The pod runs in the host network. 
Or // The pod runs inside a netns created outside of rkt. func (r *Runtime) usesRktHostNetwork(pod *v1.Pod) bool { return kubecontainer.IsHostNetworkPod(pod) || r.shouldCreateNetns(pod) } // generateRunCommand crafts a 'rkt run-prepared' command with necessary parameters. func (r *Runtime) generateRunCommand(pod *v1.Pod, uuid, networkNamespaceID string) (string, error) { config := *r.config privileged := true for _, c := range pod.Spec.Containers { ctx := securitycontext.DetermineEffectiveSecurityContext(pod, &c) if ctx == nil || ctx.Privileged == nil || *ctx.Privileged == false { privileged = false break } } // Use "all-run" insecure option (https://github.com/coreos/rkt/pull/2983) to take care // of privileged pod. // TODO(yifan): Have more granular app-level control of the insecure options. // See: https://github.com/coreos/rkt/issues/2996. if privileged { config.InsecureOptions = fmt.Sprintf("%s,%s", config.InsecureOptions, "all-run") } runPrepared := buildCommand(&config, "run-prepared").Args var hostname string var err error osInfos, err := getOSReleaseInfo() if err != nil { glog.Warningf("rkt: Failed to read the os release info: %v", err) } else { // Overlay fs is not supported for SELinux yet on many distros. // See https://github.com/coreos/rkt/issues/1727#issuecomment-173203129. // For now, coreos carries a patch to support it: https://github.com/coreos/coreos-overlay/pull/1703 if osInfos["ID"] != "coreos" && pod.Spec.SecurityContext != nil && pod.Spec.SecurityContext.SELinuxOptions != nil { runPrepared = append(runPrepared, "--no-overlay=true") } } // Apply '--net=host' to pod that is running on host network or inside a network namespace. if r.usesRktHostNetwork(pod) { runPrepared = append(runPrepared, "--net=host") } else { runPrepared = append(runPrepared, fmt.Sprintf("--net=%s", defaultNetworkName)) } if kubecontainer.IsHostNetworkPod(pod) { // TODO(yifan): Let runtimeHelper.GeneratePodHostNameAndDomain() to handle this. 
hostname, err = r.os.Hostname() if err != nil { return "", err } } else { // Setup DNS. dnsConfig, err := r.runtimeHelper.GetPodDNS(pod) if err != nil { return "", err } for _, server := range dnsConfig.Servers { runPrepared = append(runPrepared, fmt.Sprintf("--dns=%s", server)) } for _, search := range dnsConfig.Searches { runPrepared = append(runPrepared, fmt.Sprintf("--dns-search=%s", search)) } if len(dnsConfig.Servers) > 0 || len(dnsConfig.Searches) > 0 { runPrepared = append(runPrepared, fmt.Sprintf("--dns-opt=%s", defaultDNSOption)) } // TODO(yifan): host domain is not being used. hostname, _, err = r.runtimeHelper.GeneratePodHostNameAndDomain(pod) if err != nil { return "", err } } runPrepared = append(runPrepared, fmt.Sprintf("--hostname=%s", hostname)) runPrepared = append(runPrepared, uuid) if r.shouldCreateNetns(pod) { // Drop the `rkt run-prepared` into the network namespace we // created. // TODO: switch to 'ip netns exec' once we can depend on a new // enough version that doesn't have bugs like // https://bugzilla.redhat.com/show_bug.cgi?id=882047 nsenterExec := []string{r.nsenterPath, "--net=" + netnsPathFromName(networkNamespaceID), "--"} runPrepared = append(nsenterExec, runPrepared...) } return strings.Join(runPrepared, " "), nil } func (r *Runtime) cleanupPodNetwork(pod *v1.Pod, networkNamespace kubecontainer.ContainerID) error { // No-op if the pod is not running in a created netns. 
if !r.shouldCreateNetns(pod) { return nil } glog.V(3).Infof("Calling network plugin %s to tear down pod for %s", r.network.PluginName(), format.Pod(pod)) teardownErr := r.network.TearDownPod(pod.Namespace, pod.Name, networkNamespace) if teardownErr != nil { glog.Error(teardownErr) } if _, err := r.execer.Command("ip", "netns", "del", networkNamespace.ID).Output(); err != nil { return fmt.Errorf("rkt: Failed to remove network namespace for pod %s: %v", format.Pod(pod), err) } return teardownErr } func (r *Runtime) preparePodArgs(manifest *appcschema.PodManifest, manifestFileName string) []string { // Order of precedence for the stage1: // 1) pod annotation (stage1 name) // 2) kubelet configured stage1 (stage1 path) // 3) empty; whatever rkt's compiled to default to stage1ImageCmd := "" if r.config.Stage1Image != "" { stage1ImageCmd = "--stage1-name=" + r.config.Stage1Image } if stage1Name, ok := manifest.Annotations.Get(k8sRktStage1NameAnno); ok { stage1ImageCmd = "--stage1-name=" + stage1Name } // Run 'rkt prepare' to get the rkt UUID. 
cmds := []string{"prepare", "--quiet", "--pod-manifest", manifestFileName} if stage1ImageCmd != "" { cmds = append(cmds, stage1ImageCmd) } return cmds } func (r *Runtime) getSelinuxContext(opt *v1.SELinuxOptions) (string, error) { selinuxRunner := selinux.NewSELinuxRunner() str, err := selinuxRunner.Getfilecon(r.config.Dir) if err != nil { return "", err } ctx := strings.SplitN(str, ":", 4) if len(ctx) != 4 { return "", fmt.Errorf("malformated selinux context") } if opt.User != "" { ctx[0] = opt.User } if opt.Role != "" { ctx[1] = opt.Role } if opt.Type != "" { ctx[2] = opt.Type } if opt.Level != "" { ctx[3] = opt.Level } return strings.Join(ctx, ":"), nil } // From the generateName or the podName return a basename for improving the logging with the Journal // journalctl -t podBaseName func constructSyslogIdentifier(generateName string, podName string) string { if len(generateName) > 1 && generateName[len(generateName)-1] == '-' { return generateName[0 : len(generateName)-1] } if len(generateName) > 0 { return generateName } return podName } // Setup additional systemd field specified in the Pod Annotation func setupSystemdCustomFields(annotations map[string]string, unitOptionArray []*unit.UnitOption) ([]*unit.UnitOption, error) { // LimitNOFILE if strSize := annotations[k8sRktLimitNoFileAnno]; strSize != "" { size, err := strconv.Atoi(strSize) if err != nil { return unitOptionArray, err } if size < 1 { return unitOptionArray, fmt.Errorf("invalid value for %s: %s", k8sRktLimitNoFileAnno, strSize) } unitOptionArray = append(unitOptionArray, newUnitOption("Service", "LimitNOFILE", strSize)) } return unitOptionArray, nil } // preparePod will: // // 1. Invoke 'rkt prepare' to prepare the pod, and get the rkt pod uuid. // 2. Create the unit file and save it under systemdUnitDir. // // On success, it will return a string that represents name of the unit file // and the runtime pod. 
func (r *Runtime) preparePod(pod *v1.Pod, podIP string, pullSecrets []v1.Secret, networkNamespaceID string) (string, *kubecontainer.Pod, error) { // Generate the appc pod manifest from the k8s pod spec. manifest, err := r.makePodManifest(pod, podIP, pullSecrets) if err != nil { return "", nil, err } manifestFile, err := ioutil.TempFile("", fmt.Sprintf("manifest-%s-", pod.Name)) if err != nil { return "", nil, err } defer func() { manifestFile.Close() if err := r.os.Remove(manifestFile.Name()); err != nil { glog.Warningf("rkt: Cannot remove temp manifest file %q: %v", manifestFile.Name(), err) } }() data, err := json.Marshal(manifest) if err != nil { return "", nil, err } glog.V(4).Infof("Generating pod manifest for pod %q: %v", format.Pod(pod), string(data)) // Since File.Write returns error if the written length is less than len(data), // so check error is enough for us. if _, err := manifestFile.Write(data); err != nil { return "", nil, err } prepareCmd := r.preparePodArgs(manifest, manifestFile.Name()) output, err := r.cli.RunCommand(nil, prepareCmd...) if err != nil { return "", nil, err } if len(output) != 1 { return "", nil, fmt.Errorf("invalid output from 'rkt prepare': %v", output) } uuid := output[0] glog.V(4).Infof("'rkt prepare' returns %q", uuid) // Create systemd service file for the rkt pod. runPrepared, err := r.generateRunCommand(pod, uuid, networkNamespaceID) if err != nil { return "", nil, fmt.Errorf("failed to generate 'rkt run-prepared' command: %v", err) } // TODO handle pod.Spec.HostPID // TODO handle pod.Spec.HostIPC // TODO per container finishedAt, not just per pod markPodFinished := podFinishedMarkCommand(r.touchPath, r.runtimeHelper.GetPodDir(pod.UID), uuid) hostNetwork := kubecontainer.IsHostNetworkPod(pod) units := []*unit.UnitOption{ newUnitOption("Service", "ExecStart", runPrepared), newUnitOption("Service", "ExecStopPost", markPodFinished), // This enables graceful stop. 
newUnitOption("Service", "KillMode", "mixed"), newUnitOption("Service", "TimeoutStopSec", fmt.Sprintf("%ds", getPodTerminationGracePeriodInSecond(pod))), // Ops helpers newUnitOption("Unit", "Description", pod.Name), newUnitOption("Service", "SyslogIdentifier", constructSyslogIdentifier(pod.GenerateName, pod.Name)), // Track pod info for garbage collection newUnitOption(unitKubernetesSection, unitPodUID, string(pod.UID)), newUnitOption(unitKubernetesSection, unitPodName, pod.Name), newUnitOption(unitKubernetesSection, unitPodNamespace, pod.Namespace), newUnitOption(unitKubernetesSection, unitPodHostNetwork, fmt.Sprintf("%v", hostNetwork)), newUnitOption(unitKubernetesSection, unitPodNetworkNamespace, networkNamespaceID), } if pod.Spec.SecurityContext != nil && pod.Spec.SecurityContext.SELinuxOptions != nil { opt := pod.Spec.SecurityContext.SELinuxOptions selinuxContext, err := r.getSelinuxContext(opt) if err != nil { glog.Errorf("rkt: Failed to construct selinux context with selinux option %q: %v", opt, err) return "", nil, err } units = append(units, newUnitOption("Service", "SELinuxContext", selinuxContext)) } units, err = setupSystemdCustomFields(pod.Annotations, units) if err != nil { glog.Warningf("fail to add custom systemd fields provided by pod Annotations: %q", err) } serviceName := makePodServiceFileName(uuid) glog.V(4).Infof("rkt: Creating service file %q for pod %q", serviceName, format.Pod(pod)) serviceFile, err := r.os.Create(serviceFilePath(serviceName)) if err != nil { return "", nil, err } if _, err := io.Copy(serviceFile, unit.Serialize(units)); err != nil { return "", nil, err } serviceFile.Close() return serviceName, apiPodToruntimePod(uuid, pod), nil } // generateEvents is a helper function that generates some container // life cycle events for containers in a pod. func (r *Runtime) generateEvents(runtimePod *kubecontainer.Pod, reason string, failure error) { // Set up container references. 
for _, c := range runtimePod.Containers { containerID := c.ID id, err := parseContainerID(containerID) if err != nil { glog.Warningf("Invalid container ID %q", containerID) continue } ref, ok := r.containerRefManager.GetRef(containerID) if !ok { glog.Warningf("No ref for container %q", containerID) continue } // Note that 'rkt id' is the pod id. uuid := utilstrings.ShortenString(id.uuid, 8) switch reason { case "Created": r.recorder.Eventf(ref, v1.EventTypeNormal, events.CreatedContainer, "Created with rkt id %v", uuid) case "Started": r.recorder.Eventf(ref, v1.EventTypeNormal, events.StartedContainer, "Started with rkt id %v", uuid) case "Failed": r.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToStartContainer, "Failed to start with rkt id %v with error %v", uuid, failure) case "Killing": r.recorder.Eventf(ref, v1.EventTypeNormal, events.KillingContainer, "Killing with rkt id %v", uuid) default: glog.Errorf("rkt: Unexpected event %q", reason) } } return } // Generate a Network Namespace based on a New UUID // to run the Pod and all of its containers inside a dedicated unique namespace func generateNetworkNamespaceUUID() kubecontainer.ContainerID { return kubecontainer.ContainerID{ID: fmt.Sprintf("%s%s", kubernetesUnitPrefix, uuid.NewUUID())} } func netnsPathFromName(netnsName string) string { return fmt.Sprintf("/var/run/netns/%s", netnsName) } // setupPodNetwork creates a network namespace for the given pod and calls // configured NetworkPlugin's setup function on it. // It returns the namespace name, configured IP (if available), and an error if // one occurred. // // If the pod is running in host network or is running using the no-op plugin, then nothing will be done. 
func (r *Runtime) setupPodNetwork(pod *v1.Pod) (kubecontainer.ContainerID, string, error) { glog.V(3).Infof("Calling network plugin %s to set up pod for %s", r.network.PluginName(), format.Pod(pod)) var networkNamespace kubecontainer.ContainerID // No-op if the pod is not running in a created netns. if !r.shouldCreateNetns(pod) { return networkNamespace, "", nil } networkNamespace = generateNetworkNamespaceUUID() glog.V(5).Infof("New network namespace %q generated for pod %s", networkNamespace.ID, format.Pod(pod)) // Create the network namespace for the pod _, err := r.execer.Command("ip", "netns", "add", networkNamespace.ID).Output() if err != nil { return networkNamespace, "", fmt.Errorf("failed to create pod network namespace: %v", err) } // Set up networking with the network plugin err = r.network.SetUpPod(pod.Namespace, pod.Name, networkNamespace, pod.Annotations) if err != nil { return networkNamespace, "", err } status, err := r.network.GetPodNetworkStatus(pod.Namespace, pod.Name, networkNamespace) if err != nil { return networkNamespace, "", err } if r.configureHairpinMode { if err = hairpin.SetUpContainerPath(netnsPathFromName(networkNamespace.ID), network.DefaultInterfaceName); err != nil { glog.Warningf("Hairpin setup failed for pod %q: %v", format.Pod(pod), err) } } return networkNamespace, status.IP.String(), nil } // For a hostPath volume: rkt doesn't create any missing volume on the node/host so we need to create it func createHostPathVolumes(pod *v1.Pod) (err error) { for _, v := range pod.Spec.Volumes { if v.VolumeSource.HostPath != nil { _, err = os.Stat(v.HostPath.Path) if os.IsNotExist(err) { if err = os.MkdirAll(v.HostPath.Path, os.ModePerm); err != nil { glog.Errorf("Create volume HostPath %q for Pod %q failed: %q", v.HostPath.Path, format.Pod(pod), err.Error()) return err } glog.V(4).Infof("Created volume HostPath %q for Pod %q", v.HostPath.Path, format.Pod(pod)) } } } return nil } // RunPod first creates the unit file for a pod, and then // 
starts the unit over d-bus. func (r *Runtime) RunPod(pod *v1.Pod, pullSecrets []v1.Secret) error { glog.V(4).Infof("Rkt starts to run pod: name %q.", format.Pod(pod)) var err error var networkNamespace kubecontainer.ContainerID var podIP string err = createHostPathVolumes(pod) if err != nil { return err } networkNamespace, podIP, err = r.setupPodNetwork(pod) if err != nil { r.cleanupPodNetwork(pod, networkNamespace) return err } name, runtimePod, prepareErr := r.preparePod(pod, podIP, pullSecrets, networkNamespace.ID) // Set container references and generate events. // If preparedPod fails, then send out 'failed' events for each container. // Otherwise, store the container references so we can use them later to send events. for i, c := range pod.Spec.Containers { ref, err := kubecontainer.GenerateContainerRef(pod, &c) if err != nil { glog.Errorf("Couldn't make a ref to pod %q, container %v: '%v'", format.Pod(pod), c.Name, err) continue } if prepareErr != nil { r.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToCreateContainer, "Failed to create rkt container with error: %v", prepareErr) continue } containerID := runtimePod.Containers[i].ID r.containerRefManager.SetRef(containerID, ref) } if prepareErr != nil { r.cleanupPodNetwork(pod, networkNamespace) return prepareErr } r.generateEvents(runtimePod, "Created", nil) // RestartUnit has the same effect as StartUnit if the unit is not running, besides it can restart // a unit if the unit file is changed and reloaded. 
reschan := make(chan string) _, err = r.systemd.RestartUnit(name, "replace", reschan) if err != nil { r.generateEvents(runtimePod, "Failed", err) r.cleanupPodNetwork(pod, networkNamespace) return err } res := <-reschan if res != "done" { err := fmt.Errorf("Failed to restart unit %q: %s", name, res) r.generateEvents(runtimePod, "Failed", err) r.cleanupPodNetwork(pod, networkNamespace) return err } r.generateEvents(runtimePod, "Started", nil) // This is a temporary solution until we have a clean design on how // kubelet handles events. See https://github.com/kubernetes/kubernetes/issues/23084. if err := r.runLifecycleHooks(pod, runtimePod, lifecyclePostStartHook); err != nil { if errKill := r.KillPod(pod, *runtimePod, nil); errKill != nil { return errors.NewAggregate([]error{err, errKill}) } r.cleanupPodNetwork(pod, networkNamespace) return err } return nil } func (r *Runtime) runPreStopHook(containerID kubecontainer.ContainerID, pod *v1.Pod, container *v1.Container) error { glog.V(4).Infof("rkt: Running pre-stop hook for container %q of pod %q", container.Name, format.Pod(pod)) msg, err := r.runner.Run(containerID, pod, container, container.Lifecycle.PreStop) if err != nil { ref, ok := r.containerRefManager.GetRef(containerID) if !ok { glog.Warningf("No ref for container %q", containerID) } else { r.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedPreStopHook, msg) } } return err } func (r *Runtime) runPostStartHook(containerID kubecontainer.ContainerID, pod *v1.Pod, container *v1.Container) error { glog.V(4).Infof("rkt: Running post-start hook for container %q of pod %q", container.Name, format.Pod(pod)) cid, err := parseContainerID(containerID) if err != nil { return fmt.Errorf("cannot parse container ID %v", containerID) } isContainerRunning := func() (done bool, err error) { ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout) defer cancel() resp, err := r.apisvc.InspectPod(ctx, &rktapi.InspectPodRequest{Id: cid.uuid}) if err != nil { 
return false, fmt.Errorf("failed to inspect rkt pod %q for pod %q", cid.uuid, format.Pod(pod)) } for _, app := range resp.Pod.Apps { if app.Name == cid.appName { return app.State == rktapi.AppState_APP_STATE_RUNNING, nil } } return false, fmt.Errorf("failed to find container %q in rkt pod %q", cid.appName, cid.uuid) } // TODO(yifan): Polling the pod's state for now. timeout := time.Second * 5 pollInterval := time.Millisecond * 500 if err := utilwait.Poll(pollInterval, timeout, isContainerRunning); err != nil { return fmt.Errorf("rkt: Pod %q doesn't become running in %v: %v", format.Pod(pod), timeout, err) } msg, err := r.runner.Run(containerID, pod, container, container.Lifecycle.PostStart) if err != nil { ref, ok := r.containerRefManager.GetRef(containerID) if !ok { glog.Warningf("No ref for container %q", containerID) } else { r.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedPostStartHook, msg) } } return err } type lifecycleHookType string const ( lifecyclePostStartHook lifecycleHookType = "post-start" lifecyclePreStopHook lifecycleHookType = "pre-stop" ) func (r *Runtime) runLifecycleHooks(pod *v1.Pod, runtimePod *kubecontainer.Pod, typ lifecycleHookType) error { var wg sync.WaitGroup var errlist []error errCh := make(chan error, len(pod.Spec.Containers)) wg.Add(len(pod.Spec.Containers)) for i, c := range pod.Spec.Containers { var hookFunc func(kubecontainer.ContainerID, *v1.Pod, *v1.Container) error switch typ { case lifecyclePostStartHook: if c.Lifecycle != nil && c.Lifecycle.PostStart != nil { hookFunc = r.runPostStartHook } case lifecyclePreStopHook: if c.Lifecycle != nil && c.Lifecycle.PreStop != nil { hookFunc = r.runPreStopHook } default: errCh <- fmt.Errorf("Unrecognized lifecycle hook type %q for container %q in pod %q", typ, c.Name, format.Pod(pod)) } if hookFunc == nil { wg.Done() continue } container := &pod.Spec.Containers[i] runtimeContainer := runtimePod.FindContainerByName(container.Name) if runtimeContainer == nil { // Container already 
gone. wg.Done() continue } containerID := runtimeContainer.ID go func() { defer wg.Done() if err := hookFunc(containerID, pod, container); err != nil { glog.Errorf("rkt: Failed to run %s hook for container %q of pod %q: %v", typ, container.Name, format.Pod(pod), err) errCh <- err } else { glog.V(4).Infof("rkt: %s hook completed successfully for container %q of pod %q", typ, container.Name, format.Pod(pod)) } }() } wg.Wait() close(errCh) for err := range errCh { errlist = append(errlist, err) } return errors.NewAggregate(errlist) } // convertRktPod will convert a rktapi.Pod to a kubecontainer.Pod func (r *Runtime) convertRktPod(rktpod *rktapi.Pod) (*kubecontainer.Pod, error) { manifest := &appcschema.PodManifest{} err := json.Unmarshal(rktpod.Manifest, manifest) if err != nil { return nil, err } podUID, ok := manifest.Annotations.Get(types.KubernetesPodUIDLabel) if !ok { return nil, fmt.Errorf("pod is missing annotation %s", types.KubernetesPodUIDLabel) } podName, ok := manifest.Annotations.Get(types.KubernetesPodNameLabel) if !ok { return nil, fmt.Errorf("pod is missing annotation %s", types.KubernetesPodNameLabel) } podNamespace, ok := manifest.Annotations.Get(types.KubernetesPodNamespaceLabel) if !ok { return nil, fmt.Errorf("pod is missing annotation %s", types.KubernetesPodNamespaceLabel) } kubepod := &kubecontainer.Pod{ ID: kubetypes.UID(podUID), Name: podName, Namespace: podNamespace, } for i, app := range rktpod.Apps { // The order of the apps is determined by the rkt pod manifest. // TODO(yifan): Let the server to unmarshal the annotations? 
https://github.com/coreos/rkt/issues/1872 hashStr, ok := manifest.Apps[i].Annotations.Get(k8sRktContainerHashAnno) if !ok { return nil, fmt.Errorf("app %q is missing annotation %s", app.Name, k8sRktContainerHashAnno) } containerHash, err := strconv.ParseUint(hashStr, 10, 64) if err != nil { return nil, fmt.Errorf("couldn't parse container's hash %q: %v", hashStr, err) } kubepod.Containers = append(kubepod.Containers, &kubecontainer.Container{ ID: buildContainerID(&containerID{rktpod.Id, app.Name}), Name: app.Name, // By default, the version returned by rkt API service will be "latest" if not specified. Image: fmt.Sprintf("%s:%s", app.Image.Name, app.Image.Version), ImageID: app.Image.Id, Hash: containerHash, State: appStateToContainerState(app.State), }) } return kubepod, nil } // GetPods runs 'rkt list' to get the list of rkt pods. // Then it will use the result to construct a list of container runtime pods. // If all is false, then only running pods will be returned, otherwise all pods will be // returned. func (r *Runtime) GetPods(all bool) ([]*kubecontainer.Pod, error) { glog.V(4).Infof("Rkt getting pods") listReq := &rktapi.ListPodsRequest{ Detail: true, Filters: []*rktapi.PodFilter{ { Annotations: []*rktapi.KeyValue{ { Key: k8sRktKubeletAnno, Value: k8sRktKubeletAnnoValue, }, }, }, }, } if !all { listReq.Filters[0].States = []rktapi.PodState{rktapi.PodState_POD_STATE_RUNNING} } ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout) defer cancel() listResp, err := r.apisvc.ListPods(ctx, listReq) if err != nil { return nil, fmt.Errorf("couldn't list pods: %v", err) } pods := make(map[kubetypes.UID]*kubecontainer.Pod) var podIDs []kubetypes.UID for _, pod := range listResp.Pods { pod, err := r.convertRktPod(pod) if err != nil { glog.Warningf("rkt: Cannot construct pod from unit file: %v.", err) continue } // Group pods together. 
oldPod, found := pods[pod.ID] if !found { pods[pod.ID] = pod podIDs = append(podIDs, pod.ID) continue } oldPod.Containers = append(oldPod.Containers, pod.Containers...) } // Convert map to list, using the consistent order from the podIDs array. var result []*kubecontainer.Pod for _, id := range podIDs { result = append(result, pods[id]) } return result, nil } func getPodTerminationGracePeriodInSecond(pod *v1.Pod) int64 { var gracePeriod int64 switch { case pod.DeletionGracePeriodSeconds != nil: gracePeriod = *pod.DeletionGracePeriodSeconds case pod.Spec.TerminationGracePeriodSeconds != nil: gracePeriod = *pod.Spec.TerminationGracePeriodSeconds } if gracePeriod < minimumGracePeriodInSeconds { gracePeriod = minimumGracePeriodInSeconds } return gracePeriod } func (r *Runtime) waitPreStopHooks(pod *v1.Pod, runningPod *kubecontainer.Pod) { gracePeriod := getPodTerminationGracePeriodInSecond(pod) done := make(chan struct{}) go func() { if err := r.runLifecycleHooks(pod, runningPod, lifecyclePreStopHook); err != nil { glog.Errorf("rkt: Some pre-stop hooks failed for pod %q: %v", format.Pod(pod), err) } close(done) }() select { case <-time.After(time.Duration(gracePeriod) * time.Second): glog.V(2).Infof("rkt: Some pre-stop hooks did not complete in %d seconds for pod %q", gracePeriod, format.Pod(pod)) case <-done: } } // KillPod invokes 'systemctl kill' to kill the unit that runs the pod. 
// TODO: add support for gracePeriodOverride which is used in eviction scenarios func (r *Runtime) KillPod(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) error { glog.V(4).Infof("Rkt is killing pod: name %q.", runningPod.Name) if len(runningPod.Containers) == 0 { glog.V(4).Infof("rkt: Pod %q is already being killed, no action will be taken", runningPod.Name) return nil } if pod != nil { r.waitPreStopHooks(pod, &runningPod) } containerID, err := parseContainerID(runningPod.Containers[0].ID) if err != nil { glog.Errorf("rkt: Failed to get rkt uuid of the pod %q: %v", runningPod.Name, err) return err } serviceName := makePodServiceFileName(containerID.uuid) serviceFile := serviceFilePath(serviceName) r.generateEvents(&runningPod, "Killing", nil) for _, c := range runningPod.Containers { r.containerRefManager.ClearRef(c.ID) } // Since all service file have 'KillMode=mixed', the processes in // the unit's cgroup will receive a SIGKILL if the normal stop timeouts. reschan := make(chan string) if _, err = r.systemd.StopUnit(serviceName, "replace", reschan); err != nil { glog.Errorf("rkt: Failed to stop unit %q: %v", serviceName, err) return err } res := <-reschan if res != "done" { err := fmt.Errorf("invalid result: %s", res) glog.Errorf("rkt: Failed to stop unit %q: %v", serviceName, err) return err } // Clean up networking. Use the service file to get pod details since 'pod' can be nil. 
if err := r.cleanupPodNetworkFromServiceFile(serviceFile); err != nil { glog.Errorf("rkt: failed to tear down network for unit %q: %v", serviceName, err) return err } return nil } func (r *Runtime) Type() string { return RktType } func (r *Runtime) Version() (kubecontainer.Version, error) { r.versions.RLock() defer r.versions.RUnlock() return r.versions.binVersion, nil } func (r *Runtime) APIVersion() (kubecontainer.Version, error) { r.versions.RLock() defer r.versions.RUnlock() return r.versions.apiVersion, nil } // Status returns error if rkt is unhealthy, nil otherwise. func (r *Runtime) Status() (*kubecontainer.RuntimeStatus, error) { return nil, r.checkVersion(minimumRktBinVersion, minimumRktApiVersion, minimumSystemdVersion) } // SyncPod syncs the running pod to match the specified desired pod. func (r *Runtime) SyncPod(pod *v1.Pod, _ v1.PodStatus, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) { var err error defer func() { if err != nil { result.Fail(err) } }() // TODO: (random-liu) Stop using running pod in SyncPod() runningPod := kubecontainer.ConvertPodStatusToRunningPod(r.Type(), podStatus) // Add references to all containers. unidentifiedContainers := make(map[kubecontainer.ContainerID]*kubecontainer.Container) for _, c := range runningPod.Containers { unidentifiedContainers[c.ID] = c } restartPod := false for _, container := range pod.Spec.Containers { expectedHash := kubecontainer.HashContainerLegacy(&container) c := runningPod.FindContainerByName(container.Name) if c == nil { if kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) { glog.V(3).Infof("Container %+v is dead, but RestartPolicy says that we should restart it.", container) // TODO(yifan): Containers in one pod are fate-sharing at this moment, see: // https://github.com/appc/spec/issues/276. restartPod = true break } continue } // TODO: check for non-root image directives. 
See ../docker/manager.go#SyncPod // TODO(yifan): Take care of host network change. containerChanged := c.Hash != 0 && c.Hash != expectedHash if containerChanged { glog.Infof("Pod %q container %q hash changed (%d vs %d), it will be killed and re-created.", format.Pod(pod), container.Name, c.Hash, expectedHash) restartPod = true break } liveness, found := r.livenessManager.Get(c.ID) if found && liveness != proberesults.Success && pod.Spec.RestartPolicy != v1.RestartPolicyNever { glog.Infof("Pod %q container %q is unhealthy, it will be killed and re-created.", format.Pod(pod), container.Name) restartPod = true break } delete(unidentifiedContainers, c.ID) } // If there is any unidentified containers, restart the pod. if len(unidentifiedContainers) > 0 { restartPod = true } if restartPod { // Kill the pod only if the pod is actually running. if len(runningPod.Containers) > 0 { if err = r.KillPod(pod, runningPod, nil); err != nil { return } } if err = r.RunPod(pod, pullSecrets); err != nil { return } } return } // Sort rkt pods by creation time. type podsByCreatedAt []*rktapi.Pod func (s podsByCreatedAt) Len() int { return len(s) } func (s podsByCreatedAt) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func (s podsByCreatedAt) Less(i, j int) bool { return s[i].CreatedAt < s[j].CreatedAt } // getPodUID returns the pod's API UID, it returns // empty UID if the UID cannot be determined. func getPodUID(pod *rktapi.Pod) kubetypes.UID { for _, anno := range pod.Annotations { if anno.Key == types.KubernetesPodUIDLabel { return kubetypes.UID(anno.Value) } } return kubetypes.UID("") } // podIsActive returns true if the pod is embryo, preparing or running. // If a pod is prepared, it is not guaranteed to be active (e.g. the systemd // service might fail). 
func podIsActive(pod *rktapi.Pod) bool {
	return pod.State == rktapi.PodState_POD_STATE_EMBRYO ||
		pod.State == rktapi.PodState_POD_STATE_PREPARING ||
		pod.State == rktapi.PodState_POD_STATE_RUNNING
}

// GetNetNS returns the network namespace path for the given container
func (r *Runtime) GetNetNS(containerID kubecontainer.ContainerID) (string, error) {
	// Currently the containerID is a UUID for a network namespace
	// This hack is a way to create an unique network namespace for each new starting/restarting Pod
	// We can do this because we played the same trick in
	// `networkPlugin.SetUpPod` and `networkPlugin.TearDownPod`.
	// See https://github.com/kubernetes/kubernetes/issues/45149
	return netnsPathFromName(containerID.ID), nil
}

// GetPodContainerID returns a ContainerID whose ID is the pod's UID.
// It never returns an error.
func (r *Runtime) GetPodContainerID(pod *kubecontainer.Pod) (kubecontainer.ContainerID, error) {
	return kubecontainer.ContainerID{ID: string(pod.ID)}, nil
}

// getKubernetesDirective parses the systemd service file at serviceFilePath
// and collects the pod UID, name, namespace, host-network flag and network
// namespace stored in its kubernetes section.
//
// It returns as soon as all five values have been seen and are non-empty; if
// the file ends before that, it returns a "failed to parse pod" error.
// NOTE(review): a unit whose network namespace value is empty can never
// satisfy that condition — confirm whether such units are expected here.
func (r *Runtime) getKubernetesDirective(serviceFilePath string) (podService podServiceDirective, err error) {
	f, err := os.Open(serviceFilePath)
	if err != nil {
		return podService, err
	}
	defer f.Close()

	opts, err := unit.Deserialize(f)
	if err != nil {
		return podService, err
	}

	var hostnetwork, networkNamespace string
	for _, o := range opts {
		// Only options from the kubernetes section are of interest.
		if o.Section != unitKubernetesSection {
			continue
		}
		switch o.Name {
		case unitPodUID:
			podService.id = o.Value
		case unitPodName:
			podService.name = o.Value
		case unitPodNamespace:
			podService.namespace = o.Value
		case unitPodHostNetwork:
			hostnetwork = o.Value
		case unitPodNetworkNamespace:
			networkNamespace = o.Value
		}

		// Checked after every option so parsing stops as soon as the
		// directive is complete.
		if podService.id != "" && podService.name != "" && podService.namespace != "" && hostnetwork != "" && networkNamespace != "" {
			podService.hostNetwork, err = strconv.ParseBool(hostnetwork)
			podService.networkNamespace = kubecontainer.ContainerID{ID: networkNamespace}
			if err != nil {
				return podService, err
			}
			return podService, nil
		}
	}

	return podService, fmt.Errorf("failed to parse pod from file %s", serviceFilePath)
}

// DeleteContainer is not implemented for rkt; it always returns an error.
func (r *Runtime) DeleteContainer(containerID kubecontainer.ContainerID) error {
	return fmt.Errorf("unimplemented")
}

// Collects all the systemd units for k8s Pods
//
// Reads systemdServiceDir and returns only the entries whose name starts with
// kubernetesUnitPrefix.
func (r *Runtime) getPodSystemdServiceFiles() ([]os.FileInfo, error) {
	// Get all the current units
	files, err := r.os.ReadDir(systemdServiceDir)
	if err != nil {
		glog.Errorf("rkt: Failed to read the systemd service directory: %v", err)
		return files, err
	}

	// Keep only k8s unit files
	// The filtered slice reuses the backing array of files, so files must not
	// be used after this loop.
	k8sSystemdServiceFiles := files[:0]
	for _, f := range files {
		if strings.HasPrefix(f.Name(), kubernetesUnitPrefix) {
			k8sSystemdServiceFiles = append(k8sSystemdServiceFiles, f)
		}
	}
	return k8sSystemdServiceFiles, err
}

// GarbageCollect collects the pods/containers.
// After one GC iteration:
// - The deleted pods will be removed.
// - If the number of containers exceeds gcPolicy.MaxContainers,
//   then containers whose ages are older than gcPolicy.minAge will
//   be removed.
//
// Failing to list the service files or the rkt pods aborts the run; failures
// while cleaning up individual pods are collected and returned as one
// aggregate error.
func (r *Runtime) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, _ bool) error {
	var errlist []error
	var totalInactiveContainers int
	var inactivePods []*rktapi.Pod
	var removeCandidates []*rktapi.Pod
	var allPods = map[string]*rktapi.Pod{}

	glog.V(4).Infof("rkt: Garbage collecting triggered with policy %v", gcPolicy)

	// GC all inactive systemd service files and pods.
	files, err := r.getPodSystemdServiceFiles()
	if err != nil {
		return err
	}

	ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout)
	defer cancel()
	resp, err := r.apisvc.ListPods(ctx, &rktapi.ListPodsRequest{Filters: kubernetesPodsFilters()})
	if err != nil {
		glog.Errorf("rkt: Failed to list pods: %v", err)
		return err
	}

	// Mark inactive pods.
	for _, pod := range resp.Pods {
		// Index every pod (active or not) so orphan service files can be
		// detected below.
		allPods[pod.Id] = pod
		if !podIsActive(pod) {
			uid := getPodUID(pod)
			if uid == kubetypes.UID("") {
				glog.Errorf("rkt: Cannot get the UID of pod %q, pod is broken, will remove it", pod.Id)
				removeCandidates = append(removeCandidates, pod)
				continue
			}
			// Deleted pods are removed only once all pod sources are ready.
			if r.podDeletionProvider.IsPodDeleted(uid) && allSourcesReady {
				removeCandidates = append(removeCandidates, pod)
				continue
			}

			inactivePods = append(inactivePods, pod)
			totalInactiveContainers = totalInactiveContainers + len(pod.Apps)
		}
	}

	// Remove any orphan service files.
	for _, f := range files {
		serviceName := f.Name()
		rktUUID := getRktUUIDFromServiceFileName(serviceName)
		if _, ok := allPods[rktUUID]; !ok {
			glog.V(4).Infof("rkt: No rkt pod found for service file %q, will remove it", serviceName)

			if err := r.cleanupByPodId(rktUUID); err != nil {
				errlist = append(errlist, fmt.Errorf("rkt: Failed to clean up rkt pod %q: %v", rktUUID, err))
			}
		}
	}

	// Sorted by creation time so that candidates are considered in that order.
	sort.Sort(podsByCreatedAt(inactivePods))

	// Enforce GCPolicy.MaxContainers.
	for _, pod := range inactivePods {
		if totalInactiveContainers <= gcPolicy.MaxContainers {
			break
		}
		// CreatedAt is interpreted as nanoseconds since the Unix epoch.
		creationTime := time.Unix(0, pod.CreatedAt)
		if creationTime.Add(gcPolicy.MinAge).Before(time.Now()) {
			// The pod is old and we are exceeding the MaxContainers limit.
			// Delete the pod.
			removeCandidates = append(removeCandidates, pod)
			totalInactiveContainers = totalInactiveContainers - len(pod.Apps)
		}
	}

	// Remove pods and their service files.
	for _, pod := range removeCandidates {
		if err := r.removePod(pod); err != nil {
			errlist = append(errlist, fmt.Errorf("rkt: Failed to clean up rkt pod %q: %v", pod.Id, err))
		}
	}

	return errors.NewAggregate(errlist)
}

// Read kubernetes pod UUID, namespace, netns and name from systemd service file and
// use that to clean up any pod network that may still exist.
func (r *Runtime) cleanupPodNetworkFromServiceFile(serviceFilePath string) error { podService, err := r.unitGetter.getKubernetesDirective(serviceFilePath) if err != nil { return err } return r.cleanupPodNetwork(&v1.Pod{ ObjectMeta: metav1.ObjectMeta{ UID: kubetypes.UID(podService.id), Name: podService.name, Namespace: podService.namespace, }, Spec: v1.PodSpec{ HostNetwork: podService.hostNetwork, }, }, podService.networkNamespace) } // Remove the touched file created by ExecStartPost in the systemd service file func (r *Runtime) removeFinishedMarkerFile(serviceName string) error { serviceFile := serviceFilePath(serviceName) podDetail, err := r.unitGetter.getKubernetesDirective(serviceFile) if err != nil { return err } podDir := r.runtimeHelper.GetPodDir(kubetypes.UID(podDetail.id)) finishedFile := podFinishedMarkerPath(podDir, getRktUUIDFromServiceFileName(serviceName)) return r.os.Remove(finishedFile) } // Iter over each container in the pod to delete its termination log file func (r *Runtime) removeTerminationFiles(pod *rktapi.Pod) (errlist []error) { // container == app for _, app := range pod.Apps { for _, annotation := range app.Annotations { if annotation.GetKey() == k8sRktTerminationMessagePathAnno { if err := r.os.Remove(annotation.GetValue()); err != nil { errlist = append(errlist, fmt.Errorf("rkt: Failed to remove for pod %q container file %v", pod.Id, err)) } } } } return errlist } func (r *Runtime) cleanupByPodId(podID string) (errlist []error) { serviceName := makePodServiceFileName(podID) serviceFile := serviceFilePath(serviceName) if err := r.cleanupPodNetworkFromServiceFile(serviceFile); err != nil { errlist = append(errlist, fmt.Errorf("rkt: Failed to clean up pod network from service %q: %v, the network may not be around already", serviceName, err)) } // GC finished marker, termination-log file, systemd service files as well. 
if err := r.systemd.ResetFailedUnit(serviceName); err != nil { errlist = append(errlist, fmt.Errorf("rkt: Failed to reset the failed systemd service %q: %v", serviceName, err)) } if err := r.removeFinishedMarkerFile(serviceName); err != nil { errlist = append(errlist, fmt.Errorf("rkt: Failed to remove finished file %q for unit %q: %v", serviceName, podID, err)) } if err := r.os.Remove(serviceFile); err != nil { errlist = append(errlist, fmt.Errorf("rkt: Failed to remove service file %q for pod %q: %v", serviceFile, podID, err)) } return errlist } // removePod calls 'rkt rm $UUID' to delete a rkt pod, // it also remove the systemd service file, // the finished-* marker and the termination-log files // related to the pod. func (r *Runtime) removePod(pod *rktapi.Pod) error { var errlist []error glog.V(4).Infof("rkt: GC is removing pod %q", pod) if err := r.cleanupByPodId(pod.Id); err != nil { errlist = append(errlist, fmt.Errorf("rkt: Failed to remove pod %q: %v", pod.Id, err)) } if err := r.removeTerminationFiles(pod); err != nil { errlist = append(errlist, fmt.Errorf("rkt: Failed to clean up pod TerminationMessageFile %q: %v", pod.Id, err)) } if _, err := r.cli.RunCommand(nil, "rm", pod.Id); err != nil { errlist = append(errlist, fmt.Errorf("rkt: Failed to remove pod %q: %v", pod.Id, err)) } return errors.NewAggregate(errlist) } // rktExitError implements /pkg/util/exec.ExitError interface. 
type rktExitError struct{ *exec.ExitError } var _ utilexec.ExitError = &rktExitError{} func (r *rktExitError) ExitStatus() int { if status, ok := r.Sys().(syscall.WaitStatus); ok { return status.ExitStatus() } return 0 } func newRktExitError(e error) error { if exitErr, ok := e.(*exec.ExitError); ok { return &rktExitError{exitErr} } return e } func (r *Runtime) AttachContainer(containerID kubecontainer.ContainerID, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize) error { return fmt.Errorf("unimplemented") } // Note: In rkt, the container ID is in the form of "UUID:appName", where UUID is // the rkt UUID, and appName is the container name. // TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail. func (r *Runtime) ExecInContainer(containerID kubecontainer.ContainerID, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error { glog.V(4).Infof("Rkt execing in container.") id, err := parseContainerID(containerID) if err != nil { return err } args := []string{"enter", fmt.Sprintf("--app=%s", id.appName), id.uuid} args = append(args, cmd...) command := buildCommand(r.config, args...) if tty { p, err := kubecontainer.StartPty(command) if err != nil { return err } defer p.Close() // make sure to close the stdout stream defer stdout.Close() kubecontainer.HandleResizing(resize, func(size remotecommand.TerminalSize) { term.SetSize(p.Fd(), size) }) if stdin != nil { go io.Copy(p, stdin) } if stdout != nil { go io.Copy(stdout, p) } return newRktExitError(command.Wait()) } if stdin != nil { // Use an os.Pipe here as it returns true *os.File objects. // This way, if you run 'kubectl exec -i bash' (no tty) and type 'exit', // the call below to command.Run() can unblock because its Stdin is the read half // of the pipe. 
r, w, err := r.os.Pipe() if err != nil { return newRktExitError(err) } go io.Copy(w, stdin) command.Stdin = r } if stdout != nil { command.Stdout = stdout } if stderr != nil { command.Stderr = stderr } return newRktExitError(command.Run()) } // PortForward executes socat in the pod's network namespace and copies // data between stream (representing the user's local connection on their // computer) and the specified port in the container. // // TODO: // - match cgroups of container // - should we support nsenter + socat on the host? (current impl) // - should we support nsenter + socat in a container, running with elevated privs and --pid=host? // // TODO(yifan): Merge with the same function in dockertools. func (r *Runtime) PortForward(pod *kubecontainer.Pod, port int32, stream io.ReadWriteCloser) error { glog.V(4).Infof("Rkt port forwarding in container.") ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout) defer cancel() listResp, err := r.apisvc.ListPods(ctx, &rktapi.ListPodsRequest{ Detail: true, Filters: runningKubernetesPodFilters(pod.ID), }) if err != nil { return fmt.Errorf("couldn't list pods: %v", err) } if len(listResp.Pods) != 1 { var podlist []string for _, p := range listResp.Pods { podlist = append(podlist, p.Id) } return fmt.Errorf("more than one running rkt pod for the kubernetes pod [%s]", strings.Join(podlist, ", ")) } listPod := listResp.Pods[0] socatPath, lookupErr := exec.LookPath("socat") if lookupErr != nil { return fmt.Errorf("unable to do port forwarding: socat not found.") } // Check in config and in annotations if we're running kvm flavor isKvm := strings.Contains(r.config.Stage1Image, "kvm") for _, anno := range listPod.Annotations { if anno.Key == k8sRktStage1NameAnno { isKvm = strings.Contains(anno.Value, "kvm") break } } var args []string var fwCaller string if isKvm { podNetworks := listPod.GetNetworks() if podNetworks == nil { return fmt.Errorf("unable to get networks") } args = []string{"-", 
fmt.Sprintf("TCP4:%s:%d", podNetworks[0].Ipv4, port)} fwCaller = socatPath } else { args = []string{"-t", fmt.Sprintf("%d", listPod.Pid), "-n", socatPath, "-", fmt.Sprintf("TCP4:localhost:%d", port)} nsenterPath, lookupErr := exec.LookPath("nsenter") if lookupErr != nil { return fmt.Errorf("unable to do port forwarding: nsenter not found") } fwCaller = nsenterPath } command := exec.Command(fwCaller, args...) command.Stdout = stream // If we use Stdin, command.Run() won't return until the goroutine that's copying // from stream finishes. Unfortunately, if you have a client like telnet connected // via port forwarding, as long as the user's telnet client is connected to the user's // local listener that port forwarding sets up, the telnet session never exits. This // means that even if socat has finished running, command.Run() won't ever return // (because the client still has the connection and stream open). // // The work around is to use StdinPipe(), as Wait() (called by Run()) closes the pipe // when the command (socat) exits. inPipe, err := command.StdinPipe() if err != nil { return fmt.Errorf("unable to do port forwarding: error creating stdin pipe: %v", err) } go func() { io.Copy(inPipe, stream) inPipe.Close() }() return command.Run() } // UpdatePodCIDR updates the runtimeconfig with the podCIDR. // Currently no-ops, just implemented to satisfy the cri. func (r *Runtime) UpdatePodCIDR(podCIDR string) error { return nil } // appStateToContainerState converts rktapi.AppState to kubecontainer.ContainerState. func appStateToContainerState(state rktapi.AppState) kubecontainer.ContainerState { switch state { case rktapi.AppState_APP_STATE_RUNNING: return kubecontainer.ContainerStateRunning case rktapi.AppState_APP_STATE_EXITED: return kubecontainer.ContainerStateExited } return kubecontainer.ContainerStateUnknown } // getPodInfo returns the pod manifest, creation time and restart count of the pod. 
func getPodInfo(pod *rktapi.Pod) (podManifest *appcschema.PodManifest, restartCount int, err error) { // TODO(yifan): The manifest is only used for getting the annotations. // Consider to let the server to unmarshal the annotations. var manifest appcschema.PodManifest if err = json.Unmarshal(pod.Manifest, &manifest); err != nil { return } if countString, ok := manifest.Annotations.Get(k8sRktRestartCountAnno); ok { restartCount, err = strconv.Atoi(countString) if err != nil { return } } return &manifest, restartCount, nil } // populateContainerStatus fills the container status according to the app's information. func populateContainerStatus(pod rktapi.Pod, app rktapi.App, runtimeApp appcschema.RuntimeApp, restartCount int, finishedTime time.Time) (*kubecontainer.ContainerStatus, error) { hashStr, ok := runtimeApp.Annotations.Get(k8sRktContainerHashAnno) if !ok { return nil, fmt.Errorf("No container hash in pod manifest") } hashNum, err := strconv.ParseUint(hashStr, 10, 64) if err != nil { return nil, err } var reason, message string if app.State == rktapi.AppState_APP_STATE_EXITED { if app.ExitCode == 0 { reason = "Completed" } else { reason = "Error" } } terminationMessagePath, ok := runtimeApp.Annotations.Get(k8sRktTerminationMessagePathAnno) if ok { if data, err := ioutil.ReadFile(terminationMessagePath); err != nil { message = fmt.Sprintf("Error on reading termination-log %s: %v", terminationMessagePath, err) } else { message = string(data) } } createdTime := time.Unix(0, pod.CreatedAt) startedTime := time.Unix(0, pod.StartedAt) return &kubecontainer.ContainerStatus{ ID: buildContainerID(&containerID{uuid: pod.Id, appName: app.Name}), Name: app.Name, State: appStateToContainerState(app.State), CreatedAt: createdTime, StartedAt: startedTime, FinishedAt: finishedTime, ExitCode: int(app.ExitCode), // By default, the version returned by rkt API service will be "latest" if not specified. 
Image: fmt.Sprintf("%s:%s", app.Image.Name, app.Image.Version), ImageID: "rkt://" + app.Image.Id, // TODO(yifan): Add the prefix only in v1.PodStatus. Hash: hashNum, // TODO(yifan): Note that now all apps share the same restart count, this might // change once apps don't share the same lifecycle. // See https://github.com/appc/spec/pull/547. RestartCount: restartCount, Reason: reason, Message: message, }, nil } // from a running systemd unit, return the network namespace of a Pod // this field is inside the X-Kubernetes directive func (r *Runtime) getNetworkNamespace(uid kubetypes.UID, latestPod *rktapi.Pod) (networkNamespace kubecontainer.ContainerID, err error) { serviceFiles, err := r.getPodSystemdServiceFiles() if err != nil { return networkNamespace, err } for _, f := range serviceFiles { fileName := f.Name() if latestPod.Id == getRktUUIDFromServiceFileName(fileName) { podService, err := r.unitGetter.getKubernetesDirective(serviceFilePath(fileName)) if err != nil { return networkNamespace, err } return podService.networkNamespace, nil } } return networkNamespace, fmt.Errorf("Pod %q containing rktPod %q haven't find a corresponding NetworkNamespace in %d systemd units", uid, latestPod.Id, len(serviceFiles)) } // GetPodStatus returns the status for a pod specified by a given UID, name, // and namespace. It will attempt to find pod's information via a request to // the rkt api server. // An error will be returned if the api server returns an error. If the api // server doesn't error, but doesn't provide meaningful information about the // pod, a status with no information (other than the passed in arguments) is // returned anyways. 
func (r *Runtime) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) { podStatus := &kubecontainer.PodStatus{ ID: uid, Name: name, Namespace: namespace, } ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout) defer cancel() listResp, err := r.apisvc.ListPods(ctx, &rktapi.ListPodsRequest{ Detail: true, Filters: kubernetesPodFilters(uid), }) if err != nil { return nil, fmt.Errorf("couldn't list pods: %v", err) } var latestPod *rktapi.Pod var latestRestartCount int = -1 // In this loop, we group all containers from all pods together, // also we try to find the latest pod, so we can fill other info of the pod below. for _, pod := range listResp.Pods { manifest, restartCount, err := getPodInfo(pod) if err != nil { glog.Warningf("rkt: Couldn't get necessary info from the rkt pod, (uuid %q): %v", pod.Id, err) continue } if restartCount > latestRestartCount { latestPod = pod latestRestartCount = restartCount } finishedTime := r.podFinishedAt(uid, pod.Id) for i, app := range pod.Apps { // The order of the apps is determined by the rkt pod manifest. cs, err := populateContainerStatus(*pod, *app, manifest.Apps[i], restartCount, finishedTime) if err != nil { glog.Warningf("rkt: Failed to populate container status(uuid %q, app %q): %v", pod.Id, app.Name, err) continue } podStatus.ContainerStatuses = append(podStatus.ContainerStatuses, cs) } } if latestPod == nil { glog.Warningf("No latestPod: rkt api-svc returns [%d]rktPods, cannot fill podStatus.IP", len(listResp.Pods)) return podStatus, nil } // If we are running no-op network plugin, then get the pod IP from the rkt pod status. 
if r.network.PluginName() == network.DefaultPluginName { for _, n := range latestPod.Networks { if n.Name == defaultNetworkName { podStatus.IP = n.Ipv4 break } } return podStatus, nil } networkNamespace, err := r.unitGetter.getNetworkNamespace(uid, latestPod) if err != nil { glog.Warningf("networkNamespace: %v", err) } status, err := r.network.GetPodNetworkStatus(namespace, name, networkNamespace) if err != nil { glog.Warningf("rkt: %v", err) } else if status != nil { // status can be nil when the pod is running on the host network, // in which case the pod IP will be populated by the upper layer. podStatus.IP = status.IP.String() } return podStatus, nil } // getOSReleaseInfo reads /etc/os-release and returns a map // that contains the key value pairs in that file. func getOSReleaseInfo() (map[string]string, error) { result := make(map[string]string) path := "/etc/os-release" f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() scanner := bufio.NewScanner(f) for scanner.Scan() { line := scanner.Text() if len(strings.TrimSpace(line)) == 0 { // Skips empty lines continue } info := strings.SplitN(line, "=", 2) if len(info) != 2 { glog.Warningf("Unexpected entry in os-release %q", line) continue } result[info[0]] = info[1] } if err := scanner.Err(); err != nil { return nil, err } return result, nil } // convertKubeMounts creates appc volumes and mount points according to the given mounts. // Only one volume will be created for every unique host path. // Only one mount point will be created for every unique container path. 
func convertKubeMounts(mounts []kubecontainer.Mount) ([]appctypes.Volume, []appctypes.MountPoint) { volumeMap := make(map[string]*appctypes.Volume) mountPointMap := make(map[string]*appctypes.MountPoint) for _, mnt := range mounts { readOnly := mnt.ReadOnly if _, existed := volumeMap[mnt.HostPath]; !existed { volumeMap[mnt.HostPath] = &appctypes.Volume{ Name: *appctypes.MustACName(string(uuid.NewUUID())), Kind: "host", Source: mnt.HostPath, ReadOnly: &readOnly, } } if _, existed := mountPointMap[mnt.ContainerPath]; existed { glog.Warningf("Multiple mount points with the same container path %v, ignore it", mnt) continue } mountPointMap[mnt.ContainerPath] = &appctypes.MountPoint{ Name: volumeMap[mnt.HostPath].Name, Path: mnt.ContainerPath, ReadOnly: readOnly, } } volumes := make([]appctypes.Volume, 0, len(volumeMap)) mountPoints := make([]appctypes.MountPoint, 0, len(mountPointMap)) for _, vol := range volumeMap { volumes = append(volumes, *vol) } for _, mnt := range mountPointMap { mountPoints = append(mountPoints, *mnt) } return volumes, mountPoints } // convertKubePortMappings creates appc container ports and host ports according to the given port mappings. // The container ports and host ports are mapped by PortMapping.Name. func convertKubePortMappings(portMappings []kubecontainer.PortMapping) ([]appctypes.Port, []appctypes.ExposedPort) { containerPorts := make([]appctypes.Port, 0, len(portMappings)) hostPorts := make([]appctypes.ExposedPort, 0, len(portMappings)) for _, p := range portMappings { // This matches the docker code's behaviour. 
if p.HostPort == 0 { continue } portName := convertToACName(p.Name) containerPorts = append(containerPorts, appctypes.Port{ Name: portName, Protocol: string(p.Protocol), Port: uint(p.ContainerPort), }) hostPorts = append(hostPorts, appctypes.ExposedPort{ Name: portName, HostPort: uint(p.HostPort), }) } return containerPorts, hostPorts } func newNoNewPrivilegesIsolator(v bool) (*appctypes.Isolator, error) { b := fmt.Sprintf(`{"name": "%s", "value": %t}`, appctypes.LinuxNoNewPrivilegesName, v) i := &appctypes.Isolator{ Name: appctypes.LinuxNoNewPrivilegesName, } if err := i.UnmarshalJSON([]byte(b)); err != nil { return nil, err } return i, nil }