From 7b2aef0d8120033f232e81204616f1d09df708cd Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Thu, 31 Mar 2022 18:59:33 +0530 Subject: [PATCH] util: add support for the nsenter add support to run rbd map and mount -t commands with the nsenter. complete design of pod/multus network is added here https://github.com/rook/rook/ blob/master/design/ceph/multus-network.md#csi-pods Signed-off-by: Madhu Rajanna --- .../templates/nodeplugin-daemonset.yaml | 1 + charts/ceph-csi-cephfs/values.yaml | 1 + charts/ceph-csi-rbd/values.yaml | 1 + .../cephfs/kubernetes/csi-cephfsplugin.yaml | 1 + examples/README.md | 36 ++++++++++ examples/csi-config-map-sample.yaml | 7 ++ internal/cephfs/mounter/fuse.go | 12 +++- internal/cephfs/mounter/kernel.go | 11 ++- internal/cephfs/nodeserver.go | 5 ++ internal/cephfs/store/volumeoptions.go | 2 + internal/rbd/nodeserver.go | 4 ++ internal/rbd/rbd_attach.go | 13 +++- internal/rbd/rbd_util.go | 2 + internal/util/cephcmds.go | 42 ++++++++++++ internal/util/csiconfig.go | 11 +++ internal/util/csiconfig_test.go | 67 +++++++++++++++++++ 16 files changed, 211 insertions(+), 5 deletions(-) diff --git a/charts/ceph-csi-cephfs/templates/nodeplugin-daemonset.yaml b/charts/ceph-csi-cephfs/templates/nodeplugin-daemonset.yaml index b2244d195..58e61e93d 100644 --- a/charts/ceph-csi-cephfs/templates/nodeplugin-daemonset.yaml +++ b/charts/ceph-csi-cephfs/templates/nodeplugin-daemonset.yaml @@ -31,6 +31,7 @@ spec: priorityClassName: {{ .Values.nodeplugin.priorityClassName }} {{- end }} hostNetwork: true + hostPID: true # to use e.g. 
Rook orchestrated cluster, and mons' FQDN is # resolved through k8s service, set dns policy to cluster first dnsPolicy: ClusterFirstWithHostNet diff --git a/charts/ceph-csi-cephfs/values.yaml b/charts/ceph-csi-cephfs/values.yaml index b8e902fac..73f396f30 100644 --- a/charts/ceph-csi-cephfs/values.yaml +++ b/charts/ceph-csi-cephfs/values.yaml @@ -27,6 +27,7 @@ serviceAccounts: # - "" # cephFS: # subvolumeGroup: "csi" +# netNamespaceFilePath: "{{ .kubeletDir }}/plugins/{{ .driverName }}/net" csiConfig: [] # Set logging level for csi containers. diff --git a/charts/ceph-csi-rbd/values.yaml b/charts/ceph-csi-rbd/values.yaml index a7bd91765..b6e6e55e4 100644 --- a/charts/ceph-csi-rbd/values.yaml +++ b/charts/ceph-csi-rbd/values.yaml @@ -25,6 +25,7 @@ serviceAccounts: # monitors: # - "" # - "" +# netNamespaceFilePath: "{{ .kubeletDir }}/plugins/{{ .driverName }}/net" csiConfig: [] # Configuration details of clusterID,PoolID and FscID mapping diff --git a/deploy/cephfs/kubernetes/csi-cephfsplugin.yaml b/deploy/cephfs/kubernetes/csi-cephfsplugin.yaml index 5e1d0f9d6..e8e7f1cf2 100644 --- a/deploy/cephfs/kubernetes/csi-cephfsplugin.yaml +++ b/deploy/cephfs/kubernetes/csi-cephfsplugin.yaml @@ -15,6 +15,7 @@ spec: serviceAccountName: cephfs-csi-nodeplugin priorityClassName: system-node-critical hostNetwork: true + hostPID: true # to use e.g. Rook orchestrated cluster, and mons' FQDN is # resolved through k8s service, set dns policy to cluster first dnsPolicy: ClusterFirstWithHostNet diff --git a/examples/README.md b/examples/README.md index 674e26c57..fe14b3435 100644 --- a/examples/README.md +++ b/examples/README.md @@ -50,6 +50,42 @@ kubectl replace -f ./csi-config-map-sample.yaml Storage class and snapshot class, using `` as the value for the option `clusterID`, can now be created on the cluster. 
+## Running CephCSI with pod networking
+
+The current problem with Pod Networking is that when a CephFS/RBD volume is
+mounted in a pod using Ceph CSI and then the CSI CephFS/RBD plugin is restarted
+or terminated (e.g. by restarting or deleting its DaemonSet), all operations on
+the volume become blocked, even after restarting the CSI pods.
+
+The only workaround is to restart the node where the Ceph CSI plugin pod was
+restarted. This can be mitigated by running the `rbd map`/`mount -t` commands
+in a different network namespace which does not get deleted when the CSI
+CephFS/RBD plugin is restarted or terminated.
+
+If someone wants to run the CephCSI with the pod networking they can still do
+so by setting the `netNamespaceFilePath`. If this path is set, CephCSI will
+execute the `rbd map`/`mount -t` commands after entering the [network
+namespace](https://man7.org/linux/man-pages/man7/network_namespaces.7.html)
+specified by `netNamespaceFilePath` with the
+[nsenter](https://man7.org/linux/man-pages/man1/nsenter.1.html) command.
+
+`netNamespaceFilePath` should point to the network namespace of some
+long-running process; typically it would be a symlink to
+`/proc/<pid>/ns/net`.
+
+The long-running process can also be another pod which is a DaemonSet which
+never restarts. This Pod should only be stopped and restarted when a node is
+stopped so that volume operations do not become blocked. The new DaemonSet pod
+can contain a single container, responsible for holding its pod network alive.
+It is used as a passthrough by the CephCSI plugin pod which when mounting or
+mapping will use the network namespace of this pod.
+
+Once the pod is created, get its PID and create a symlink to
+`/proc/<pid>/ns/net` in the hostPath volume shared with the csi-plugin pod and
+specify the path in the `netNamespaceFilePath` option.
+
+*Note*: This Pod should have `hostPID: true` in the Pod Spec.
+ ## Deploying the storage class Once the plugin is successfully deployed, you'll need to customize diff --git a/examples/csi-config-map-sample.yaml b/examples/csi-config-map-sample.yaml index 581fbd108..eddab8f9a 100644 --- a/examples/csi-config-map-sample.yaml +++ b/examples/csi-config-map-sample.yaml @@ -20,6 +20,12 @@ kind: ConfigMap # NOTE: Make sure you don't add radosNamespace option to a currently in use # configuration as it will cause issues. # The field "cephFS.subvolumeGroup" is optional and defaults to "csi". +# The fields are the various network namespace +# path for the Ceph cluster identified by the , This will be used +# by the CSI plugin to execute the rbd map/unmap and mount -t commands in the +# network namespace specified by the . +# If a CSI plugin is using more than one Ceph cluster, repeat the section for +# each such cluster in use. # NOTE: Changes to the configmap is automatically updated in the running pods, # thus restarting existing pods using the configmap is NOT required on edits # to the configmap. @@ -37,6 +43,7 @@ data: { "clusterID": "", "radosNamespace": "", + "netNamespaceFilePath": "/plugins/rbd.csi.ceph.com/net", "monitors": [ "", "", diff --git a/internal/cephfs/mounter/fuse.go b/internal/cephfs/mounter/fuse.go index 91077398b..2711db5ee 100644 --- a/internal/cephfs/mounter/fuse.go +++ b/internal/cephfs/mounter/fuse.go @@ -65,12 +65,20 @@ func mountFuse(ctx context.Context, mountPoint string, cr *util.Credentials, vol if volOptions.FsName != "" { args = append(args, "--client_mds_namespace="+volOptions.FsName) } + var ( + stderr string + err error + ) + + if volOptions.NetNamespaceFilePath != "" { + _, stderr, err = util.ExecuteCommandWithNSEnter(ctx, volOptions.NetNamespaceFilePath, "ceph-fuse", args[:]...) + } else { + _, stderr, err = util.ExecCommand(ctx, "ceph-fuse", args[:]...) + } - _, stderr, err := util.ExecCommand(ctx, "ceph-fuse", args[:]...) 
if err != nil { return fmt.Errorf("%w stderr: %s", err, stderr) } - // Parse the output: // We need "starting fuse" meaning the mount is ok // and PID of the ceph-fuse daemon for unmount diff --git a/internal/cephfs/mounter/kernel.go b/internal/cephfs/mounter/kernel.go index ff515be28..b7ae67ff8 100644 --- a/internal/cephfs/mounter/kernel.go +++ b/internal/cephfs/mounter/kernel.go @@ -51,7 +51,16 @@ func mountKernel(ctx context.Context, mountPoint string, cr *util.Credentials, v args = append(args, "-o", optionsStr) - _, stderr, err := util.ExecCommand(ctx, "mount", args[:]...) + var ( + stderr string + err error + ) + + if volOptions.NetNamespaceFilePath != "" { + _, stderr, err = util.ExecuteCommandWithNSEnter(ctx, volOptions.NetNamespaceFilePath, "mount", args[:]...) + } else { + _, stderr, err = util.ExecCommand(ctx, "mount", args[:]...) + } if err != nil { return fmt.Errorf("%w stderr: %s", err, stderr) } diff --git a/internal/cephfs/nodeserver.go b/internal/cephfs/nodeserver.go index 21457ff38..b7c10db55 100644 --- a/internal/cephfs/nodeserver.go +++ b/internal/cephfs/nodeserver.go @@ -126,6 +126,11 @@ func (ns *NodeServer) NodeStageVolume( } defer volOptions.Destroy() + volOptions.NetNamespaceFilePath, err = util.GetNetNamespaceFilePath(util.CsiConfigFile, volOptions.ClusterID) + if err != nil { + return nil, status.Error(codes.Internal, err.Error()) + } + mnt, err := mounter.New(volOptions) if err != nil { log.ErrorLog(ctx, "failed to create mounter for volume %s: %v", volID, err) diff --git a/internal/cephfs/store/volumeoptions.go b/internal/cephfs/store/volumeoptions.go index 44706f61f..0f1a35e3b 100644 --- a/internal/cephfs/store/volumeoptions.go +++ b/internal/cephfs/store/volumeoptions.go @@ -50,6 +50,8 @@ type VolumeOptions struct { ProvisionVolume bool `json:"provisionVolume"` KernelMountOptions string `json:"kernelMountOptions"` FuseMountOptions string `json:"fuseMountOptions"` + // Network namespace file path to execute nsenter command + 
NetNamespaceFilePath string // conn is a connection to the Ceph cluster obtained from a ConnPool conn *util.ClusterConnection diff --git a/internal/rbd/nodeserver.go b/internal/rbd/nodeserver.go index 143a31ef7..579f2c211 100644 --- a/internal/rbd/nodeserver.go +++ b/internal/rbd/nodeserver.go @@ -332,6 +332,10 @@ func (ns *NodeServer) NodeStageVolume( } defer rv.Destroy() + rv.NetNamespaceFilePath, err = util.GetNetNamespaceFilePath(util.CsiConfigFile, rv.ClusterID) + if err != nil { + return nil, status.Error(codes.Internal, err.Error()) + } if isHealer { err = healerStageTransaction(ctx, cr, rv, stagingParentPath) if err != nil { diff --git a/internal/rbd/rbd_attach.go b/internal/rbd/rbd_attach.go index 1c872ba44..e2cddef50 100644 --- a/internal/rbd/rbd_attach.go +++ b/internal/rbd/rbd_attach.go @@ -457,8 +457,17 @@ func createPath(ctx context.Context, volOpt *rbdVolume, device string, cr *util. mapArgs = append(mapArgs, "--read-only") } - // Execute map - stdout, stderr, err := util.ExecCommand(ctx, cli, mapArgs...) + var ( + stdout string + stderr string + err error + ) + + if volOpt.NetNamespaceFilePath != "" { + stdout, stderr, err = util.ExecuteCommandWithNSEnter(ctx, volOpt.NetNamespaceFilePath, cli, mapArgs...) + } else { + stdout, stderr, err = util.ExecCommand(ctx, cli, mapArgs...) + } if err != nil { log.WarningLog(ctx, "rbd: map error %v, rbd output: %s", err, stderr) // unmap rbd image if connection timeout diff --git a/internal/rbd/rbd_util.go b/internal/rbd/rbd_util.go index 3afe68962..4d4986ae2 100644 --- a/internal/rbd/rbd_util.go +++ b/internal/rbd/rbd_util.go @@ -156,6 +156,8 @@ type rbdVolume struct { LogStrategy string VolName string MonValueFromSecret string + // Network namespace file path to execute nsenter command + NetNamespaceFilePath string // RequestedVolSize has the size of the volume requested by the user and // this value will not be updated when doing getImageInfo() on rbdVolume. 
RequestedVolSize int64 diff --git a/internal/util/cephcmds.go b/internal/util/cephcmds.go index 66c32b196..71999e98d 100644 --- a/internal/util/cephcmds.go +++ b/internal/util/cephcmds.go @@ -21,6 +21,7 @@ import ( "context" "errors" "fmt" + "os" "os/exec" "time" @@ -32,6 +33,47 @@ import ( // InvalidPoolID used to denote an invalid pool. const InvalidPoolID int64 = -1 +// ExecuteCommandWithNSEnter executes passed in program with args with nsenter +// and returns separate stdout and stderr streams. In case ctx is not set to +// context.TODO(), the command will be logged after it was executed. +func ExecuteCommandWithNSEnter(ctx context.Context, netPath, program string, args ...string) (string, string, error) { + var ( + sanitizedArgs = StripSecretInArgs(args) + stdoutBuf bytes.Buffer + stderrBuf bytes.Buffer + ) + + // check netPath exists + if _, err := os.Stat(netPath); err != nil { + return "", "", fmt.Errorf("failed to get stat for %s %w", netPath, err) + } + // nsenter --net=%s -- + args = append([]string{fmt.Sprintf("--net=%s", netPath), "--", program}, args...) + cmd := exec.Command("nsenter", args...) // #nosec:G204, commands executing not vulnerable. + + cmd.Stdout = &stdoutBuf + cmd.Stderr = &stderrBuf + + err := cmd.Run() + stdout := stdoutBuf.String() + stderr := stderrBuf.String() + + if err != nil { + err = fmt.Errorf("an error (%w) occurred while running %s args: %v", err, program, sanitizedArgs) + if ctx != context.TODO() { + log.UsefulLog(ctx, "%s", err) + } + + return stdout, stderr, err + } + + if ctx != context.TODO() { + log.UsefulLog(ctx, "command succeeded: %s %v", program, sanitizedArgs) + } + + return stdout, stderr, nil +} + // ExecCommand executes passed in program with args and returns separate stdout // and stderr streams. In case ctx is not set to context.TODO(), the command // will be logged after it was executed. 
diff --git a/internal/util/csiconfig.go b/internal/util/csiconfig.go index 97076cde6..a6e3dd534 100644 --- a/internal/util/csiconfig.go +++ b/internal/util/csiconfig.go @@ -49,6 +49,8 @@ type ClusterInfo struct { // SubvolumeGroup contains the name of the SubvolumeGroup for CSI volumes SubvolumeGroup string `json:"subvolumeGroup"` } `json:"cephFS"` + // symlink filepath for the network namespace where we need to execute commands. + NetNamespaceFilePath string `json:"netNamespaceFilePath"` } // Expected JSON structure in the passed in config file is, @@ -161,3 +163,12 @@ func GetClusterID(options map[string]string) (string, error) { return clusterID, nil } + +func GetNetNamespaceFilePath(pathToConfig, clusterID string) (string, error) { + cluster, err := readClusterInfo(pathToConfig, clusterID) + if err != nil { + return "", err + } + + return cluster.NetNamespaceFilePath, nil +} diff --git a/internal/util/csiconfig_test.go b/internal/util/csiconfig_test.go index fdcd24038..22ceadef2 100644 --- a/internal/util/csiconfig_test.go +++ b/internal/util/csiconfig_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package util import ( + "encoding/json" "os" "testing" ) @@ -138,3 +139,69 @@ func TestCSIConfig(t *testing.T) { t.Errorf("Test setup error %s", err) } } + +func TestGetNetNamespaceFilePath(t *testing.T) { + t.Parallel() + tests := []struct { + name string + clusterID string + want string + }{ + { + name: "get NetNamespaceFilePath for cluster-1", + clusterID: "cluster-1", + want: "/var/lib/kubelet/plugins/rbd.ceph.csi.com/cluster1-net", + }, + { + name: "get NetNamespaceFilePath for cluster-2", + clusterID: "cluster-2", + want: "/var/lib/kubelet/plugins/rbd.ceph.csi.com/cluster2-net", + }, + { + name: "when NetNamespaceFilePath is empty", + clusterID: "cluster-3", + want: "", + }, + } + + csiConfig := []ClusterInfo{ + { + ClusterID: "cluster-1", + Monitors: []string{"ip-1", "ip-2"}, + NetNamespaceFilePath: "/var/lib/kubelet/plugins/rbd.ceph.csi.com/cluster1-net", + }, + { + ClusterID: "cluster-2", + Monitors: []string{"ip-3", "ip-4"}, + NetNamespaceFilePath: "/var/lib/kubelet/plugins/rbd.ceph.csi.com/cluster2-net", + }, + { + ClusterID: "cluster-3", + Monitors: []string{"ip-5", "ip-6"}, + }, + } + csiConfigFileContent, err := json.Marshal(csiConfig) + if err != nil { + t.Errorf("failed to marshal csi config info %v", err) + } + tmpConfPath := t.TempDir() + "/ceph-csi.json" + err = os.WriteFile(tmpConfPath, csiConfigFileContent, 0o600) + if err != nil { + t.Errorf("failed to write %s file content: %v", CsiConfigFile, err) + } + for _, tt := range tests { + ts := tt + t.Run(ts.name, func(t *testing.T) { + t.Parallel() + got, err := GetNetNamespaceFilePath(tmpConfPath, ts.clusterID) + if err != nil { + t.Errorf("GetNetNamespaceFilePath() error = %v", err) + + return + } + if got != ts.want { + t.Errorf("GetNetNamespaceFilePath() = %v, want %v", got, ts.want) + } + }) + } +}