From 6ec1196f478ebc07dadc87c351b411f4e028de7f Mon Sep 17 00:00:00 2001 From: j-griffith Date: Wed, 13 Mar 2019 18:18:04 -0600 Subject: [PATCH] Rework multi-node-multi-writer feature This commit reverts the initial implementation of the multi-node-multi-writer feature: commit: b5b8e4646094d0ec1f3dfe96060a183c863d7d1a It replaces that implementation with a more restrictive version that only allows multi-node-multi-writer for volumes of type `block` With this change there are no volume parameters required in the stoarge class, we also fail any attempt to create a file based device with multi-node-multi-write being specified, this way a user doesn't have to wait until they try and do the publish before realizing it doesn't work. --- Makefile | 2 +- examples/README.md | 99 +++++++++++++++++++++++++++++++++++++ pkg/rbd/controllerserver.go | 20 +++++++- pkg/rbd/nodeserver.go | 15 +++++- pkg/rbd/rbd.go | 9 +++- pkg/rbd/rbd_attach.go | 5 ++ pkg/rbd/rbd_util.go | 7 ++- 7 files changed, 152 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 734a761b8..825ed8b66 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ go-test: ./scripts/test-go.sh static-check: - ./scripts/lint-go.sh + ./scripts/lint-go.sh ./scripts/lint-text.sh rbdplugin: diff --git a/examples/README.md b/examples/README.md index d309cdcaf..d27f9777a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -114,3 +114,102 @@ To restore the snapshot to a new PVC, deploy kubectl create -f pvc-restore.yaml kubectl create -f pod-restore.yaml ``` + +## How to test RBD MULTI_NODE_MULTI_WRITER BLOCK feature + +Requires feature-gates: `BlockVolume=true` `CSIBlockVolume=true` + +*NOTE* The MULTI_NODE_MULTI_WRITER capability is only available for +Volumes that are of access_type `block` + +*WARNING* This feature is strictly for workloads that know how to deal +with concurrent access to the Volume (eg Active/Passive applications). +Using RWX modes on non clustered file systems with applications trying +to simultaneously access the Volume will likely result in data corruption! + +Following are examples for issuing a request for a `Block` +`ReadWriteMany` Claim, and using the resultant Claim for a POD + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: block-pvc +spec: + accessModes: + - ReadWriteMany + volumeMode: Block + resources: + requests: + storage: 1Gi + storageClassName: csi-rbd +``` + +Create a POD that uses this PVC: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: my-pod +spec: + containers: + - name: my-container + image: debian + command: ["/bin/bash", "-c"] + args: [ "tail -f /dev/null" ] + volumeDevices: + - devicePath: /dev/rbdblock + name: my-volume + imagePullPolicy: IfNotPresent + volumes: + - name: my-volume + persistentVolumeClaim: + claimName: block-pvc + +``` + +Now, we can create a second POD (ensure the POD is scheduled on a different +node; multiwriter single node works without this feature) that also uses this +PVC at the same time, again wait for the pod to enter running state, and verify +the block device is available. + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: another-pod +spec: + containers: + - name: my-container + image: debian + command: ["/bin/bash", "-c"] + args: [ "tail -f /dev/null" ] + volumeDevices: + - devicePath: /dev/rbdblock + name: my-volume + imagePullPolicy: IfNotPresent + volumes: + - name: my-volume + persistentVolumeClaim: + claimName: block-pvc +``` + +Wait for the PODs to enter Running state, check that our block device +is available in the container at `/dev/rdbblock` in both containers: + +```bash +$ kubectl exec -it my-pod -- fdisk -l /dev/rbdblock +Disk /dev/rbdblock: 1 GiB, 1073741824 bytes, 2097152 sectors +Units: sectors of 1 * 512 = 512 bytes +Sector size (logical/physical): 512 bytes / 512 bytes +I/O size (minimum/optimal): 4194304 bytes / 4194304 bytes +``` + +```bash +$ kubectl exec -it another-pod -- fdisk -l /dev/rbdblock +Disk /dev/rbdblock: 1 GiB, 1073741824 bytes, 2097152 sectors +Units: sectors of 1 * 512 = 512 bytes +Sector size (logical/physical): 512 bytes / 512 bytes +I/O size (minimum/optimal): 4194304 bytes / 4194304 bytes +``` diff --git a/pkg/rbd/controllerserver.go b/pkg/rbd/controllerserver.go index ba7cb9092..49af63206 100644 --- a/pkg/rbd/controllerserver.go +++ b/pkg/rbd/controllerserver.go @@ -93,7 +93,25 @@ func (cs *ControllerServer) validateVolumeReq(req *csi.CreateVolumeRequest) erro func parseVolCreateRequest(req *csi.CreateVolumeRequest) (*rbdVolume, error) { // TODO (sbezverk) Last check for not exceeding total storage capacity - rbdVol, err := getRBDVolumeOptions(req.GetParameters()) + isMultiNode := false + isBlock := false + for _, cap := range req.VolumeCapabilities { + // Only checking SINGLE_NODE_SINGLE_WRITER here because regardless of the other types (MULTI READER) we need to implement the same logic to ignore the in-use response + if cap.GetAccessMode().GetMode() != csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER { + isMultiNode = true + } + if cap.GetBlock() != nil { + isBlock = true + } + } + + // We want to fail early if the user is trying to create a RWX on a non-block type device + if isMultiNode && !isBlock { + return nil, status.Error(codes.InvalidArgument, "multi node access modes are only supported on rbd `block` type volumes") + } + + // if it's NOT SINGLE_NODE_WRITER and it's BLOCK we'll set the parameter to ignore the in-use checks + rbdVol, err := getRBDVolumeOptions(req.GetParameters(), (isMultiNode && isBlock)) if err != nil { return nil, status.Error(codes.InvalidArgument, err.Error()) } diff --git a/pkg/rbd/nodeserver.go b/pkg/rbd/nodeserver.go index 21d7ae829..3a88b456e 100644 --- a/pkg/rbd/nodeserver.go +++ b/pkg/rbd/nodeserver.go @@ -48,6 +48,7 @@ type NodeServer struct { func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) { targetPath := req.GetTargetPath() targetPathMutex.LockKey(targetPath) + disableInUseChecks := false defer func() { if err := targetPathMutex.UnlockKey(targetPath); err != nil { @@ -70,7 +71,19 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis if !notMnt { return &csi.NodePublishVolumeResponse{}, nil } - volOptions, err := getRBDVolumeOptions(req.GetVolumeContext()) + + // MULTI_NODE_MULTI_WRITER is supported by default for Block access type volumes + if req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER { + if isBlock { + disableInUseChecks = true + } else { + klog.Warningf("MULTI_NODE_MULTI_WRITER currently only supported with volumes of access type `block`, invalid AccessMode for volume: %v", req.GetVolumeId()) + e := fmt.Errorf("rbd: MULTI_NODE_MULTI_WRITER access mode only allowed with BLOCK access type") + return nil, status.Error(codes.InvalidArgument, e.Error()) + } + } + + volOptions, err := getRBDVolumeOptions(req.GetVolumeContext(), disableInUseChecks) if err != nil { return nil, err } diff --git a/pkg/rbd/rbd.go b/pkg/rbd/rbd.go index 73911aec4..3fab59e01 100644 --- a/pkg/rbd/rbd.go +++ b/pkg/rbd/rbd.go @@ -102,7 +102,14 @@ func (r *Driver) Run(driverName, nodeID, endpoint string, containerized bool, ca csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS, csi.ControllerServiceCapability_RPC_CLONE_VOLUME, }) - r.cd.AddVolumeCapabilityAccessModes([]csi.VolumeCapability_AccessMode_Mode{csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER}) + + // We only support the multi-writer option when using block, but it's a supported capability for the plugin in general + // In addition, we want to add the remaining modes like MULTI_NODE_READER_ONLY, + // MULTI_NODE_SINGLE_WRITER etc, but need to do some verification of RO modes first + // will work those as follow up features + r.cd.AddVolumeCapabilityAccessModes( + []csi.VolumeCapability_AccessMode_Mode{csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER, + csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER}) // Create GRPC servers r.ids = NewIdentityServer(r.cd) diff --git a/pkg/rbd/rbd_attach.go b/pkg/rbd/rbd_attach.go index 354554d12..613cfa765 100644 --- a/pkg/rbd/rbd_attach.go +++ b/pkg/rbd/rbd_attach.go @@ -258,6 +258,7 @@ func attachRBDImage(volOptions *rbdVolume, userID string, credentials map[string Factor: rbdImageWatcherFactor, Steps: rbdImageWatcherSteps, } + err = waitForrbdImage(backoff, volOptions, userID, credentials) if err != nil { @@ -313,6 +314,10 @@ func waitForrbdImage(backoff wait.Backoff, volOptions *rbdVolume, userID string, if err != nil { return false, fmt.Errorf("fail to check rbd image status with: (%v), rbd output: (%s)", err, rbdOutput) } + if (volOptions.DisableInUseChecks) && (used) { + klog.V(2).Info("valid multi-node attach requested, ignoring watcher in-use result") + return used, nil + } return !used, nil }) // return error if rbd image has not become available for the specified timeout diff --git a/pkg/rbd/rbd_util.go b/pkg/rbd/rbd_util.go index 5f8215496..aa5b19f79 100644 --- a/pkg/rbd/rbd_util.go +++ b/pkg/rbd/rbd_util.go @@ -51,6 +51,7 @@ type rbdVolume struct { AdminID string `json:"adminId"` UserID string `json:"userId"` Mounter string `json:"mounter"` + DisableInUseChecks bool `json:"disableInUseChecks"` } type rbdSnapshot struct { @@ -226,7 +227,7 @@ func execCommand(command string, args []string) ([]byte, error) { return cmd.CombinedOutput() } -func getRBDVolumeOptions(volOptions map[string]string) (*rbdVolume, error) { +func getRBDVolumeOptions(volOptions map[string]string, disableInUseChecks bool) (*rbdVolume, error) { var ok bool rbdVol := &rbdVolume{} rbdVol.Pool, ok = volOptions["pool"] @@ -259,6 +260,10 @@ func getRBDVolumeOptions(volOptions map[string]string) (*rbdVolume, error) { } } + + klog.V(3).Infof("setting disableInUseChecks on rbd volume to: %v", disableInUseChecks) + rbdVol.DisableInUseChecks = disableInUseChecks + getCredsFromVol(rbdVol, volOptions) return rbdVol, nil }