diff --git a/Makefile b/Makefile index 734a761b8..825ed8b66 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ go-test: ./scripts/test-go.sh static-check: - ./scripts/lint-go.sh + ./scripts/lint-go.sh ./scripts/lint-text.sh rbdplugin: diff --git a/docs/deploy-rbd.md b/docs/deploy-rbd.md index bfaa46bb9..acfde7d46 100644 --- a/docs/deploy-rbd.md +++ b/docs/deploy-rbd.md @@ -58,6 +58,21 @@ Parameter | Required | Description `csi.storage.k8s.io/provisioner-secret-name`, `csi.storage.k8s.io/node-publish-secret-name` | for Kubernetes | name of the Kubernetes Secret object containing Ceph client credentials. Both parameters should have the same value `csi.storage.k8s.io/provisioner-secret-namespace`, `csi.storage.k8s.io/node-publish-secret-namespace` | for Kubernetes | namespaces of the above Secret objects `mounter`| no | if set to `rbd-nbd`, use `rbd-nbd` on nodes that have `rbd-nbd` and `nbd` kernel modules to map rbd images +`fsType` | no | allows setting to `ext3`, `ext4` or `xfs`; default is `ext4` +`multiNodeWritable` | no | if set to `enabled` allows RBD volumes with MultiNode Access Modes to bypass watcher checks. By default multiple attachments of an RBD volume are NOT allowed. Even if this option is set in the StorageClass, it's ignored if a standard SingleNodeWriter Access Mode is requested + +**Warning for multiNodeWritable:** + +*NOTE* the `multiNodeWritable` setting is NOT safe for use by workloads +that are not designed to coordinate access. This does NOT add any sort +of a clustered filesystem or write synchronization, it's specifically for +special workloads that handle access coordination on their own +(i.e. Active/Passive scenarios). + +Using this mode for general purposes *WILL RESULT IN DATA CORRUPTION*. +We attempt to limit exposure to trouble here by ignoring the Storage Class +setting unless your Volume explicitly asks for multi node access, and assume +you know what you're doing. 
**Required secrets:** diff --git a/examples/README.md b/examples/README.md index d309cdcaf..94b239b26 100644 --- a/examples/README.md +++ b/examples/README.md @@ -114,3 +114,105 @@ To restore the snapshot to a new PVC, deploy kubectl create -f pvc-restore.yaml kubectl create -f pod-restore.yaml ``` + +## How to enable multi node attach support for RBD + +*WARNING* This feature is strictly for workloads that know how to deal +with concurrent access to the Volume (e.g. Active/Passive applications). +Using RWX modes on non clustered file systems with applications trying +to simultaneously access the Volume will likely result in data corruption! + +### Example process to test the multiNodeWritable feature + +Modify your current storage class, or create a new storage class specifically +for multi node writers by adding the `multiNodeWritable: "enabled"` entry to +your parameters. Here's an example: + +``` +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-rbd +provisioner: csi-rbdplugin +parameters: + monitors: rook-ceph-mon-b.rook-ceph.svc.cluster.local:6789 + pool: rbd + imageFormat: "2" + imageFeatures: layering + csiProvisionerSecretName: csi-rbd-secret + csiProvisionerSecretNamespace: default + csiNodePublishSecretName: csi-rbd-secret + csiNodePublishSecretNamespace: default + adminid: admin + userid: admin + fsType: xfs + multiNodeWritable: "enabled" +reclaimPolicy: Delete +``` + +Now, you can request Claims from the configured storage class that include +the `ReadWriteMany` access mode: + +``` +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pvc-1 +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi + storageClassName: csi-rbd +``` + +Create a POD that uses this PVC: + +``` +apiVersion: v1 +kind: Pod +metadata: + name: test-1 +spec: + containers: + - name: web-server + image: nginx + volumeMounts: + - name: mypvc + mountPath: /var/lib/www/html + volumes: + - name: mypvc + persistentVolumeClaim: + 
claimName: pvc-1 + readOnly: false +``` + +Wait for the POD to enter Running state, write some data to +`/var/lib/www/html` + +Now, we can create a second POD (ensure the POD is scheduled on a different +node; multiwriter single node works without this feature) that also uses this +PVC at the same time + +``` +apiVersion: v1 +kind: Pod +metadata: + name: test-2 +spec: + containers: + - name: web-server + image: nginx + volumeMounts: + - name: mypvc + mountPath: /var/lib/www/html + volumes: + - name: mypvc + persistentVolumeClaim: + claimName: pvc-1 + readOnly: false +``` + +If you access the pod you can check that your data is available at +`/var/lib/www/html` diff --git a/examples/rbd/storageclass.yaml b/examples/rbd/storageclass.yaml index 320a489a8..f7de85f61 100644 --- a/examples/rbd/storageclass.yaml +++ b/examples/rbd/storageclass.yaml @@ -35,4 +35,7 @@ parameters: userid: kubernetes # uncomment the following to use rbd-nbd as mounter on supported nodes # mounter: rbd-nbd + # fsType: xfs + # uncomment the following line to enable multi-attach on RBD volumes + # multiNodeWritable: enabled reclaimPolicy: Delete diff --git a/pkg/rbd/controllerserver.go b/pkg/rbd/controllerserver.go index f5eb1400a..598451f9e 100644 --- a/pkg/rbd/controllerserver.go +++ b/pkg/rbd/controllerserver.go @@ -21,6 +21,7 @@ import ( "os/exec" "sort" "strconv" + "strings" "syscall" "github.com/ceph/ceph-csi/pkg/csi-common" @@ -92,7 +93,16 @@ func (cs *ControllerServer) validateVolumeReq(req *csi.CreateVolumeRequest) erro func parseVolCreateRequest(req *csi.CreateVolumeRequest) (*rbdVolume, error) { // TODO (sbezverk) Last check for not exceeding total storage capacity - rbdVol, err := getRBDVolumeOptions(req.GetParameters()) + // MultiNodeWriters are accepted but they're only for special cases, and we skip the watcher checks for them which isn't the greatest + // let's make sure we ONLY skip that if the user is requesting a MULTI Node accessible mode + disableMultiWriter := true + for 
_, am := range req.VolumeCapabilities { + if am.GetAccessMode().GetMode() != csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER { + disableMultiWriter = false + } + } + + rbdVol, err := getRBDVolumeOptions(req.GetParameters(), disableMultiWriter) if err != nil { return nil, err } @@ -330,11 +340,20 @@ func (cs *ControllerServer) ListVolumes(ctx context.Context, req *csi.ListVolume // ValidateVolumeCapabilities checks whether the volume capabilities requested // are supported. func (cs *ControllerServer) ValidateVolumeCapabilities(ctx context.Context, req *csi.ValidateVolumeCapabilitiesRequest) (*csi.ValidateVolumeCapabilitiesResponse, error) { - for _, cap := range req.VolumeCapabilities { - if cap.GetAccessMode().GetMode() != csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER { - return &csi.ValidateVolumeCapabilitiesResponse{Message: ""}, nil + params := req.GetParameters() + multiWriter := params["multiNodeWritable"] + if strings.ToLower(multiWriter) == "enabled" { + klog.V(3).Info("detected multiNodeWritable parameter in Storage Class, allowing multi-node access modes") + + } else { + for _, cap := range req.VolumeCapabilities { + if cap.GetAccessMode().GetMode() != csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER { + return &csi.ValidateVolumeCapabilitiesResponse{Message: ""}, nil + } } + } + return &csi.ValidateVolumeCapabilitiesResponse{ Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{ VolumeCapabilities: req.VolumeCapabilities, diff --git a/pkg/rbd/nodeserver.go b/pkg/rbd/nodeserver.go index 21d7ae829..6f6160a4b 100644 --- a/pkg/rbd/nodeserver.go +++ b/pkg/rbd/nodeserver.go @@ -70,10 +70,18 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis if !notMnt { return &csi.NodePublishVolumeResponse{}, nil } - volOptions, err := getRBDVolumeOptions(req.GetVolumeContext()) + + ignoreMultiWriterEnabled := true + if req.VolumeCapability.AccessMode.Mode != csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER { + 
ignoreMultiWriterEnabled = false + } + + volOptions, err := getRBDVolumeOptions(req.GetVolumeContext(), ignoreMultiWriterEnabled) if err != nil { return nil, err } + // Check access mode settings in the request, even if SC is RW-Many, if the request is a normal Single Writer volume, we ignore this setting and proceed as normal + volOptions.VolName = volName // Mapping RBD image devicePath, err := attachRBDImage(volOptions, volOptions.UserID, req.GetSecrets()) diff --git a/pkg/rbd/rbd.go b/pkg/rbd/rbd.go index 73911aec4..c7b2eab89 100644 --- a/pkg/rbd/rbd.go +++ b/pkg/rbd/rbd.go @@ -102,7 +102,12 @@ func (r *Driver) Run(driverName, nodeID, endpoint string, containerized bool, ca csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS, csi.ControllerServiceCapability_RPC_CLONE_VOLUME, }) - r.cd.AddVolumeCapabilityAccessModes([]csi.VolumeCapability_AccessMode_Mode{csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER}) + + // TODO: JDG Should also look at remaining modes like MULT_NODE_READER (SINGLE_READER) + r.cd.AddVolumeCapabilityAccessModes( + []csi.VolumeCapability_AccessMode_Mode{ + csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER, + csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER}) // Create GRPC servers r.ids = NewIdentityServer(r.cd) diff --git a/pkg/rbd/rbd_attach.go b/pkg/rbd/rbd_attach.go index 354554d12..88834757b 100644 --- a/pkg/rbd/rbd_attach.go +++ b/pkg/rbd/rbd_attach.go @@ -313,8 +313,16 @@ func waitForrbdImage(backoff wait.Backoff, volOptions *rbdVolume, userID string, if err != nil { return false, fmt.Errorf("fail to check rbd image status with: (%v), rbd output: (%s)", err, rbdOutput) } + // In the case of multiattach we want to short circuit the retries when used (so r`if used; return used`) + // otherwise we're setting this to false which translates to !ok, which means backoff and try again + // NOTE: we ONLY do this if an multi-node access mode is requested for this volume + if (strings.ToLower(volOptions.MultiNodeWritable) == "enabled") && 
(used) { + klog.V(2).Info("detected MultiNodeWritable enabled, ignoring watcher in-use result") + return used, nil + } return !used, nil }) + // return error if rbd image has not become available for the specified timeout if err == wait.ErrWaitTimeout { return fmt.Errorf("rbd image %s is still being used", imagePath) diff --git a/pkg/rbd/rbd_util.go b/pkg/rbd/rbd_util.go index 8c1bd16eb..36f655ec5 100644 --- a/pkg/rbd/rbd_util.go +++ b/pkg/rbd/rbd_util.go @@ -51,6 +51,7 @@ type rbdVolume struct { AdminID string `json:"adminId"` UserID string `json:"userId"` Mounter string `json:"mounter"` + MultiNodeWritable string `json:"multiNodeWritable"` } type rbdSnapshot struct { @@ -226,7 +227,7 @@ func execCommand(command string, args []string) ([]byte, error) { return cmd.CombinedOutput() } -func getRBDVolumeOptions(volOptions map[string]string) (*rbdVolume, error) { +func getRBDVolumeOptions(volOptions map[string]string, ignoreMultiNodeWritable bool) (*rbdVolume, error) { var ok bool rbdVol := &rbdVolume{} rbdVol.Pool, ok = volOptions["pool"] @@ -260,6 +261,12 @@ func getRBDVolumeOptions(volOptions map[string]string) (*rbdVolume, error) { } getCredsFromVol(rbdVol, volOptions) + + klog.V(3).Infof("ignoreMultiNodeWritable flag in parse getRBDVolumeOptions is: %v", ignoreMultiNodeWritable) + // If the volume we're working with is NOT requesting multi-node attach then don't treat it special, ignore the setting in the SC and just keep our watcher checks + if !ignoreMultiNodeWritable { + rbdVol.MultiNodeWritable = volOptions["multiNodeWritable"] + } return rbdVol, nil }