From 3acaa018dbb971df40b29a848eba1ca0c0420299 Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Wed, 22 Jun 2022 11:18:10 +0530 Subject: [PATCH] rbd: issue resync only if the force flag is set During failover the volume is demoted on the primary while the image is not yet promoted on the remote cluster, so RBD reports spurious split-brain errors. In that state the Ceph-CSI resync would attempt to resync from the "known" secondary, and that will cause data loss. Issue the resync only when the caller explicitly sets the force flag. Signed-off-by: Madhu Rajanna --- internal/rbd/replicationcontrollerserver.go | 38 +++++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go index f84b0c7a1..ca6935ce0 100644 --- a/internal/rbd/replicationcontrollerserver.go +++ b/internal/rbd/replicationcontrollerserver.go @@ -856,18 +856,11 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context, ready = checkRemoteSiteStatus(ctx, mirrorStatus) } - if resyncRequired(localStatus) { - err = rbdVol.resyncImage() - if err != nil { - log.ErrorLog(ctx, err.Error()) + err = resyncVolume(localStatus, rbdVol, req.Force) + if err != nil { + log.ErrorLog(ctx, err.Error()) - return nil, status.Error(codes.Internal, err.Error()) - } - - // If we issued a resync, return a non-final error as image needs to be recreated - // locally. Caller retries till RBD syncs an initial version of the image to - // report its status in the resync request. - return nil, status.Error(codes.Unavailable, "awaiting initial resync due to split brain") + return nil, err } err = checkVolumeResyncStatus(localStatus) @@ -887,6 +880,29 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context, return resp, nil } +func resyncVolume(localStatus librbd.SiteMirrorImageStatus, rbdVol *rbdVolume, force bool) error { + if resyncRequired(localStatus) { + // If the force option is not set return the error message to retry + // with Force option. 
+ if !force { + return status.Errorf(codes.FailedPrecondition, + "image is in %q state, description (%s). Force resync to recover volume", + localStatus.State, localStatus.Description) + } + err := rbdVol.resyncImage() + if err != nil { + return status.Error(codes.Internal, err.Error()) + } + + // If we issued a resync, return a non-final error as image needs to be recreated + // locally. Caller retries till RBD syncs an initial version of the image to + // report its status in the resync request. + return status.Error(codes.Unavailable, "awaiting initial resync due to split brain") + } + + return nil +} + func checkVolumeResyncStatus(localStatus librbd.SiteMirrorImageStatus) error { // we are considering 2 states to check resync started and resync completed // as below. all other states will be considered as an error state so that