From 95387c3b5effc6552b72f2b26d077dbae303c90a Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Thu, 18 Mar 2021 12:46:36 +0530 Subject: [PATCH] rbd: check for peer site status Do resync if the image is in unknown or in error state. Check that the current image state is up+stopped or up+replaying and that every peer site status is in up+stopped to confirm that resyncing is done and the image can be promoted and used. Signed-off-by: Madhu Rajanna --- internal/rbd/mirror.go | 6 +++++ internal/rbd/replicationcontrollerserver.go | 27 ++++++++++++++++----- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/internal/rbd/mirror.go b/internal/rbd/mirror.go index d37b85333..2c231ac94 100644 --- a/internal/rbd/mirror.go +++ b/internal/rbd/mirror.go @@ -118,6 +118,12 @@ type imageMirrorStatus struct { State string `json:"state"` // rbd image state Description string `json:"description"` LastUpdate string `json:"last_update"` + PeerSites []struct { + SiteName string `json:"site_name"` + State string `json:"state"` + Description string `json:"description"` + LastUpdate string `json:"last_update"` + } `json:"peer_sites"` } // FIXME: once https://github.com/ceph/go-ceph/issues/460 is fixed use go-ceph. diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go index bf440c837..c68d88baf 100644 --- a/internal/rbd/replicationcontrollerserver.go +++ b/internal/rbd/replicationcontrollerserver.go @@ -20,6 +20,7 @@ import ( "context" "errors" "strconv" + "strings" "github.com/ceph/ceph-csi/internal/util" @@ -50,6 +51,9 @@ const ( // running and stopped means the image is not a target for replication from // another cluster upAndStopped imageMirroringState = "up+stopped" + + // If the state is error means image need resync. 
+ errorState imageMirroringState = "error" ) const ( @@ -417,12 +421,6 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context, return nil, status.Error(codes.InvalidArgument, "image is in primary state") } - err = rbdVol.resyncImage() - if err != nil { - util.ErrorLog(ctx, err.Error()) - return nil, status.Error(codes.Internal, err.Error()) - } - // TODO: check the image state and return its ready to use or not mirrorStatus, err := rbdVol.getImageMirroingStatus() if err != nil { // the image gets recreated after issuing resync in that case return @@ -439,8 +437,25 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context, ready := false state := imageMirroringState(mirrorStatus.State) if state == upAndStopped || state == upAndReplaying { + // Make sure the peer site image state is up and stopped ready = true + for _, s := range mirrorStatus.PeerSites { + if imageMirroringState(s.State) != upAndStopped { + util.UsefulLog(ctx, "peer site name=%s mirroring state=%s, description=%s and lastUpdate=%s", s.SiteName, s.State, s.Description, s.LastUpdate) + ready = false + } + } } + + // resync only if the image is in error state + if strings.Contains(mirrorStatus.State, string(errorState)) { + err = rbdVol.resyncImage() + if err != nil { + util.ErrorLog(ctx, err.Error()) + return nil, status.Error(codes.Internal, err.Error()) + } + } + util.UsefulLog(ctx, "image mirroring state=%s, description=%s and lastUpdate=%s", mirrorStatus.State, mirrorStatus.Description, mirrorStatus.LastUpdate) resp := &replication.ResyncVolumeResponse{ Ready: ready,