From 64a9b1fa5906d65478cdb5fb244b133bc1b1cfbe Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Mon, 11 Apr 2022 10:05:07 +0530 Subject: [PATCH] rbd: consider remote image health for primary Currently, to decide whether the image is healthy during the Promote operation, we check only the image state on the primary site. If the network is flaky or the remote site is down, the image health is not as expected. To make sure the image is healthy across the clusters, check the state on both the local and the remote clusters. More details: https://bugzilla.redhat.com/show_bug.cgi?id=2014495 Signed-off-by: Madhu Rajanna --- internal/rbd/replicationcontrollerserver.go | 25 +++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go index 7ce3b4c98..edeb0c0d3 100644 --- a/internal/rbd/replicationcontrollerserver.go +++ b/internal/rbd/replicationcontrollerserver.go @@ -615,8 +615,9 @@ func (rs *ReplicationServer) PromoteVolume(ctx context.Context, } // checkHealthyPrimary checks if the image is a healhty primary or not. -// healthy primary image will be in up+stopped state, for states other -// than this it returns an error message. +// healthy primary image will be in up+stopped state in local cluster and +// up+replaying in the remote clusters, for states other than this it returns +// an error message. func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume) error { mirrorStatus, err := rbdVol.getImageMirroringStatus() if err != nil { @@ -640,6 +641,26 @@ func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume) error { localStatus.State) } + // Remote image should be in up+replaying state. 
+ for _, s := range mirrorStatus.SiteStatuses { + log.UsefulLog( + ctx, + "peer site mirrorUUID=%q, daemon up=%t, mirroring state=%q, description=%q and lastUpdate=%d", + s.MirrorUUID, + s.Up, + s.State, + s.Description, + s.LastUpdate) + if s.MirrorUUID != "" { + if !s.Up && s.State != librbd.MirrorImageStatusStateReplaying { + return fmt.Errorf("remote image %s is not healthy. State is up=%t, state=%q", + rbdVol, + s.Up, + s.State) + } + } + } + return nil }