rbd: consider remote image health for primary

Currently, during the Promote operation, we decide whether the image is healthy by checking only its state on the primary site. If the network is flaky or the remote site is down, the image health is not as expected. To make sure the image is healthy across the clusters, check the state on both the local and the remote clusters. Some details: https://bugzilla.redhat.com/show_bug.cgi?id=2014495 Signed-off-by: Madhu Rajanna <madhupr007@gmail.com>
2025-06-13 02:33:34 +00:00 · 2022-04-11 10:05:07 +05:30
parent b64c7583a9
commit c3c87f2ef3
1 changed files with 23 additions and 2 deletions
--- a/internal/rbd/replicationcontrollerserver.go
+++ b/internal/rbd/replicationcontrollerserver.go
@ -615,8 +615,9 @@ func (rs *ReplicationServer) PromoteVolume(ctx context.Context,
 }

 // checkHealthyPrimary checks if the image is a healhty primary or not.
-// healthy primary image will be in up+stopped state, for states other
-// than this it returns an error message.
+// A healthy primary image is in the up+stopped state in the local cluster
+// and in the up+replaying state in the remote clusters; for any other state
+// an error is returned.
 func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume) error {
 	mirrorStatus, err := rbdVol.getImageMirroringStatus()
 	if err != nil {
@ -640,6 +641,26 @@ func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume) error {
 			localStatus.State)
 	}

+	// Remote image should be in up+replaying state.
+	for _, s := range mirrorStatus.SiteStatuses {
+		log.UsefulLog(
+			ctx,
+			"peer site mirrorUUID=%q, daemon up=%t, mirroring state=%q, description=%q and lastUpdate=%d",
+			s.MirrorUUID,
+			s.Up,
+			s.State,
+			s.Description,
+			s.LastUpdate)
+		if s.MirrorUUID != "" {
+			if !s.Up && s.State != librbd.MirrorImageStatusStateReplaying {
+				return fmt.Errorf("remote image %s is not healthy. State is up=%t, state=%q",
+					rbdVol,
+					s.Up,
+					s.State)
+			}
+		}
+	}
+
 	return nil
 }