From 8d7b6ee59f8b2ac6bfb3e096fc4d10732aeeb3ac Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Mon, 8 Aug 2022 17:23:35 +0530 Subject: [PATCH] rbd: consider mirror daemon state for ResyncVolume During ResyncVolume we check if the image is in an error state, and we resync. After resync, the image will move to either the `Error` or the `Resyncing` state. And if the image is in the above two conditions, we will return a successful response and Ready=false so that the consumer can wait until the volume is ready to use. If the image is in any other state, we return an error message to indicate the syncing is not going on. The whole resync and image state change depends on the rbd mirror daemon. If the mirror daemon is not running, the image can be in Resyncing or Unknown state. Ramen marks the volume replication as secondary, and once the resync starts, it will delete the volume replication CR as a cleanup process. As we don't have a check for the rbd mirror daemon, we are returning a resync success response and Ready=false. Due to this false response, Ramen assumes the resync has started and deletes the volume replication CR, and because of this, the cluster goes into a bad state and needs manual intervention. fixes #3289 Signed-off-by: Madhu Rajanna --- internal/rbd/replicationcontrollerserver.go | 11 ++++++----- internal/rbd/replicationcontrollerserver_test.go | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go index e93887686..7393c2fdd 100644 --- a/internal/rbd/replicationcontrollerserver.go +++ b/internal/rbd/replicationcontrollerserver.go @@ -856,12 +856,13 @@ func checkVolumeResyncStatus(localStatus librbd.SiteMirrorImageStatus) error { // If the state is Replaying means the resync is going on. // Once the volume on remote cluster is demoted and resync - // is completed the image state will be moved to UNKNOWN . 
- if localStatus.State != librbd.MirrorImageStatusStateReplaying && - localStatus.State != librbd.MirrorImageStatusStateUnknown { + // is completed the image state will be moved to UNKNOWN. + // RBD mirror daemon should be always running on the primary cluster. + if !localStatus.Up || (localStatus.State != librbd.MirrorImageStatusStateReplaying && + localStatus.State != librbd.MirrorImageStatusStateUnknown) { return fmt.Errorf( - "not resyncing. image is in %q state", - localStatus.State) + "not resyncing. Local status: daemon up=%t image is in %q state", + localStatus.Up, localStatus.State) } return nil diff --git a/internal/rbd/replicationcontrollerserver_test.go b/internal/rbd/replicationcontrollerserver_test.go index 401292587..e10d35249 100644 --- a/internal/rbd/replicationcontrollerserver_test.go +++ b/internal/rbd/replicationcontrollerserver_test.go @@ -212,10 +212,19 @@ func TestCheckVolumeResyncStatus(t *testing.T) { args librbd.SiteMirrorImageStatus wantErr bool }{ + { + name: "test when rbd mirror daemon is not running", + args: librbd.SiteMirrorImageStatus{ + State: librbd.MirrorImageStatusStateUnknown, + Up: false, + }, + wantErr: true, + }, { name: "test for unknown state", args: librbd.SiteMirrorImageStatus{ State: librbd.MirrorImageStatusStateUnknown, + Up: true, }, wantErr: false, }, @@ -223,6 +232,7 @@ func TestCheckVolumeResyncStatus(t *testing.T) { name: "test for error state", args: librbd.SiteMirrorImageStatus{ State: librbd.MirrorImageStatusStateError, + Up: true, }, wantErr: true, }, @@ -230,6 +240,7 @@ func TestCheckVolumeResyncStatus(t *testing.T) { name: "test for syncing state", args: librbd.SiteMirrorImageStatus{ State: librbd.MirrorImageStatusStateSyncing, + Up: true, }, wantErr: true, }, @@ -237,6 +248,7 @@ func TestCheckVolumeResyncStatus(t *testing.T) { name: "test for starting_replay state", args: librbd.SiteMirrorImageStatus{ State: librbd.MirrorImageStatusStateStartingReplay, + Up: true, }, wantErr: true, }, @@ -244,6 +256,7 @@ 
func TestCheckVolumeResyncStatus(t *testing.T) { name: "test for replaying state", args: librbd.SiteMirrorImageStatus{ State: librbd.MirrorImageStatusStateReplaying, + Up: true, }, wantErr: false, }, @@ -251,6 +264,7 @@ func TestCheckVolumeResyncStatus(t *testing.T) { name: "test for stopping_replay state", args: librbd.SiteMirrorImageStatus{ State: librbd.MirrorImageStatusStateStoppingReplay, + Up: true, }, wantErr: true, }, @@ -258,6 +272,7 @@ func TestCheckVolumeResyncStatus(t *testing.T) { name: "test for stopped state", args: librbd.SiteMirrorImageStatus{ State: librbd.MirrorImageStatusStateStopped, + Up: true, }, wantErr: true, }, @@ -265,6 +280,7 @@ func TestCheckVolumeResyncStatus(t *testing.T) { name: "test for invalid state", args: librbd.SiteMirrorImageStatus{ State: librbd.MirrorImageStatusState(100), + Up: true, }, wantErr: true, },