mirror of
https://github.com/ceph/ceph-csi.git
synced 2024-12-24 14:00:19 +00:00
rbd: mark image ready when image state is up+unknown
To recover from the split-brain (up+error) state, the image needs to be
demoted and requested for resync on site-a, and then the image on site-b
should get demoted. The volume should be marked ready=true when the
image state on both clusters is up+unknown, because during the last
snapshot sync the data gets copied first and then the image state on
site-a changes to up+unknown.
If the image state on both sites is up+unknown, consider the
complete data to be synced, as the last snapshot
gets exchanged between the clusters.
* Create a 10 GB file and validate the data after resync
* Do Failover when the site-a goes down
* Force promote the image and write data in GiB
* Once the site-a comes back, Demote the image and issue resync
* Demote the image on site-b
* The status will get reflected on the other site when the last
snapshot sync happens
* The image will go to the up+unknown state, and the complete data will
be copied to site-a
* Promote the image on site-a and use it
```bash
csi-vol-5633715e-a7eb-11eb-bebb-0242ac110006:
global_id: e7f9ec55-06ab-46cb-a1ae-784be75ed96d
state: up+unknown
description: remote image demoted
service: a on minicluster1
last_update: 2021-04-28 07:11:56
peer_sites:
name: e47e29f4-96e8-44ed-b6c6-edf15c5a91d6-rook-ceph
state: up+unknown
description: remote image demoted
last_update: 2021-04-28 07:11:41
```
* Do Failover when the site-a goes down
* Force promote the image on site-b and write data in GiB
* Demote the image on site-b
* Once the site-a comes back, Demote the image on site-a
* The images on both sites will go to the split-brain state
```bash
csi-vol-37effcb5-a7f1-11eb-bebb-0242ac110006:
global_id: 115c3df9-3d4f-4c04-93a7-531b82155ddf
state: up+error
description: split-brain
service: a on minicluster2
last_update: 2021-04-28 07:25:41
peer_sites:
name: abbda0f0-0117-4425-8cb2-deb4c853da47-rook-ceph
state: up+error
description: split-brain
last_update: 2021-04-28 07:25:26
```
* Issue resync
* The images cannot be resynced because when we issued the resync
on site-a the image on site-b was in the demoted state
* To recover from this state (promote and then demote the
image on site-b after some time)
```bash
csi-vol-37effcb5-a7f1-11eb-bebb-0242ac110006:
global_id: 115c3df9-3d4f-4c04-93a7-531b82155ddf
state: up+unknown
description: remote image demoted
service: a on minicluster1
last_update: 2021-04-28 07:32:56
peer_sites:
name: e47e29f4-96e8-44ed-b6c6-edf15c5a91d6-rook-ceph
state: up+unknown
description: remote image demoted
last_update: 2021-04-28 07:32:41
```
* Once the data is copied we can see that the image state
is moved to up+unknown on both sites
* Promote the image on site-a and use it
Signed-off-by: Madhu Rajanna <madhupr007@gmail.com>
(cherry picked from commit 07a916b84d
)
This commit is contained in:
parent
1c59f0683e
commit
75fa1927fc
@ -43,15 +43,6 @@ const (
|
||||
// imageMirroringState is the string form of an image's mirroring state
// (for example "up+replaying", "up+stopped" or "up+unknown"); status
// strings are converted to this type before being compared.
type imageMirroringState string
|
||||
|
||||
const (
|
||||
// If the state is up+replaying, then mirroring is functioning properly.
|
||||
// up means the rbd-mirror daemon is running, and replaying means
|
||||
// this image is the target for replication from another storage cluster.
|
||||
upAndReplaying imageMirroringState = "up+replaying"
|
||||
// If the state is up+stopped means the rbd-mirror daemon is
|
||||
// running and stopped means the image is not a target for replication from
|
||||
// another cluster
|
||||
upAndStopped imageMirroringState = "up+stopped"
|
||||
|
||||
// If the state is up+unknown means the rbd-mirror daemon is
|
||||
// running and the image is demoted on both the clusters.
|
||||
upAndUnknown imageMirroringState = "up+unknown"
|
||||
@ -445,20 +436,16 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context,
|
||||
}
|
||||
ready := false
|
||||
state := imageMirroringState(mirrorStatus.State)
|
||||
if state == upAndStopped || state == upAndReplaying {
|
||||
// Make sure the peer site image state is up and stopped
|
||||
ready = true
|
||||
for _, s := range mirrorStatus.PeerSites {
|
||||
if imageMirroringState(s.State) != upAndStopped {
|
||||
util.UsefulLog(ctx, "peer site name=%s, mirroring state=%s, description=%s and lastUpdate=%s", s.SiteName, s.State, s.Description, s.LastUpdate)
|
||||
ready = false
|
||||
}
|
||||
}
|
||||
}
|
||||
// To recover from split brain (up+error) state the image need to be
|
||||
// demoted and requested for resync on site-a and then the image on site-b
|
||||
// should be demoted. The volume should be marked to ready=true when the
|
||||
// image state on both the clusters are up+unknown because during the last
|
||||
// snapshot syncing the data gets copied first and then image state on the
|
||||
// site-a changes to up+unknown.
|
||||
|
||||
// when the images are demoted on both clusters and user requests for the
|
||||
// resync of the image, the image mirror state will be unknown state in
|
||||
// both clusters.
|
||||
// If the image state on both the sites are up+unknown consider that
|
||||
// complete data is synced as the last snapshot
|
||||
// gets exchanged between the clusters.
|
||||
if state == upAndUnknown {
|
||||
ready = true
|
||||
for _, s := range mirrorStatus.PeerSites {
|
||||
|
Loading…
Reference in New Issue
Block a user