From 34378aafb8054ff4af0df1e3526c1a5c1bc3258f Mon Sep 17 00:00:00 2001
From: Humble Chirammal
Date: Thu, 9 Jun 2022 10:49:50 +0530
Subject: [PATCH 01/12] deploy: make use of latest attacher release v3.5.0

attacher sidecar has a new release:
https://github.com/kubernetes-csi/external-attacher/releases/tag/v3.5.0

Signed-off-by: Humble Chirammal
---
 build.env                                            | 2 +-
 charts/ceph-csi-rbd/README.md                        | 2 +-
 charts/ceph-csi-rbd/values.yaml                      | 2 +-
 deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/build.env b/build.env
index 142d5693a..be7a52e78 100644
--- a/build.env
+++ b/build.env
@@ -48,7 +48,7 @@ ROOK_VERSION=v1.9.4
 ROOK_CEPH_CLUSTER_IMAGE=quay.io/ceph/ceph:v17
 
 # CSI sidecar version
-CSI_ATTACHER_VERSION=v3.4.0
+CSI_ATTACHER_VERSION=v3.5.0
 CSI_SNAPSHOTTER_VERSION=v6.0.1
 CSI_PROVISIONER_VERSION=v3.1.0
 CSI_RESIZER_VERSION=v1.4.0
diff --git a/charts/ceph-csi-rbd/README.md b/charts/ceph-csi-rbd/README.md
index bb654e532..f41bddcf1 100644
--- a/charts/ceph-csi-rbd/README.md
+++ b/charts/ceph-csi-rbd/README.md
@@ -118,7 +118,7 @@ charts and their default values.
 | `provisioner.provisioner.image.tag` | Specifies image tag | `canary` |
 | `provisioner.provisioner.image.pullPolicy` | Specifies pull policy | `IfNotPresent` |
 | `provisioner.attacher.image.repository` | Specifies the csi-attacher image repository URL | `registry.k8s.io/sig-storage/csi-attacher` |
-| `provisioner.attacher.image.tag` | Specifies image tag | `v3.4.0` |
+| `provisioner.attacher.image.tag` | Specifies image tag | `v3.5.0` |
 | `provisioner.attacher.image.pullPolicy` | Specifies pull policy | `IfNotPresent` |
 | `provisioner.attacher.name` | Specifies the name of csi-attacher sidecar | `attacher` |
 | `provisioner.attacher.enabled` | Specifies whether attacher sidecar is enabled | `true` |
diff --git a/charts/ceph-csi-rbd/values.yaml b/charts/ceph-csi-rbd/values.yaml
index b18d3443e..d81610812 100644
--- a/charts/ceph-csi-rbd/values.yaml
+++ b/charts/ceph-csi-rbd/values.yaml
@@ -215,7 +215,7 @@ provisioner:
     enabled: true
     image:
       repository: registry.k8s.io/sig-storage/csi-attacher
-      tag: v3.4.0
+      tag: v3.5.0
       pullPolicy: IfNotPresent
     resources: {}
diff --git a/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml b/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
index b0fe29235..8399ff9e3 100644
--- a/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
+++ b/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
@@ -86,7 +86,7 @@ spec:
             - name: socket-dir
               mountPath: /csi
         - name: csi-attacher
-          image: registry.k8s.io/sig-storage/csi-attacher:v3.4.0
+          image: registry.k8s.io/sig-storage/csi-attacher:v3.5.0
           args:
             - "--v=5"
            - "--csi-address=$(ADDRESS)"

From fa0da71ce229207b643bbcd18fb89f4c1bdf41a1 Mon Sep 17 00:00:00 2001
From: Humble Chirammal
Date: Thu, 16 Jun 2022 11:09:39 +0530
Subject: [PATCH 02/12] deploy: update external resizer to v1.5.0

Refer:
https://github.com/kubernetes-csi/external-resizer/releases/tag/v1.5.0

Signed-off-by: Humble Chirammal
---
 build.env                                                  | 2 +-
 charts/ceph-csi-cephfs/README.md                           | 2 +-
 charts/ceph-csi-cephfs/values.yaml                         | 2 +-
 charts/ceph-csi-rbd/README.md                              | 2 +-
 charts/ceph-csi-rbd/values.yaml                            | 2 +-
 deploy/cephfs/kubernetes/csi-cephfsplugin-provisioner.yaml | 2 +-
 deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml       | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/build.env b/build.env
index be7a52e78..87a7ee343 100644
--- a/build.env
+++ b/build.env
@@ -51,7 +51,7 @@ ROOK_CEPH_CLUSTER_IMAGE=quay.io/ceph/ceph:v17
 CSI_ATTACHER_VERSION=v3.5.0
 CSI_SNAPSHOTTER_VERSION=v6.0.1
 CSI_PROVISIONER_VERSION=v3.1.0
-CSI_RESIZER_VERSION=v1.4.0
+CSI_RESIZER_VERSION=v1.5.0
 CSI_NODE_DRIVER_REGISTRAR_VERSION=v2.5.1
 
 # e2e settings
diff --git a/charts/ceph-csi-cephfs/README.md b/charts/ceph-csi-cephfs/README.md
index 60aa16be6..604e8be53 100644
--- a/charts/ceph-csi-cephfs/README.md
+++ b/charts/ceph-csi-cephfs/README.md
@@ -109,7 +109,7 @@ charts and their default values.
 | `provisioner.provisioner.image.tag` | Specifies image tag | `v3.1.0` |
 | `provisioner.provisioner.image.pullPolicy` | Specifies pull policy | `IfNotPresent` |
 | `provisioner.resizer.image.repository` | Specifies the csi-resizer image repository URL | `registry.k8s.io/sig-storage/csi-resizer` |
-| `provisioner.resizer.image.tag` | Specifies image tag | `v1.4.0` |
+| `provisioner.resizer.image.tag` | Specifies image tag | `v1.5.0` |
 | `provisioner.resizer.image.pullPolicy` | Specifies pull policy | `IfNotPresent` |
 | `provisioner.resizer.name` | Specifies the name of csi-resizer sidecar | `resizer` |
 | `provisioner.resizer.enabled` | Specifies whether resizer sidecar is enabled | `true` |
diff --git a/charts/ceph-csi-cephfs/values.yaml b/charts/ceph-csi-cephfs/values.yaml
index 59902b5e2..b21871a09 100644
--- a/charts/ceph-csi-cephfs/values.yaml
+++ b/charts/ceph-csi-cephfs/values.yaml
@@ -171,7 +171,7 @@ provisioner:
     enabled: true
     image:
       repository: registry.k8s.io/sig-storage/csi-resizer
-      tag: v1.4.0
+      tag: v1.5.0
       pullPolicy: IfNotPresent
     resources: {}
diff --git a/charts/ceph-csi-rbd/README.md b/charts/ceph-csi-rbd/README.md
index f41bddcf1..9ffc7c65f 100644
--- a/charts/ceph-csi-rbd/README.md
+++ b/charts/ceph-csi-rbd/README.md
@@ -123,7 +123,7 @@ charts and their default values.
 | `provisioner.attacher.name` | Specifies the name of csi-attacher sidecar | `attacher` |
 | `provisioner.attacher.enabled` | Specifies whether attacher sidecar is enabled | `true` |
 | `provisioner.resizer.image.repository` | Specifies the csi-resizer image repository URL | `registry.k8s.io/sig-storage/csi-resizer` |
-| `provisioner.resizer.image.tag` | Specifies image tag | `v1.4.0` |
+| `provisioner.resizer.image.tag` | Specifies image tag | `v1.5.0` |
 | `provisioner.resizer.image.pullPolicy` | Specifies pull policy | `IfNotPresent` |
 | `provisioner.resizer.name` | Specifies the name of csi-resizer sidecar | `resizer` |
 | `provisioner.resizer.enabled` | Specifies whether resizer sidecar is enabled | `true` |
diff --git a/charts/ceph-csi-rbd/values.yaml b/charts/ceph-csi-rbd/values.yaml
index d81610812..d48af1c32 100644
--- a/charts/ceph-csi-rbd/values.yaml
+++ b/charts/ceph-csi-rbd/values.yaml
@@ -224,7 +224,7 @@ provisioner:
     enabled: true
     image:
       repository: registry.k8s.io/sig-storage/csi-resizer
-      tag: v1.4.0
+      tag: v1.5.0
       pullPolicy: IfNotPresent
     resources: {}
diff --git a/deploy/cephfs/kubernetes/csi-cephfsplugin-provisioner.yaml b/deploy/cephfs/kubernetes/csi-cephfsplugin-provisioner.yaml
index ed2eeb46f..784494670 100644
--- a/deploy/cephfs/kubernetes/csi-cephfsplugin-provisioner.yaml
+++ b/deploy/cephfs/kubernetes/csi-cephfsplugin-provisioner.yaml
@@ -60,7 +60,7 @@ spec:
             - name: socket-dir
              mountPath: /csi
         - name: csi-resizer
-          image: registry.k8s.io/sig-storage/csi-resizer:v1.4.0
+          image: registry.k8s.io/sig-storage/csi-resizer:v1.5.0
           args:
             - "--csi-address=$(ADDRESS)"
             - "--v=5"
diff --git a/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml b/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
index 8399ff9e3..b3877bb46 100644
--- a/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
+++ b/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
@@ -100,7 +100,7 @@ spec:
             - name: socket-dir
               mountPath: /csi
         - name: csi-resizer
-          image: registry.k8s.io/sig-storage/csi-resizer:v1.4.0
+          image: registry.k8s.io/sig-storage/csi-resizer:v1.5.0
           args:
             - "--csi-address=$(ADDRESS)"
            - "--v=5"

From af0cd0fd80db35bb3011ca7455a9f5f49e4b2e87 Mon Sep 17 00:00:00 2001
From: Humble Chirammal
Date: Mon, 20 Jun 2022 16:01:01 +0530
Subject: [PATCH 03/12] rebase: use v6.0.1 of snapshot client

Even though the snapshotter is at v6.0.1, the client libraries are
still at v6.0.0; this commit updates them to match.

Signed-off-by: Humble Chirammal
---
 go.mod             | 2 +-
 go.sum             | 4 ++--
 vendor/modules.txt | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/go.mod b/go.mod
index d3f1d329f..d8407024f 100644
--- a/go.mod
+++ b/go.mod
@@ -17,7 +17,7 @@ require (
 	github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
 	github.com/hashicorp/vault/api v1.6.0
 	github.com/kubernetes-csi/csi-lib-utils v0.11.0
-	github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.0
+	github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.1
 	github.com/libopenstorage/secrets v0.0.0-20210908194121-a1d19aa9713a
 	github.com/onsi/ginkgo v1.16.5
 	github.com/onsi/gomega v1.19.0
diff --git a/go.sum b/go.sum
index 97bacec2d..03b802029 100644
--- a/go.sum
+++ b/go.sum
@@ -762,8 +762,8 @@ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/kubernetes-csi/csi-lib-utils v0.11.0 h1:FHWOBtAZBA/hVk7v/qaXgG9Sxv0/n06DebPFuDwumqg=
 github.com/kubernetes-csi/csi-lib-utils v0.11.0/go.mod h1:BmGZZB16L18+9+Lgg9YWwBKfNEHIDdgGfAyuW6p2NV0=
 github.com/kubernetes-csi/external-snapshotter/client/v4 v4.0.0/go.mod h1:YBCo4DoEeDndqvAn6eeu0vWM7QdXmHEeI9cFWplmBys=
-github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.0 h1:05QzRsnbMQ1Ymg/7iJj1k6RVw+rgelB60Ud5j4GgmGM=
-github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.0/go.mod h1:tnHiLn3P10N95fjn7O40QH5ovN0EFGAxqdTpUMrX6bU=
+github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.1 h1:OqBS3UAo3eGWplYXoMLaWnx/7Zj5Ogh0VO/FuVOL+/o=
+github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.1/go.mod h1:tnHiLn3P10N95fjn7O40QH5ovN0EFGAxqdTpUMrX6bU=
 github.com/layeh/radius v0.0.0-20190322222518-890bc1058917/go.mod h1:fywZKyu//X7iRzaxLgPWsvc0L26IUpVvE/aeIL2JtIQ=
 github.com/lib/pq v1.2.0 h1:LXpIM/LZ5xGFhOpXAQUIMM1HdyqzVYM13zNdjCEEcA0=
 github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
diff --git a/vendor/modules.txt b/vendor/modules.txt
index d935b3a9b..292b1baea 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -352,7 +352,7 @@ github.com/kubernetes-csi/csi-lib-utils/connection
 github.com/kubernetes-csi/csi-lib-utils/metrics
 github.com/kubernetes-csi/csi-lib-utils/protosanitizer
 github.com/kubernetes-csi/csi-lib-utils/rpc
-# github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.0
+# github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.1
 ## explicit; go 1.17
 github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1
 github.com/kubernetes-csi/external-snapshotter/client/v6/clientset/versioned/scheme
From 3acaa018dbb971df40b29a848eba1ca0c0420299 Mon Sep 17 00:00:00 2001
From: Madhu Rajanna
Date: Wed, 22 Jun 2022 11:18:10 +0530
Subject: [PATCH 04/12] rbd: issue resync only if the force flag is set

During failover we demote the volume on the primary while the image
has not yet been promoted on the remote cluster. In that window RBD
reports spurious split-brain errors, and the Ceph-CSI resync would
attempt to resync from the "known" secondary, causing data loss.

Signed-off-by: Madhu Rajanna
---
 internal/rbd/replicationcontrollerserver.go | 38 +++++++++++++++------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go
index f84b0c7a1..ca6935ce0 100644
--- a/internal/rbd/replicationcontrollerserver.go
+++ b/internal/rbd/replicationcontrollerserver.go
@@ -856,18 +856,11 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context,
 		ready = checkRemoteSiteStatus(ctx, mirrorStatus)
 	}
 
-	if resyncRequired(localStatus) {
-		err = rbdVol.resyncImage()
-		if err != nil {
-			log.ErrorLog(ctx, err.Error())
+	err = resyncVolume(localStatus, rbdVol, req.Force)
+	if err != nil {
+		log.ErrorLog(ctx, err.Error())
 
-			return nil, status.Error(codes.Internal, err.Error())
-		}
-
-		// If we issued a resync, return a non-final error as image needs to be recreated
-		// locally. Caller retries till RBD syncs an initial version of the image to
-		// report its status in the resync request.
-		return nil, status.Error(codes.Unavailable, "awaiting initial resync due to split brain")
+		return nil, err
 	}
 
 	err = checkVolumeResyncStatus(localStatus)
@@ -887,6 +880,29 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context,
 	return resp, nil
 }
 
+func resyncVolume(localStatus librbd.SiteMirrorImageStatus, rbdVol *rbdVolume, force bool) error {
+	if resyncRequired(localStatus) {
+		// If the force option is not set return the error message to retry
+		// with Force option.
+		if !force {
+			return status.Errorf(codes.FailedPrecondition,
+				"image is in %q state, description (%s). Force resync to recover volume",
+				localStatus.State, localStatus.Description)
+		}
+		err := rbdVol.resyncImage()
+		if err != nil {
+			return status.Error(codes.Internal, err.Error())
+		}
+
+		// If we issued a resync, return a non-final error as image needs to be recreated
+		// locally. Caller retries till RBD syncs an initial version of the image to
+		// report its status in the resync request.
+		return status.Error(codes.Unavailable, "awaiting initial resync due to split brain")
+	}
+
+	return nil
+}
+
 func checkVolumeResyncStatus(localStatus librbd.SiteMirrorImageStatus) error {
 	// we are considering 2 states to check resync started and resync completed
 	// as below. all other states will be considered as an error state so that
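[The new resyncVolume() above yields three distinct outcomes: nil when no resync is required, codes.FailedPrecondition when split brain is detected but force is not set, and codes.Unavailable (a non-final error) once a resync has been issued. A minimal sketch of how a caller could drive that contract, assuming a hypothetical resyncer wrapper around the ResyncVolume RPC (the interface and function below are illustrative, not part of the patch):]

```go
package resyncexample

import (
	"context"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// resyncer is an assumed abstraction over the ResyncVolume RPC; only the
// status-code handling below mirrors the patch.
type resyncer interface {
	ResyncVolume(ctx context.Context, volID string, force bool) error
}

// driveResync reacts to the three outcomes of the resyncVolume() contract.
func driveResync(ctx context.Context, c resyncer, volID string) error {
	err := c.ResyncVolume(ctx, volID, false)
	switch status.Code(err) {
	case codes.OK:
		// image is healthy, no resync was required
		return nil
	case codes.FailedPrecondition:
		// split brain detected: a destructive resync needs an explicit
		// opt-in, so retry with the force flag set
		return c.ResyncVolume(ctx, volID, true)
	case codes.Unavailable:
		// non-final: a resync was issued; the caller retries until RBD
		// reports an initial sync of the recreated image
		return err
	default:
		return err
	}
}
```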
From 05ccb31a455d3250266afa45c276c32d4f0fb3e9 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 20 Jun 2022 20:32:11 +0000
Subject: [PATCH 05/12] rebase: bump actions/dependency-review-action from 1 to 2

Bumps [actions/dependency-review-action](https://github.com/actions/dependency-review-action) from 1 to 2.
- [Release notes](https://github.com/actions/dependency-review-action/releases)
- [Commits](https://github.com/actions/dependency-review-action/compare/v1...v2)

---
updated-dependencies:
- dependency-name: actions/dependency-review-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot]
---
 .github/workflows/dependency-review.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/dependency-review.yaml b/.github/workflows/dependency-review.yaml
index e05efaffd..09bce8cad 100644
--- a/.github/workflows/dependency-review.yaml
+++ b/.github/workflows/dependency-review.yaml
@@ -17,4 +17,4 @@ jobs:
     - name: 'Checkout Repository'
       uses: actions/checkout@v3
     - name: 'Dependency Review'
-      uses: actions/dependency-review-action@v1
+      uses: actions/dependency-review-action@v2

From 1da446d2f2496e3e44c94cdfeb531b94c44dee97 Mon Sep 17 00:00:00 2001
From: Prasanna Kumar Kalever
Date: Thu, 23 Jun 2022 11:26:10 +0530
Subject: [PATCH 06/12] rbd: healer detect Kubernetes version for right
 StagingTargetPath

Kubernetes 1.24 and newer use a different path for staging the volume.
That means the CSI driver is requested to mount the volume at a
different location than on previous versions of Kubernetes. CSI drivers
implementing the volumeHealer must receive the correct path; otherwise,
after a nodeplugin restart, the NBD mounts will bail out when attempting
the NodeStageVolume() call and return an error.

See-also: kubernetes/kubernetes#107065
Fixes: #3176
Signed-off-by: Prasanna Kumar Kalever
---
 cmd/cephcsi.go             |  2 +-
 internal/rbd/rbd_healer.go | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/cmd/cephcsi.go b/cmd/cephcsi.go
index 480215de2..db5239fe9 100644
--- a/cmd/cephcsi.go
+++ b/cmd/cephcsi.go
@@ -54,7 +54,7 @@ const (
 	defaultNS = "default"
 
 	defaultPluginPath  = "/var/lib/kubelet/plugins"
-	defaultStagingPath = defaultPluginPath + "/kubernetes.io/csi/pv/"
+	defaultStagingPath = defaultPluginPath + "/kubernetes.io/csi/"
 )
 
 var conf util.Config
diff --git a/internal/rbd/rbd_healer.go b/internal/rbd/rbd_healer.go
index 91a54535b..3014f7ffc 100644
--- a/internal/rbd/rbd_healer.go
+++ b/internal/rbd/rbd_healer.go
@@ -18,6 +18,9 @@ package rbd
 
 import (
 	"context"
+	"crypto/sha256"
+	"fmt"
+	"path/filepath"
 	"sync"
 
 	"github.com/ceph/ceph-csi/internal/util"
@@ -70,11 +73,39 @@ func getSecret(c *k8s.Clientset, ns, name string) (map[string]string, error) {
 	return deviceSecret, nil
 }
 
+// formatStagingTargetPath returns the path where the volume is expected to be
+// mounted (or the block-device is attached/mapped). Different Kubernetes
+// version use different paths.
+func formatStagingTargetPath(c *k8s.Clientset, pv *v1.PersistentVolume, stagingPath string) (string, error) {
+	// Kubernetes 1.24+ uses a hash of the volume-id in the path name
+	unique := sha256.Sum256([]byte(pv.Spec.CSI.VolumeHandle))
+	targetPath := filepath.Join(stagingPath, pv.Spec.CSI.Driver, fmt.Sprintf("%x", unique), "globalmount")
+
+	major, minor, err := kubeclient.GetServerVersion(c)
+	if err != nil {
+		return "", fmt.Errorf("failed to get server version: %w", err)
+	}
+
+	// 'encode' major/minor in a single integer
+	legacyVersion := 1024 // Kubernetes 1.24 => 1 * 1000 + 24
+	if ((major * 1000) + minor) < (legacyVersion) {
+		// path in Kubernetes < 1.24
+		targetPath = filepath.Join(stagingPath, "pv", pv.Name, "globalmount")
+	}
+
+	return targetPath, nil
+}
+
 func callNodeStageVolume(ns *NodeServer, c *k8s.Clientset, pv *v1.PersistentVolume, stagingPath string) error {
 	publishContext := make(map[string]string)
 
 	volID := pv.Spec.PersistentVolumeSource.CSI.VolumeHandle
-	stagingParentPath := stagingPath + pv.Name + "/globalmount"
+	stagingParentPath, err := formatStagingTargetPath(c, pv, stagingPath)
+	if err != nil {
+		log.ErrorLogMsg("formatStagingTargetPath failed volID: %s, err: %v", volID, err)
+
+		return err
+	}
 
 	log.DefaultLog("sending nodeStageVolume for volID: %s, stagingPath: %s",
 		volID, stagingParentPath)

From d3650ae8638ef6adff7249e97e06e6de04c8f989 Mon Sep 17 00:00:00 2001
From: Prasanna Kumar Kalever
Date: Thu, 23 Jun 2022 11:31:44 +0530
Subject: [PATCH 07/12] deploy: fix the staging path accordingly in the
 templates

Signed-off-by: Prasanna Kumar Kalever
---
 charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml | 2 +-
 deploy/rbd/kubernetes/csi-rbdplugin.yaml                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml b/charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml
index decd6e78c..aad875423 100644
--- a/charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml
+++ b/charts/ceph-csi-rbd/templates/nodeplugin-daemonset.yaml
@@ -67,7 +67,7 @@ spec:
           args:
             - "--nodeid=$(NODE_ID)"
             - "--pluginpath={{ .Values.kubeletDir }}/plugins"
-            - "--stagingpath={{ .Values.kubeletDir }}/plugins/kubernetes.io/csi/pv/"
+            - "--stagingpath={{ .Values.kubeletDir }}/plugins/kubernetes.io/csi/"
             - "--type=rbd"
             - "--nodeserver=true"
             - "--pidlimit=-1"
diff --git a/deploy/rbd/kubernetes/csi-rbdplugin.yaml b/deploy/rbd/kubernetes/csi-rbdplugin.yaml
index 68f8fa0ad..b9c894cee 100644
--- a/deploy/rbd/kubernetes/csi-rbdplugin.yaml
+++ b/deploy/rbd/kubernetes/csi-rbdplugin.yaml
@@ -55,7 +55,7 @@ spec:
         args:
           - "--nodeid=$(NODE_ID)"
           - "--pluginpath=/var/lib/kubelet/plugins"
-          - "--stagingpath=/var/lib/kubelet/plugins/kubernetes.io/csi/pv/"
+          - "--stagingpath=/var/lib/kubelet/plugins/kubernetes.io/csi/"
           - "--type=rbd"
           - "--nodeserver=true"
           - "--endpoint=$(CSI_ENDPOINT)"
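[For illustration, the two staging-path layouts handled by formatStagingTargetPath() above can be reproduced standalone. The driver name, PV name, and volume handle below are made-up sample values, not taken from the patches:]

```go
package main

import (
	"crypto/sha256"
	"fmt"
	"path/filepath"
)

func main() {
	// --stagingpath as set in the templates above
	stagingPath := "/var/lib/kubelet/plugins/kubernetes.io/csi/"
	driver := "rbd.csi.ceph.com"
	pvName := "pvc-0a1b2c3d-example"                           // sample value
	volumeHandle := "0001-0009-rook-ceph-00000001-example-vol" // sample value

	// Kubernetes < 1.24: the path is derived from the PV name.
	oldPath := filepath.Join(stagingPath, "pv", pvName, "globalmount")

	// Kubernetes 1.24+: the path embeds a SHA-256 hash of the volume handle.
	unique := sha256.Sum256([]byte(volumeHandle))
	newPath := filepath.Join(stagingPath, driver, fmt.Sprintf("%x", unique), "globalmount")

	fmt.Println(oldPath) // .../kubernetes.io/csi/pv/pvc-0a1b2c3d-example/globalmount
	fmt.Println(newPath) // .../kubernetes.io/csi/rbd.csi.ceph.com/<sha256-hex>/globalmount
}
```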
From bf9e14d2c9f161d9f3f6b8250b4b6b83fb001326 Mon Sep 17 00:00:00 2001
From: Rakshith R
Date: Fri, 24 Jun 2022 16:36:16 +0530
Subject: [PATCH 08/12] ci: retest only one pr at a time & rebase if necessary

This commit improves the retest action by rebasing the PR when it is
behind the devel branch, and by adding retests to only one PR at a
time.

refer: https://docs.github.com/en/graphql/reference/enums#mergestatestatus

Signed-off-by: Rakshith R
---
 actions/retest/main.go | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/actions/retest/main.go b/actions/retest/main.go
index f019f333f..075cea7e3 100644
--- a/actions/retest/main.go
+++ b/actions/retest/main.go
@@ -147,6 +147,19 @@ func main() {
 				log.Printf("found context %s with status %s\n", r.GetContext(), r.GetState())
 				if contains([]string{"failed", "failure"}, r.GetState()) {
 					log.Printf("found failed test %s\n", r.GetContext())
+					failedTestFound = true
+					// rebase the pr if it is behind the devel branch.
+					if *re.MergeableState == "BEHIND" {
+						comment := &github.IssueComment{
+							Body: github.String("@mergifyio rebase"),
+						}
+						_, _, err := c.client.Issues.CreateComment(context.TODO(), c.owner, c.repo, prNumber, comment)
+						if err != nil {
+							log.Printf("failed to create comment %v\n", err)
+						}
+						break
+					}
+
 					// check if retest limit is reached
 					msg := fmt.Sprintf("/retest %s", r.GetContext())
 					ok, err := c.checkRetestLimitReached(prNumber, msg)
@@ -175,7 +188,6 @@ func main() {
 						log.Printf("failed to create comment %v\n", err)
 						continue
 					}
-					failedTestFound = true
 				}
 			}
 
@@ -188,8 +200,10 @@ func main() {
 			_, _, err = c.client.Issues.CreateComment(context.TODO(), c.owner, c.repo, prNumber, comment)
 			if err != nil {
 				log.Printf("failed to create comment %q: %v\n", msg, err)
-				continue
 			}
+			// exit after adding retests to a pr once to avoid retesting multiple prs
+			// at the same time.
+			break
 		}
 	}
 }

From 34ff13984ad7853eb812d65e98ec01c1eff33309 Mon Sep 17 00:00:00 2001
From: Niels de Vos
Date: Mon, 27 Jun 2022 10:14:58 +0200
Subject: [PATCH 09/12] ci: prevent panic in retest action on `nil` strings

In case a PullRequest does not have a MergeableState set, it will be
`nil`. Dereferencing the pointer will cause a Go panic, and the action
won't work as intended.

Signed-off-by: Niels de Vos
---
 actions/retest/main.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/actions/retest/main.go b/actions/retest/main.go
index 075cea7e3..26ef0b610 100644
--- a/actions/retest/main.go
+++ b/actions/retest/main.go
@@ -116,7 +116,7 @@ func main() {
 		log.Fatalf("failed to list pull requests %v\n", err)
 	}
 	for _, re := range req {
-		if *re.State == "open" {
+		if (re.State != nil) && (*re.State == "open") {
 			prNumber := re.GetNumber()
 			log.Printf("PR with ID %d with Title %q is open\n", prNumber, re.GetTitle())
 			for _, l := range re.Labels {
@@ -149,7 +149,7 @@ func main() {
 					log.Printf("found failed test %s\n", r.GetContext())
 					failedTestFound = true
 					// rebase the pr if it is behind the devel branch.
-					if *re.MergeableState == "BEHIND" {
+					if (re.MergeableState != nil) && (*re.MergeableState == "BEHIND") {
 						comment := &github.IssueComment{
 							Body: github.String("@mergifyio rebase"),
 						}
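[The panic this patch prevents is a generic hazard with go-github's pointer-typed fields. A self-contained sketch of the guard pattern follows; the struct below only mimics the shape of go-github's PullRequest, it is not the real type:]

```go
package main

import "fmt"

// pullRequest mimics go-github's PullRequest, whose string fields are
// pointers so that "unset" can be told apart from an empty string.
type pullRequest struct {
	MergeableState *string
}

// mergeableState guards the pointer before dereferencing, matching the
// `(re.MergeableState != nil) && (*re.MergeableState == "BEHIND")` check
// in the patch.
func mergeableState(pr *pullRequest) string {
	if pr == nil || pr.MergeableState == nil {
		return ""
	}
	return *pr.MergeableState
}

func main() {
	pr := &pullRequest{} // MergeableState not set by the API response
	// Writing *pr.MergeableState here would panic with a nil dereference;
	// the guarded accessor simply reports the zero value instead.
	fmt.Printf("mergeable state: %q\n", mergeableState(pr))
}
```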
From 704cb5c941f7850800286e955c2db1de409ca35b Mon Sep 17 00:00:00 2001
From: Madhu Rajanna
Date: Mon, 27 Jun 2022 18:38:36 +0530
Subject: [PATCH 10/12] revert: rbd: consider remote image health for primary

When the image is force-promoted to primary on the cluster, the remote
image might not be in the replaying state due to the split-brain state.
This reverts commit c3c87f2ef33e8d8ad08d7d9f28b59d1aedc4ef31, which was
added to check the remote image status.

Signed-off-by: Madhu Rajanna
---
 internal/rbd/replicationcontrollerserver.go | 25 ++-------------------
 1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go
index ca6935ce0..26ba62654 100644
--- a/internal/rbd/replicationcontrollerserver.go
+++ b/internal/rbd/replicationcontrollerserver.go
@@ -616,9 +616,8 @@ func (rs *ReplicationServer) PromoteVolume(ctx context.Context,
 }
 
 // checkHealthyPrimary checks if the image is a healhty primary or not.
-// healthy primary image will be in up+stopped state in local cluster and
-// up+replaying in the remote clusters, for states other than this it returns
-// an error message.
+// healthy primary image will be in up+stopped state, for states other
+// than this it returns an error message.
 func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume) error {
 	mirrorStatus, err := rbdVol.getImageMirroringStatus()
 	if err != nil {
@@ -642,26 +641,6 @@ func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume) error {
 			localStatus.State)
 	}
 
-	// Remote image should be in up+replaying state.
-	for _, s := range mirrorStatus.SiteStatuses {
-		log.UsefulLog(
-			ctx,
-			"peer site mirrorUUID=%q, daemon up=%t, mirroring state=%q, description=%q and lastUpdate=%d",
-			s.MirrorUUID,
-			s.Up,
-			s.State,
-			s.Description,
-			s.LastUpdate)
-		if s.MirrorUUID != "" {
-			if !s.Up && s.State != librbd.MirrorImageStatusStateReplaying {
-				return fmt.Errorf("remote image %s is not healthy. State is up=%t, state=%q",
-					rbdVol,
-					s.Up,
-					s.State)
-			}
-		}
-	}
-
 	return nil
 }

From 53e76fab692750bb74e630ab25bb7052b366b420 Mon Sep 17 00:00:00 2001
From: Madhu Rajanna
Date: Tue, 28 Jun 2022 16:17:02 +0530
Subject: [PATCH 11/12] rbd: fix checkHealthyPrimary to consider up+stopped state

The image needs to be in the up+stopped state, i.e. both conditions
must hold, not just one of them. The unhealthy branch therefore needs
an OR check, not an AND check.

Signed-off-by: Madhu Rajanna
---
 internal/rbd/replicationcontrollerserver.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go
index 26ba62654..c940c019c 100644
--- a/internal/rbd/replicationcontrollerserver.go
+++ b/internal/rbd/replicationcontrollerserver.go
@@ -633,7 +633,7 @@ func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume) error {
 		return fmt.Errorf("failed to get local status: %w", err)
 	}
 
-	if !localStatus.Up && localStatus.State != librbd.MirrorImageStatusStateStopped {
+	if !localStatus.Up || localStatus.State != librbd.MirrorImageStatusStateStopped {
 		return fmt.Errorf("%s %w. State is up=%t, state=%q",
 			rbdVol,
 			ErrUnHealthyMirroredImage,

From 8a47904e8f1a22496b8af79d22a4210a7b06aad8 Mon Sep 17 00:00:00 2001
From: Madhu Rajanna
Date: Tue, 28 Jun 2022 13:52:05 +0530
Subject: [PATCH 12/12] rbd: add unit test for checkHealthyPrimary

Removed the code in checkHealthyPrimary that makes the Ceph call;
the mirror status is now passed in as input.
Added unit test for the checkHealthyPrimary function.

Signed-off-by: Madhu Rajanna
---
 internal/rbd/replicationcontrollerserver.go   | 13 ++--
 .../rbd/replicationcontrollerserver_test.go   | 72 +++++++++++++++++++
 2 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go
index c940c019c..117885345 100644
--- a/internal/rbd/replicationcontrollerserver.go
+++ b/internal/rbd/replicationcontrollerserver.go
@@ -581,7 +581,12 @@ func (rs *ReplicationServer) PromoteVolume(ctx context.Context,
 		}
 	}
 
-	err = checkHealthyPrimary(ctx, rbdVol)
+	mirrorStatus, err := rbdVol.getImageMirroringStatus()
+	if err != nil {
+		return nil, status.Error(codes.Internal, err.Error())
+	}
+
+	err = checkHealthyPrimary(ctx, rbdVol, mirrorStatus)
 	if err != nil {
 		return nil, status.Error(codes.Internal, err.Error())
 	}
@@ -618,11 +623,7 @@ func (rs *ReplicationServer) PromoteVolume(ctx context.Context,
 // checkHealthyPrimary checks if the image is a healhty primary or not.
 // healthy primary image will be in up+stopped state, for states other
 // than this it returns an error message.
-func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume) error {
-	mirrorStatus, err := rbdVol.getImageMirroringStatus()
-	if err != nil {
-		return err
-	}
+func checkHealthyPrimary(ctx context.Context, rbdVol *rbdVolume, mirrorStatus *librbd.GlobalMirrorImageStatus) error {
 	localStatus, err := mirrorStatus.LocalStatus()
 	if err != nil {
 		// LocalStatus can fail if the local site status is not found in
diff --git a/internal/rbd/replicationcontrollerserver_test.go b/internal/rbd/replicationcontrollerserver_test.go
index 401292587..87f60a38f 100644
--- a/internal/rbd/replicationcontrollerserver_test.go
+++ b/internal/rbd/replicationcontrollerserver_test.go
@@ -279,3 +279,75 @@ func TestCheckVolumeResyncStatus(t *testing.T) {
 		})
 	}
 }
+
+func Test_checkHealthyPrimary(t *testing.T) {
+	t.Parallel()
+	rbdVol := &rbdVolume{
+		rbdImage: rbdImage{
+			RbdImageName: "test",
+			Pool:         "test-pool",
+		},
+	}
+	tests := []struct {
+		name         string
+		ctx          context.Context
+		rbdVol       *rbdVolume
+		mirrorStatus *librbd.GlobalMirrorImageStatus
+		wantErr      bool
+	}{
+		{
+			name:   "when image is in up+stopped state",
+			ctx:    context.TODO(),
+			rbdVol: rbdVol,
+			mirrorStatus: &librbd.GlobalMirrorImageStatus{
+				SiteStatuses: []librbd.SiteMirrorImageStatus{
+					{
+						MirrorUUID: "",
+						State:      librbd.MirrorImageStatusStateStopped,
+						Up:         true,
+					},
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:   "when image is in up+error state",
+			ctx:    context.TODO(),
+			rbdVol: rbdVol,
+			mirrorStatus: &librbd.GlobalMirrorImageStatus{
+				SiteStatuses: []librbd.SiteMirrorImageStatus{
+					{
+						MirrorUUID: "",
+						State:      librbd.MirrorImageStatusStateError,
+						Up:         false,
+					},
+				},
+			},
+			wantErr: true,
+		},
+		{
+			name:   "when image is in up+replaying state",
+			ctx:    context.TODO(),
+			rbdVol: rbdVol,
+			mirrorStatus: &librbd.GlobalMirrorImageStatus{
+				SiteStatuses: []librbd.SiteMirrorImageStatus{
+					{
+						MirrorUUID: "",
+						State:      librbd.MirrorImageStatusStateReplaying,
+						Up:         true,
+					},
+				},
+			},
+			wantErr: true,
+		},
+	}
+	for _, tt := range tests {
+		ts := tt
+		t.Run(ts.name, func(t *testing.T) {
+			t.Parallel()
+			if err := checkHealthyPrimary(ts.ctx, ts.rbdVol, ts.mirrorStatus); (err != nil) != ts.wantErr {
+				t.Errorf("checkHealthyPrimary() error = %v, wantErr %v", err, ts.wantErr)
+			}
+		})
+	}
+}
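[A closing note on patches 10-12: the corrected predicate follows from De Morgan's law. A healthy primary must be up AND stopped, so the error branch fires when the image is not up OR not stopped. The truth table can be checked standalone with plain Go, independent of librbd; the string states below stand in for the librbd constants:]

```go
package main

import "fmt"

// healthyPrimary restates the fixed condition from patch 11: the error
// branch uses !up || state != "stopped", i.e. the negation of
// (up && state == "stopped"), not the && used before the fix.
func healthyPrimary(up bool, state string) bool {
	return up && state == "stopped"
}

func main() {
	cases := []struct {
		up    bool
		state string
	}{
		{true, "stopped"},   // healthy: mirrors the first unit-test case
		{false, "error"},    // unhealthy: mirrors the second case
		{true, "replaying"}, // unhealthy: mirrors the third case
		{false, "stopped"},  // unhealthy: stopped, but the daemon is not up
	}
	for _, c := range cases {
		fmt.Printf("up=%-5v state=%-9s healthy=%v\n", c.up, c.state, healthyPrimary(c.up, c.state))
	}
}
```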