From ca12592d5cb01716a64ecbdb0c7681ec4c213026 Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Wed, 12 Oct 2022 09:21:16 +0200 Subject: [PATCH 1/7] doc: remove nfs daemonset deletion As we dont need to delete the nfs daemonset which was present in 3.6.x release in 3.8.x release as user will upgrade from 3.6.x to 3.7.x and delete the nfs daemonset. fixes #3324 Signed-off-by: Madhu Rajanna --- docs/ceph-csi-upgrade.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/ceph-csi-upgrade.md b/docs/ceph-csi-upgrade.md index ab4105ce8..c0ccef647 100644 --- a/docs/ceph-csi-upgrade.md +++ b/docs/ceph-csi-upgrade.md @@ -33,7 +33,6 @@ - [6. Upgrade NFS Nodeplugin resources](#6-upgrade-nfs-nodeplugin-resources) - [6.1 Update the NFS Nodeplugin RBAC](#61-update-the-nfs-nodeplugin-rbac) - [6.2 Update the NFS Nodeplugin daemonset](#62-update-the-nfs-nodeplugin-daemonset) - - [6.3 Delete the old NFS Nodeplugin daemonset](#63-delete-the-old-nfs-nodeplugin-daemonset) - [CSI Sidecar containers consideration](#csi-sidecar-containers-consideration) ## Pre-upgrade considerations @@ -392,13 +391,6 @@ daemonset.apps/csi-nfsplugin configured service/csi-metrics-nfsplugin configured ``` -##### 6.3 Delete the old NFS Nodeplugin daemonset - -```bash -$ kubectl delete daemonsets.apps csi-nfs-node -daemonset.apps "csi-nfs-node" deleted -``` - we have successfully upgraded nfs csi from v3.6 to v3.7 ### CSI Sidecar containers consideration From 0cba72485cb0fdf709a43e18d665a79b0e790e9f Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Fri, 7 Oct 2022 17:50:27 +0200 Subject: [PATCH 2/7] ci: add support for VM_DRIVER=podman to scripts/minikube.sh When running on AWE EC2 virtual-machines, we'll use Podman instead of installing a VM. The "none" driver might work as well, but it requires additional dependencies to be installed, which may change over time with new minikube or Kubernetes releases. Hopefully the Podman driver is less affected with changes in dependencies. Depends-on: #3419 Closes: #3415 Signed-off-by: Niels de Vos --- scripts/minikube.sh | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/scripts/minikube.sh b/scripts/minikube.sh index b73faf054..1def9a369 100755 --- a/scripts/minikube.sh +++ b/scripts/minikube.sh @@ -22,7 +22,7 @@ function copy_image_to_cluster() { if [ -z "$(${CONTAINER_CMD} images -q "${build_image}")" ]; then ${CONTAINER_CMD} pull "${build_image}" fi - if [[ "${VM_DRIVER}" == "none" ]]; then + if [[ "${VM_DRIVER}" == "none" ]] || [[ "${VM_DRIVER}" == "podman" ]]; then ${CONTAINER_CMD} tag "${build_image}" "${final_image}" return fi @@ -139,6 +139,36 @@ function validate_sidecar() { done } +# install_podman_wrapper creates /usr/bin/podman.wrapper which adds /sys +# filesystem mount points when a privileged container is started. This makes it +# possible to map RBD devices in the container that minikube creates when +# VM_DRIVER=podman is used. +function install_podman_wrapper() { + if [[ -e /usr/bin/podman.wrapper ]] + then + return + fi + + # disabled single quoted check, the script should be created as is + # shellcheck disable=SC2016 + echo '#!/bin/sh +if [[ "${1}" = run ]] +then + if (echo "${@}" | grep -q privileged) + then + shift + exec /usr/bin/podman.real run -v /sys:/sys:rw -v /dev:/dev:rw --systemd=true "${@}" + fi +fi + +exec /usr/bin/podman.real "${@}" +' > /usr/bin/podman.wrapper + chmod +x /usr/bin/podman.wrapper + + mv /usr/bin/podman /usr/bin/podman.real + ln -s podman.wrapper /usr/bin/podman +} + # Storage providers and the default storage class is not needed for Ceph-CSI # testing. In order to reduce resources and potential conflicts between storage # plugins, disable them. @@ -185,7 +215,7 @@ K8S_FEATURE_GATES=${K8S_FEATURE_GATES:-""} # kubelet.resolv-conf needs to point to a file, not a symlink # the default minikube VM has /etc/resolv.conf -> /run/systemd/resolve/resolv.conf RESOLV_CONF='/run/systemd/resolve/resolv.conf' -if [[ "${VM_DRIVER}" == "none" ]] && [[ ! -e "${RESOLV_CONF}" ]]; then +if { [[ "${VM_DRIVER}" == "none" ]] || [[ "${VM_DRIVER}" == "podman" ]]; } && [[ ! -e "${RESOLV_CONF}" ]]; then # in case /run/systemd/resolve/resolv.conf does not exist, use the # standard /etc/resolv.conf (with symlink resolved) RESOLV_CONF="$(readlink -f /etc/resolv.conf)" @@ -216,6 +246,8 @@ up) if [[ "${VM_DRIVER}" == "none" ]]; then mkdir -p "$HOME"/.kube "$HOME"/.minikube install_kubectl + elif [[ "${VM_DRIVER}" == "podman" ]]; then + install_podman_wrapper fi disable_storage_addons @@ -234,11 +266,14 @@ up) # create a link so the default dataDirHostPath will work for this # environment - if [[ "${VM_DRIVER}" != "none" ]]; then + if [[ "${VM_DRIVER}" != "none" ]] && [[ "${VM_DRIVER}" != "podman" ]]; then wait_for_ssh # shellcheck disable=SC2086 ${minikube} ssh "sudo mkdir -p /mnt/${DISK}/var/lib/rook;sudo ln -s /mnt/${DISK}/var/lib/rook /var/lib/rook" fi + if [[ "${VM_DRIVER}" = "podman" ]]; then + ${minikube} ssh "sudo mount -oremount,rw /sys" + fi ${minikube} kubectl -- cluster-info ;; down) From 8f915576c4d70accb3ce7914827184e6f21abc5f Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Wed, 12 Oct 2022 16:42:46 +0200 Subject: [PATCH 3/7] e2e: wait for deployment before scale down/up The scale down/up functions fail often with "deployment not found" errors. Possibly deploying with Podman is slower than deploying in a minikube VM, and there is a delay for the deployment to become available. Signed-off-by: Niels de Vos --- e2e/deployment.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/e2e/deployment.go b/e2e/deployment.go index faea048fe..ea7ce66dc 100644 --- a/e2e/deployment.go +++ b/e2e/deployment.go @@ -124,6 +124,9 @@ func waitForDeploymentInAvailableState(clientSet kubernetes.Interface, name, ns if isRetryableAPIError(err) { return false, nil } + if apierrs.IsNotFound(err) { + return false, nil + } e2elog.Logf("%q deployment to be Available (%d seconds elapsed)", name, int(time.Since(start).Seconds())) return false, err @@ -390,6 +393,12 @@ func waitForContainersArgsUpdate( ) error { e2elog.Logf("waiting for deployment updates %s/%s", ns, deploymentName) + // wait for the deployment to be available + err := waitForDeploymentInAvailableState(c, deploymentName, ns, deployTimeout) + if err != nil { + return fmt.Errorf("deployment %s/%s did not become available yet: %w", ns, deploymentName, err) + } + // Scale down to 0. scale, err := c.AppsV1().Deployments(ns).GetScale(context.TODO(), deploymentName, metav1.GetOptions{}) if err != nil { From fa97875dc9b47fb1b4127c037f998ed3e1b1b67a Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Wed, 12 Oct 2022 18:23:12 +0200 Subject: [PATCH 4/7] ci: fail installing Helm if `wget` is unavailable In case `wget` is not installed, downloading the Helm release will fail. The `install-helm.sh` script won't return a fatal error in that case, and CI jobs continue running in an environment that is not ready. By adding a check that exist the script with a failure, the CI will now correctly report a problem when Helm can not be downloaded. See-also: #3430 Signed-off-by: Niels de Vos --- scripts/install-helm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install-helm.sh b/scripts/install-helm.sh index a6b1cb331..cf3a8ad44 100755 --- a/scripts/install-helm.sh +++ b/scripts/install-helm.sh @@ -136,7 +136,7 @@ install() { mkdir -p ${TEMP} # shellcheck disable=SC2021 dist=$(echo "${dist}" | tr "[A-Z]" "[a-z]") - wget "https://get.helm.sh/helm-${HELM_VERSION}-${dist}-${arch}.tar.gz" -O "${TEMP}/helm.tar.gz" + wget "https://get.helm.sh/helm-${HELM_VERSION}-${dist}-${arch}.tar.gz" -O "${TEMP}/helm.tar.gz" || exit 1 tar -C "${TEMP}" -zxvf "${TEMP}/helm.tar.gz" fi echo "Helm install successful" From 8eaf1d790d0cddbaded1251cde53e538f77ce484 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Wed, 12 Oct 2022 18:57:51 +0200 Subject: [PATCH 5/7] e2e: log failures while deleting PVC and PV There are occasions where deleting a PVC (or PV) never succeeds. The reported status of the deleted object is sometimes empty, which suggests that the PVC or PV was, in fact, deleted. To diagnose the incorrect error checking, include the errors for retrying in the logs. Signed-off-by: Niels de Vos --- e2e/pvc.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/e2e/pvc.go b/e2e/pvc.go index 41bcb4845..24d07673a 100644 --- a/e2e/pvc.go +++ b/e2e/pvc.go @@ -284,9 +284,13 @@ func deletePVCAndValidatePV(c kubernetes.Interface, pvc *v1.PersistentVolumeClai int(time.Since(start).Seconds())) pvc, err = c.CoreV1().PersistentVolumeClaims(nameSpace).Get(context.TODO(), name, metav1.GetOptions{}) if err == nil { + e2elog.Logf("PVC %s (status: %s) has not been deleted yet, rechecking...", name, pvc.Status) + return false, nil } if isRetryableAPIError(err) { + e2elog.Logf("failed to verify deletion of PVC %s (status: %s): %v", name, pvc.Status, err) + return false, nil } if !apierrs.IsNotFound(err) { @@ -294,11 +298,15 @@ func deletePVCAndValidatePV(c kubernetes.Interface, pvc *v1.PersistentVolumeClai } // Examine the pv.ClaimRef and UID. Expect nil values. - _, err = c.CoreV1().PersistentVolumes().Get(context.TODO(), pv.Name, metav1.GetOptions{}) + oldPV, err := c.CoreV1().PersistentVolumes().Get(context.TODO(), pv.Name, metav1.GetOptions{}) if err == nil { + e2elog.Logf("PV %s (status: %s) has not been deleted yet, rechecking...", pv.Name, oldPV.Status) + return false, nil } if isRetryableAPIError(err) { + e2elog.Logf("failed to verify deletion of PV %s (status: %s): %v", pv.Name, oldPV.Status, err) + return false, nil } if !apierrs.IsNotFound(err) { From 386d3ddd6e2d9b63287550a1fdb2e861c11ab9fb Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Thu, 13 Oct 2022 10:22:55 +0200 Subject: [PATCH 6/7] e2e: disable rbd-nbd tests by default Because the rbd-nbd tests fail with minikube and the Podman driver, disable the tests for the time being. Updates: #3431 Signed-off-by: Niels de Vos --- e2e/e2e_test.go | 1 + e2e/rbd.go | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ e2e/utils.go | 1 + 3 files changed, 56 insertions(+) diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 90ddd54ea..338a776b6 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -39,6 +39,7 @@ func init() { flag.BoolVar(&deployNFS, "deploy-nfs", false, "deploy nfs csi driver") flag.BoolVar(&testCephFS, "test-cephfs", true, "test cephFS csi driver") flag.BoolVar(&testRBD, "test-rbd", true, "test rbd csi driver") + flag.BoolVar(&testNBD, "test-nbd", false, "test rbd csi driver with rbd-nbd mounter") flag.BoolVar(&testNFS, "test-nfs", false, "test nfs csi driver") flag.BoolVar(&helmTest, "helm-test", false, "tests running on deployment via helm") flag.BoolVar(&upgradeTesting, "upgrade-testing", false, "perform upgrade testing") diff --git a/e2e/rbd.go b/e2e/rbd.go index 83723c09e..4383d1774 100644 --- a/e2e/rbd.go +++ b/e2e/rbd.go @@ -1047,6 +1047,12 @@ var _ = Describe("RBD", func() { }) By("create a PVC and bind it to an app using rbd-nbd mounter", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { e2elog.Failf("failed to delete storageclass: %v", err) @@ -1083,6 +1089,12 @@ var _ = Describe("RBD", func() { }) By("Resize rbd-nbd PVC and check application directory size", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + if util.CheckKernelSupport(kernelRelease, nbdResizeSupport) { err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { @@ -1290,6 +1302,12 @@ var _ = Describe("RBD", func() { By("create PVC with journaling,fast-diff image-features and bind it to an app using rbd-nbd mounter", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + if util.CheckKernelSupport(kernelRelease, fastDiffSupport) { err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { @@ -1330,6 +1348,12 @@ var _ = Describe("RBD", func() { // NOTE: RWX is restricted for FileSystem VolumeMode at ceph-csi, // see pull#261 for more details. By("Create RWX+Block Mode PVC and bind to multiple pods via deployment using rbd-nbd mounter", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { e2elog.Failf("failed to delete storageclass: %v", err) @@ -1415,6 +1439,12 @@ var _ = Describe("RBD", func() { }) By("Create ROX+FS Mode PVC and bind to multiple pods via deployment using rbd-nbd mounter", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { e2elog.Failf("failed to delete storageclass: %v", err) @@ -1540,6 +1570,12 @@ var _ = Describe("RBD", func() { }) By("Create ROX+Block Mode PVC and bind to multiple pods via deployment using rbd-nbd mounter", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { e2elog.Failf("failed to delete storageclass: %v", err) @@ -1666,6 +1702,12 @@ var _ = Describe("RBD", func() { }) By("perform IO on rbd-nbd volume after nodeplugin restart", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { e2elog.Failf("failed to delete storageclass: %v", err) @@ -1830,6 +1872,12 @@ var _ = Describe("RBD", func() { }) By("create a PVC and bind it to an app using rbd-nbd mounter with encryption", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { e2elog.Failf("failed to delete storageclass: %v", err) @@ -2199,6 +2247,12 @@ var _ = Describe("RBD", func() { By( "create a PVC and Bind it to an app with journaling/exclusive-lock image-features and rbd-nbd mounter", func() { + if !testNBD { + e2elog.Logf("skipping NBD test") + + return + } + err := deleteResource(rbdExamplePath + "storageclass.yaml") if err != nil { e2elog.Failf("failed to delete storageclass: %v", err) diff --git a/e2e/utils.go b/e2e/utils.go index 0098a6528..58ed04efa 100644 --- a/e2e/utils.go +++ b/e2e/utils.go @@ -85,6 +85,7 @@ var ( deployNFS bool testCephFS bool testRBD bool + testNBD bool testNFS bool helmTest bool upgradeTesting bool From b7703faf37f5905f8e1c83f0c15a01df6cdbb181 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Mon, 3 Oct 2022 17:46:52 +0200 Subject: [PATCH 7/7] util: make inode metrics optional in FilesystemNodeGetVolumeStats() CephFS does not have a concept of "free inodes", inodes get allocated on-demand in the filesystem. This confuses alerting managers that expect a (high) number of free inodes, and warnings get produced if the number of free inodes is not high enough. This causes alerts to always get reported for CephFS. To prevent the false-positive alerts from happening, the NodeGetVolumeStats procedure for CephFS (and CephNFS) will not contain inodes in the reply anymore. See-also: https://bugzilla.redhat.com/2128263 Signed-off-by: Niels de Vos --- internal/cephfs/nodeserver.go | 2 +- internal/csi-common/utils.go | 53 +++++++++++++++------------ internal/csi-common/utils_test.go | 2 +- internal/nfs/nodeserver/nodeserver.go | 2 +- internal/rbd/nodeserver.go | 2 +- 5 files changed, 34 insertions(+), 27 deletions(-) diff --git a/internal/cephfs/nodeserver.go b/internal/cephfs/nodeserver.go index cb540541f..629467d67 100644 --- a/internal/cephfs/nodeserver.go +++ b/internal/cephfs/nodeserver.go @@ -637,7 +637,7 @@ func (ns *NodeServer) NodeGetVolumeStats( } if stat.Mode().IsDir() { - return csicommon.FilesystemNodeGetVolumeStats(ctx, ns.Mounter, targetPath) + return csicommon.FilesystemNodeGetVolumeStats(ctx, ns.Mounter, targetPath, false) } return nil, status.Errorf(codes.InvalidArgument, "targetpath %q is not a directory or device", targetPath) diff --git a/internal/csi-common/utils.go b/internal/csi-common/utils.go index feb2386f8..a2eaa741f 100644 --- a/internal/csi-common/utils.go +++ b/internal/csi-common/utils.go @@ -241,6 +241,7 @@ func FilesystemNodeGetVolumeStats( ctx context.Context, mounter mount.Interface, targetPath string, + includeInodes bool, ) (*csi.NodeGetVolumeStatsResponse, error) { isMnt, err := util.IsMountPoint(mounter, targetPath) if err != nil { @@ -274,23 +275,8 @@ func FilesystemNodeGetVolumeStats( if !ok { log.ErrorLog(ctx, "failed to fetch used bytes") } - inodes, ok := (*(volMetrics.Inodes)).AsInt64() - if !ok { - log.ErrorLog(ctx, "failed to fetch available inodes") - return nil, status.Error(codes.Unknown, "failed to fetch available inodes") - } - inodesFree, ok := (*(volMetrics.InodesFree)).AsInt64() - if !ok { - log.ErrorLog(ctx, "failed to fetch free inodes") - } - - inodesUsed, ok := (*(volMetrics.InodesUsed)).AsInt64() - if !ok { - log.ErrorLog(ctx, "failed to fetch used inodes") - } - - return &csi.NodeGetVolumeStatsResponse{ + res := &csi.NodeGetVolumeStatsResponse{ Usage: []*csi.VolumeUsage{ { Available: requirePositive(available), @@ -298,14 +284,35 @@ func FilesystemNodeGetVolumeStats( Used: requirePositive(used), Unit: csi.VolumeUsage_BYTES, }, - { - Available: requirePositive(inodesFree), - Total: requirePositive(inodes), - Used: requirePositive(inodesUsed), - Unit: csi.VolumeUsage_INODES, - }, }, - }, nil + } + + if includeInodes { + inodes, ok := (*(volMetrics.Inodes)).AsInt64() + if !ok { + log.ErrorLog(ctx, "failed to fetch available inodes") + + return nil, status.Error(codes.Unknown, "failed to fetch available inodes") + } + inodesFree, ok := (*(volMetrics.InodesFree)).AsInt64() + if !ok { + log.ErrorLog(ctx, "failed to fetch free inodes") + } + + inodesUsed, ok := (*(volMetrics.InodesUsed)).AsInt64() + if !ok { + log.ErrorLog(ctx, "failed to fetch used inodes") + } + + res.Usage = append(res.Usage, &csi.VolumeUsage{ + Available: requirePositive(inodesFree), + Total: requirePositive(inodes), + Used: requirePositive(inodesUsed), + Unit: csi.VolumeUsage_INODES, + }) + } + + return res, nil } // requirePositive returns the value for `x` when it is greater or equal to 0, diff --git a/internal/csi-common/utils_test.go b/internal/csi-common/utils_test.go index 42958f74d..e5687986a 100644 --- a/internal/csi-common/utils_test.go +++ b/internal/csi-common/utils_test.go @@ -88,7 +88,7 @@ func TestFilesystemNodeGetVolumeStats(t *testing.T) { // retry until a mountpoint is found for { - stats, err := FilesystemNodeGetVolumeStats(context.TODO(), mount.New(""), cwd) + stats, err := FilesystemNodeGetVolumeStats(context.TODO(), mount.New(""), cwd, true) if err != nil && cwd != "/" && strings.HasSuffix(err.Error(), "is not mounted") { // try again with the parent directory cwd = filepath.Dir(cwd) diff --git a/internal/nfs/nodeserver/nodeserver.go b/internal/nfs/nodeserver/nodeserver.go index cb0c70275..c4e7ca845 100644 --- a/internal/nfs/nodeserver/nodeserver.go +++ b/internal/nfs/nodeserver/nodeserver.go @@ -182,7 +182,7 @@ func (ns *NodeServer) NodeGetVolumeStats( } if stat.Mode().IsDir() { - return csicommon.FilesystemNodeGetVolumeStats(ctx, ns.Mounter, targetPath) + return csicommon.FilesystemNodeGetVolumeStats(ctx, ns.Mounter, targetPath, false) } return nil, status.Errorf(codes.InvalidArgument, diff --git a/internal/rbd/nodeserver.go b/internal/rbd/nodeserver.go index 9c863920c..9b8c17fa3 100644 --- a/internal/rbd/nodeserver.go +++ b/internal/rbd/nodeserver.go @@ -1240,7 +1240,7 @@ func (ns *NodeServer) NodeGetVolumeStats( } if stat.Mode().IsDir() { - return csicommon.FilesystemNodeGetVolumeStats(ctx, ns.Mounter, targetPath) + return csicommon.FilesystemNodeGetVolumeStats(ctx, ns.Mounter, targetPath, true) } else if (stat.Mode() & os.ModeDevice) == os.ModeDevice { return blockNodeGetVolumeStats(ctx, targetPath) }