From 9b518726abc7bf46cdbf0f092eb87665f81fc3d4 Mon Sep 17 00:00:00 2001
From: Madhu Rajanna <madhupr007@gmail.com>
Date: Wed, 24 Jun 2020 12:14:02 +0530
Subject: [PATCH] rbd: add hardlimit and softlimit flags

Add hard-limit and soft-limit flags to the cephcsi
command-line arguments. When the soft limit is reached,
cephcsi starts a background task to flatten the rbd
image and returns success. When the hard limit is
reached, it starts a background task to flatten the
rbd image and returns ready-to-use as false, to make
sure that the image is not used until it has been
flattened.

Signed-off-by: Madhu Rajanna <madhupr007@gmail.com>
---
 .../templates/provisioner-deployment.yaml     |  2 ++
 charts/ceph-csi-rbd/values.yaml               |  6 ++++
 cmd/cephcsi.go                                | 14 ++++++++
 .../kubernetes/csi-rbdplugin-provisioner.yaml |  2 ++
 docs/deploy-rbd.md                            | 36 ++++++++++---------
 internal/rbd/driver.go                        |  8 +++++
 internal/util/util.go                         |  5 +++
 7 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/charts/ceph-csi-rbd/templates/provisioner-deployment.yaml b/charts/ceph-csi-rbd/templates/provisioner-deployment.yaml
index 4442158d0..daf4bfb6d 100644
--- a/charts/ceph-csi-rbd/templates/provisioner-deployment.yaml
+++ b/charts/ceph-csi-rbd/templates/provisioner-deployment.yaml
@@ -114,6 +114,8 @@ spec:
             - "--endpoint=$(CSI_ENDPOINT)"
             - "--v=5"
             - "--drivername=$(DRIVER_NAME)"
+            - "--rbdhardmaxclonedepth={{ .Values.provisioner.hardMaxCloneDepth }}"
+            - "--rbdsoftmaxclonedepth={{ .Values.provisioner.softMaxCloneDepth }}"
           env:
             - name: POD_IP
               valueFrom:
diff --git a/charts/ceph-csi-rbd/values.yaml b/charts/ceph-csi-rbd/values.yaml
index 6c5d2e8b6..7265820a0 100644
--- a/charts/ceph-csi-rbd/values.yaml
+++ b/charts/ceph-csi-rbd/values.yaml
@@ -107,6 +107,12 @@ provisioner:
   replicaCount: 3
   # Timeout for waiting for creation or deletion of a volume
   timeout: 60s
+  # Hard limit for maximum number of nested volume clones that are taken before
+  # a flatten occurs
+  hardMaxCloneDepth: 8
+  # Soft limit for maximum number of nested volume clones that are taken before
+  # a flatten occurs
+  softMaxCloneDepth: 4
 
   httpMetrics:
     # Metrics only available for cephcsi/cephcsi => 1.2.0
diff --git a/cmd/cephcsi.go b/cmd/cephcsi.go
index 005e4f04e..acae741c6 100644
--- a/cmd/cephcsi.go
+++ b/cmd/cephcsi.go
@@ -77,6 +77,8 @@ func init() {
 	flag.StringVar(&conf.HistogramOption, "histogramoption", "0.5,2,6",
 		"[DEPRECATED] Histogram option for grpc metrics, should be comma separated value, ex:= 0.5,2,6 where start=0.5 factor=2, count=6")
 
+	flag.UintVar(&conf.RbdHardMaxCloneDepth, "rbdhardmaxclonedepth", 8, "Hard limit for maximum number of nested volume clones that are taken before a flatten occurs")
+	flag.UintVar(&conf.RbdSoftMaxCloneDepth, "rbdsoftmaxclonedepth", 4, "Soft limit for maximum number of nested volume clones that are taken before a flatten occurs")
 	flag.BoolVar(&conf.Version, "version", false, "Print cephcsi version information")
 
 	klog.InitFlags(nil)
@@ -175,6 +177,7 @@ func main() {
 	klog.V(1).Infof("Starting driver type: %v with name: %v", conf.Vtype, dname)
 	switch conf.Vtype {
 	case rbdType:
+		validateCloneDepthFlag(&conf)
 		driver := rbd.NewDriver()
 		driver.Run(&conf, cp)
 
@@ -194,3 +197,14 @@ func main() {
 
 	os.Exit(0)
 }
+
+func validateCloneDepthFlag(conf *util.Config) {
+	// the hard limit must be between 1 and 14; it is capped at 14 to avoid hitting the max image depth
+	if conf.RbdHardMaxCloneDepth == 0 || conf.RbdHardMaxCloneDepth > 14 {
+		klog.Fatalln("rbdhardmaxclonedepth flag value should be between 1 and 14")
+	}
+
+	if conf.RbdSoftMaxCloneDepth > conf.RbdHardMaxCloneDepth {
+		klog.Fatalln("rbdsoftmaxclonedepth flag value should not be greater than rbdhardmaxclonedepth")
+	}
+}
diff --git a/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml b/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
index 32a7caadd..fb90ff570 100644
--- a/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
+++ b/deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
@@ -112,6 +112,8 @@ spec:
             - "--v=5"
             - "--drivername=rbd.csi.ceph.com"
             - "--pidlimit=-1"
+            - "--rbdhardmaxclonedepth=8"
+            - "--rbdsoftmaxclonedepth=4"
           env:
             - name: POD_IP
               valueFrom:
diff --git a/docs/deploy-rbd.md b/docs/deploy-rbd.md
index d8fa54ad8..493ad97b6 100644
--- a/docs/deploy-rbd.md
+++ b/docs/deploy-rbd.md
@@ -26,22 +26,24 @@ make image-cephcsi
 
 **Available command line arguments:**
 
-| Option                | Default value         | Description                                                                                                                                                                                                                                                                          |
-|-----------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `--endpoint`          | `unix://tmp/csi.sock` | CSI endpoint, must be a UNIX socket                                                                                                                                                                                                                                                  |
-| `--drivername`        | `rbd.csi.ceph.com`    | Name of the driver (Kubernetes: `provisioner` field in StorageClass must correspond to this value)                                                                                                                                                                                   |
-| `--nodeid`            | _empty_               | This node's ID                                                                                                                                                                                                                                                                       |
-| `--type`              | _empty_               | Driver type `[rbd | cephfs]` If the driver type is set to  `rbd` it will act as a `rbd plugin` or if it's set to `cephfs` will act as a `cephfs plugin`                                                                                                                              |
-| `--instanceid`        | "default"             | Unique ID distinguishing this instance of Ceph CSI among other instances, when sharing Ceph clusters across CSI instances for provisioning                                                                                                                                           |
-| `--metadatastorage`   | _empty_               | Points to where legacy (1.0.0 or older plugin versions) metadata about provisioned volumes are kept, as file or in as k8s configmap (`node` or `k8s_configmap` respectively)                                                                                                         |
-| `--pidlimit`          | _0_                   | Configure the PID limit in cgroups. The container runtime can restrict the number of processes/tasks which can cause problems while provisioning (or deleting) a large number of volumes. A value of `-1` configures the limit to the maximum, `0` does not configure limits at all. |
-| `--metricsport`       | `8080`                | TCP port for liveness metrics requests                                                                                                                                                                                                                                               |
-| `--metricspath`       | `"/metrics"`          | Path of prometheus endpoint where metrics will be available                                                                                                                                                                                                                          |
-| `--enablegrpcmetrics` | `false`               | [Deprecated] Enable grpc metrics collection  and start prometheus server                                                                                                                                                                                                             |
-| `--polltime`          | `"60s"`               | Time interval in between each poll                                                                                                                                                                                                                                                   |
-| `--timeout`           | `"3s"`                | Probe timeout in seconds                                                                                                                                                                                                                                                             |
-| `--histogramoption`   | `0.5,2,6`             | [Deprecated] Histogram option for grpc metrics, should be comma separated value (ex:= "0.5,2,6" where start=0.5 factor=2, count=6)                                                                                                                                                   |
-| `--domainlabels`      | _empty_               | Kubernetes node labels to use as CSI domain labels for topology aware provisioning, should be a comma separated value (ex:= "failure-domain/region,failure-domain/zone")                                                                                                             |
+| Option                   | Default value         | Description                                                                                                                                                                                                                                                                          |
+| ------------------------ | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `--endpoint`             | `unix://tmp/csi.sock` | CSI endpoint, must be a UNIX socket                                                                                                                                                                                                                                                  |
+| `--drivername`           | `rbd.csi.ceph.com`    | Name of the driver (Kubernetes: `provisioner` field in StorageClass must correspond to this value)                                                                                                                                                                                   |
+| `--nodeid`               | _empty_               | This node's ID                                                                                                                                                                                                                                                                       |
+| `--type`                 | _empty_               | Driver type `[rbd | cephfs]` If the driver type is set to  `rbd` it will act as a `rbd plugin` or if it's set to `cephfs` will act as a `cephfs plugin`                                                                                                                              |
+| `--instanceid`           | "default"             | Unique ID distinguishing this instance of Ceph CSI among other instances, when sharing Ceph clusters across CSI instances for provisioning                                                                                                                                           |
+| `--metadatastorage`      | _empty_               | Points to where legacy (1.0.0 or older plugin versions) metadata about provisioned volumes are kept, as file or in as k8s configmap (`node` or `k8s_configmap` respectively)                                                                                                         |
+| `--pidlimit`             | _0_                   | Configure the PID limit in cgroups. The container runtime can restrict the number of processes/tasks which can cause problems while provisioning (or deleting) a large number of volumes. A value of `-1` configures the limit to the maximum, `0` does not configure limits at all. |
+| `--metricsport`          | `8080`                | TCP port for liveness metrics requests                                                                                                                                                                                                                                               |
+| `--metricspath`          | `"/metrics"`          | Path of prometheus endpoint where metrics will be available                                                                                                                                                                                                                          |
+| `--enablegrpcmetrics`    | `false`               | [Deprecated] Enable grpc metrics collection  and start prometheus server                                                                                                                                                                                                             |
+| `--polltime`             | `"60s"`               | Time interval in between each poll                                                                                                                                                                                                                                                   |
+| `--timeout`              | `"3s"`                | Probe timeout in seconds                                                                                                                                                                                                                                                             |
+| `--histogramoption`      | `0.5,2,6`             | [Deprecated] Histogram option for grpc metrics, should be comma separated value (ex:= "0.5,2,6" where start=0.5 factor=2, count=6)                                                                                                                                                   |
+| `--domainlabels`         | _empty_               | Kubernetes node labels to use as CSI domain labels for topology aware provisioning, should be a comma separated value (ex:= "failure-domain/region,failure-domain/zone")                                                                                                             |
+| `--rbdhardmaxclonedepth` | `8`                   | Hard limit for maximum number of nested volume clones that are taken before a flatten occurs                                                                                                                                                                                         |
+| `--rbdsoftmaxclonedepth` | `4`                   | Soft limit for maximum number of nested volume clones that are taken before a flatten occurs                                                                                                                                                                                         |
 
 **Available volume parameters:**
 
@@ -50,7 +52,7 @@ make image-cephcsi
 | `clusterID`                                                                                         | yes                  | String representing a Ceph cluster, must be unique across all Ceph clusters in use for provisioning, cannot be greater than 36 bytes in length, and should remain immutable for the lifetime of the Ceph cluster in use |
 | `pool`                                                                                              | yes                  | Ceph pool into which the RBD image shall be created                                                                                                                                                                     |
 | `dataPool`                                                                                          | no                   | Ceph pool used for the data of the RBD images.                                                                                                                                                                          |
-| `volumeNamePrefix`                                                                                  | no                   | Prefix to use for naming RBD images (defaults to `csi-vol-`).                                                                                                                                                    |
+| `volumeNamePrefix`                                                                                  | no                   | Prefix to use for naming RBD images (defaults to `csi-vol-`).                                                                                                                                                           |
 | `snapshotNamePrefix`                                                                                | no                   | Prefix to use for naming RBD snapshot images (defaults to `csi-snap-`).                                                                                                                                                 |
 | `imageFeatures`                                                                                     | no                   | RBD image features. CSI RBD currently supports only `layering` feature. See [man pages](http://docs.ceph.com/docs/mimic/man/8/rbd/#cmdoption-rbd-image-feature)                                                         |
 | `csi.storage.k8s.io/provisioner-secret-name`, `csi.storage.k8s.io/node-stage-secret-name`           | yes (for Kubernetes) | name of the Kubernetes Secret object containing Ceph client credentials. Both parameters should have the same value                                                                                                     |
diff --git a/internal/rbd/driver.go b/internal/rbd/driver.go
index 7117f872b..db3bcf459 100644
--- a/internal/rbd/driver.go
+++ b/internal/rbd/driver.go
@@ -53,6 +53,11 @@ var (
 	// VolumeName to backing RBD images
 	volJournal  *journal.Config
 	snapJournal *journal.Config
+	// rbdHardMaxCloneDepth is the hard limit for maximum number of nested volume clones that are taken before a flatten occurs
+	rbdHardMaxCloneDepth uint
+
+	// rbdSoftMaxCloneDepth is the soft limit for maximum number of nested volume clones that are taken before a flatten occurs
+	rbdSoftMaxCloneDepth uint
 )
 
 // NewDriver returns new rbd driver
@@ -103,6 +108,9 @@ func (r *Driver) Run(conf *util.Config, cachePersister util.CachePersister) {
 		CSIInstanceID = conf.InstanceID
 	}
 
+	// update clone soft and hard limit
+	rbdHardMaxCloneDepth = conf.RbdHardMaxCloneDepth
+	rbdSoftMaxCloneDepth = conf.RbdSoftMaxCloneDepth
 	// Create instances of the volume and snapshot journal
 	volJournal = journal.NewCSIVolumeJournal(CSIInstanceID)
 	snapJournal = journal.NewCSISnapshotJournal(CSIInstanceID)
diff --git a/internal/util/util.go b/internal/util/util.go
index f2de91b38..ed105609d 100644
--- a/internal/util/util.go
+++ b/internal/util/util.go
@@ -98,6 +98,11 @@ type Config struct {
 	// cephfs related flags
 	ForceKernelCephFS bool // force to use the ceph kernel client even if the kernel is < 4.17
 
+	// RbdHardMaxCloneDepth is the hard limit for maximum number of nested volume clones that are taken before a flatten occurs
+	RbdHardMaxCloneDepth uint
+
+	// RbdSoftMaxCloneDepth is the soft limit for maximum number of nested volume clones that are taken before a flatten occurs
+	RbdSoftMaxCloneDepth uint
 }
 
 // CreatePersistanceStorage creates storage path and initializes new cache