ceph-csi/scripts/rook.sh

259 lines
8.3 KiB
Bash
Raw Normal View History

#!/bin/bash
# Deploy, tear down and manage a Rook/Ceph test cluster through kubectl.
#
# Usage: rook.sh {deploy|teardown|create-block-pool|delete-block-pool}
#
# Environment overrides:
#   ROOK_VERSION            Rook release tag used to build ROOK_URL
#   ROOK_DEPLOY_TIMEOUT     upper bound (seconds) for the check_* wait loops
#   ROOK_BLOCK_POOL_NAME    pool name used by create/delete-block-pool
#   ROOK_CEPH_CLUSTER_IMAGE optional Ceph image override read by deploy_rook

# errtrace: let functions, command substitutions and subshells inherit the ERR
# trap installed below. This is set here instead of on the shebang line because
# shebang options are dropped when the script is started as "bash rook.sh".
set -E

ROOK_VERSION=${ROOK_VERSION:-"v1.6.2"}
ROOK_DEPLOY_TIMEOUT=${ROOK_DEPLOY_TIMEOUT:-300}
ROOK_URL="https://raw.githubusercontent.com/rook/rook/${ROOK_VERSION}/cluster/examples/kubernetes/ceph"
ROOK_BLOCK_POOL_NAME=${ROOK_BLOCK_POOL_NAME:-"newrbdpool"}

# kubectl_retry gives up after KUBECTL_RETRY failed attempts and sleeps
# KUBECTL_RETRY_DELAY seconds between two attempts.
KUBECTL_RETRY=5
KUBECTL_RETRY_DELAY=10

# any command failure that is not handled explicitly ends up in log_errors
trap log_errors ERR
# log_errors is the ERR trap handler (see 'trap' above). It runs when a command
# fails and dumps the state of the nodes and of the rook-ceph namespace so that
# deployment problems can be debugged from the log.
# This function never returns: it always ends the script with exit status 1.
log_errors() {
  local namespace="rook-ceph" kind

  # trace each diagnostic command so the log shows what produced which output
  set -x

  kubectl get nodes
  kubectl -n "${namespace}" get events
  kubectl -n "${namespace}" describe pods
  kubectl -n "${namespace}" logs -l app=rook-ceph-operator

  # full YAML of every Rook custom resource this script creates
  for kind in CephClusters CephFilesystems CephBlockPools; do
    kubectl -n "${namespace}" get "${kind}" -oyaml
  done

  # a fatal error was caught, do not hand control back to the caller
  exit 1
}
# Print one dot-separated component of ROOK_VERSION, ignoring a leading "v".
# Arguments: $1 - field selector handed to "cut -f" (1 = major, 2 = minor, ...)
# Outputs:   the selected component on stdout
rook_version() {
  local bare_version="${ROOK_VERSION#v}"
  cut -d'.' -f"${1}" <<<"${bare_version}"
}
# Run "kubectl <args...>" and retry it when it fails.
#
# kubectl can report an error although the api-server did process the request.
# A repeated "create" then fails with "AlreadyExists" for every object. For that
# reason a failed "create" whose stderr contains nothing but "AlreadyExists"
# lines is treated as success instead of being retried.
#
# Globals:   KUBECTL_RETRY       - number of failed attempts before giving up
#            KUBECTL_RETRY_DELAY - seconds to sleep between two attempts
# Arguments: $1   - first kubectl argument (the action, e.g. "create")
#            $2.. - remaining kubectl arguments, passed through unchanged
# Outputs:   stdout of the last attempt on stdout; kubectl's stderr and the
#            retry notices on stderr
# Returns:   0 on success, 1 once KUBECTL_RETRY attempts have failed
kubectl_retry() {
  local retries=0 action="${1}" ret=0 stdout stderr
  shift

  # temporary files for kubectl output, kept out of the current directory
  stdout=$(mktemp "${TMPDIR:-/tmp}/rook-kubectl-stdout.XXXXXXXX") || return 1
  stderr=$(mktemp "${TMPDIR:-/tmp}/rook-kubectl-stderr.XXXXXXXX") || {
    rm -f "${stdout}"
    return 1
  }

  while ! kubectl "${action}" "${@}" 2>"${stderr}" 1>"${stdout}"; do
    # in case of a failure when running "create", ignore errors with "AlreadyExists"
    if [ "${action}" == 'create' ]; then
      # Count lines in stderr that do not have "AlreadyExists". grep -c exits
      # with status 1 when that count is 0; "|| true" keeps this expected
      # result from firing the ERR trap on the success path.
      ret=$(grep -cvw 'AlreadyExists' "${stderr}" || true)
      if [ "${ret:-1}" -eq 0 ]; then
        # Success! stderr is empty after removing all "AlreadyExists" lines.
        break
      fi
    fi

    retries=$((retries + 1))
    if [ "${retries}" -eq "${KUBECTL_RETRY}" ]; then
      ret=1
      break
    fi

    # log stderr and empty the tmpfile
    cat "${stderr}" >&2
    true >"${stderr}"
    # the notice goes to stderr: callers capture stdout with $(...)
    echo "kubectl_retry ${action} ${*} failed, will retry in ${KUBECTL_RETRY_DELAY} seconds" >&2
    sleep "${KUBECTL_RETRY_DELAY}"

    # reset ret so that a next working kubectl does not cause a non-zero
    # return of the function
    ret=0
  done

  # Write output so that calling functions can consume it. Plain fd writes are
  # used because reopening /dev/stdout or /dev/stderr with ">" would truncate a
  # regular file the caller redirected to.
  cat "${stdout}"
  cat "${stderr}" >&2

  rm -f "${stdout}" "${stderr}"
  return "${ret}"
}
# Create the Rook manifests from ROOK_URL and wait for the resulting Ceph
# cluster, filesystem and block pool.
#
# Globals:   ROOK_URL (read), ROOK_CEPH_CLUSTER_IMAGE (read, optional),
#            ROOK_MAJOR / ROOK_MINOR (written)
# Returns:   status of the last command; failures of plain commands are
#            reported through the script's ERR trap
deploy_rook() {
  kubectl_retry create -f "${ROOK_URL}/common.yaml"

  # The CRDs are applied for Rook v1.5 and every newer release, including
  # major versions above 1.
  ROOK_MAJOR=$(rook_version 1)
  ROOK_MINOR=$(rook_version 2)
  if [ "${ROOK_MAJOR}" -gt 1 ] ||
    { [ "${ROOK_MAJOR}" -eq 1 ] && [ "${ROOK_MINOR}" -ge 5 ]; }; then
    kubectl_retry create -f "${ROOK_URL}/crds.yaml"
  fi

  kubectl_retry create -f "${ROOK_URL}/operator.yaml"

  # Override the ceph version which rook installs by default.
  if [ -z "${ROOK_CEPH_CLUSTER_IMAGE}" ]; then
    kubectl_retry create -f "${ROOK_URL}/cluster-test.yaml"
  else
    ROOK_CEPH_CLUSTER_VERSION_IMAGE_PATH="image: ${ROOK_CEPH_CLUSTER_IMAGE}"
    TEMP_DIR="$(mktemp -d)"
    # --fail: an HTTP error must abort here instead of storing the error page
    # as the manifest
    curl --fail -o "${TEMP_DIR}"/cluster-test.yaml "${ROOK_URL}/cluster-test.yaml"
    sed -i "s|image.*|${ROOK_CEPH_CLUSTER_VERSION_IMAGE_PATH}|g" "${TEMP_DIR}"/cluster-test.yaml
    # NOTE(review): the text inserted after each "\n" below has to line up with
    # the YAML indentation of cluster-test.yaml - confirm the spacing.
    sed -i "s/config: |/config: |\n \[mon\]\n mon_warn_on_insecure_global_id_reclaim_allowed = false/g" "${TEMP_DIR}"/cluster-test.yaml
    sed -i "s/healthCheck:/healthCheck:\n livenessProbe:\n mon:\n disabled: true\n mgr:\n disabled: true\n mds:\n disabled: true/g" "${TEMP_DIR}"/cluster-test.yaml
    # print the patched manifest so it shows up in the log
    cat "${TEMP_DIR}"/cluster-test.yaml
    kubectl_retry create -f "${TEMP_DIR}/cluster-test.yaml"
    rm -rf "${TEMP_DIR}"
  fi

  kubectl_retry create -f "${ROOK_URL}/toolbox.yaml"
  kubectl_retry create -f "${ROOK_URL}/filesystem-test.yaml"
  kubectl_retry create -f "${ROOK_URL}/pool-test.yaml"

  # Each wait is skipped when the matching resource list is empty.

  # Check if CephCluster is empty
  if ! kubectl_retry -n rook-ceph get cephclusters -oyaml | grep -q 'items: \[\]'; then
    check_ceph_cluster_health
  fi

  # Check if CephFileSystem is empty
  if ! kubectl_retry -n rook-ceph get cephfilesystems -oyaml | grep -q 'items: \[\]'; then
    check_mds_stat
  fi

  # Check if CephBlockPool is empty
  if ! kubectl_retry -n rook-ceph get cephblockpools -oyaml | grep -q 'items: \[\]'; then
    check_rbd_stat ""
  fi
}
# Delete the Rook manifests in the reverse order of their creation.
#
# Globals:   ROOK_URL (read), ROOK_MAJOR / ROOK_MINOR (written)
# Returns:   status of the final "kubectl delete"
teardown_rook() {
  local manifest

  for manifest in pool-test filesystem-test toolbox cluster-test operator; do
    kubectl delete -f "${ROOK_URL}/${manifest}.yaml"
  done

  # The CRDs are deleted for Rook v1.5 and every newer release, including
  # major versions above 1.
  ROOK_MAJOR=$(rook_version 1)
  ROOK_MINOR=$(rook_version 2)
  if [ "${ROOK_MAJOR}" -gt 1 ] ||
    { [ "${ROOK_MAJOR}" -eq 1 ] && [ "${ROOK_MINOR}" -ge 5 ]; }; then
    kubectl delete -f "${ROOK_URL}/crds.yaml"
  fi

  kubectl delete -f "${ROOK_URL}/common.yaml"
}
# Create a block pool named ROOK_BLOCK_POOL_NAME from Rook's pool-test.yaml and
# wait for it through check_rbd_stat.
#
# The manifest is downloaded to newpool.yaml in the current directory, has
# "replicapool" replaced by the requested name, and is removed again after the
# create call.
create_block_pool() {
  local manifest="newpool.yaml"

  curl -o "${manifest}" "${ROOK_URL}/pool-test.yaml"
  sed -i "s/replicapool/${ROOK_BLOCK_POOL_NAME}/g" "${manifest}"

  kubectl_retry create -f "./${manifest}"
  rm -f "./${manifest}"

  check_rbd_stat "${ROOK_BLOCK_POOL_NAME}"
}
# Delete the block pool named ROOK_BLOCK_POOL_NAME.
#
# The same manifest as for the creation is rebuilt (pool-test.yaml with
# "replicapool" replaced by the pool name), handed to "kubectl delete" and
# removed from the current directory afterwards.
delete_block_pool() {
  local manifest="newpool.yaml"

  curl -o "${manifest}" "${ROOK_URL}/pool-test.yaml"
  sed -i "s/replicapool/${ROOK_BLOCK_POOL_NAME}/g" "${manifest}"

  kubectl delete -f "./${manifest}"
  rm -f "./${manifest}"
}
# Poll the first CephCluster in the rook-ceph namespace until its state is
# "Created" and its health is "HEALTH_OK".
#
# Globals:   ROOK_DEPLOY_TIMEOUT (read); retry, CEPH_STATE, CEPH_HEALTH (written)
# Outputs:   progress messages on stdout
# Returns:   0 once the cluster is healthy, 1 when the retry counter passes
#            ROOK_DEPLOY_TIMEOUT
check_ceph_cluster_health() {
  for ((retry = 0; retry <= ROOK_DEPLOY_TIMEOUT; retry += 5)); do
    echo "Wait for rook deploy... ${retry}s"
    sleep 5

    CEPH_STATE=$(kubectl_retry -n rook-ceph get cephclusters -o jsonpath='{.items[0].status.state}')
    CEPH_HEALTH=$(kubectl_retry -n rook-ceph get cephclusters -o jsonpath='{.items[0].status.ceph.health}')
    echo "Checking CEPH cluster state: [$CEPH_STATE]"

    if [[ "${CEPH_STATE}" == "Created" && "${CEPH_HEALTH}" == "HEALTH_OK" ]]; then
      echo "Creating CEPH cluster is done. [$CEPH_HEALTH]"
      break
    fi
  done

  # the counter only passes the timeout when the loop ran out without a break
  if ((retry > ROOK_DEPLOY_TIMEOUT)); then
    echo "[Timeout] CEPH cluster not in a healthy state (timeout)"
    return 1
  fi
  echo ""
}
# Poll the first CephFilesystem in the rook-ceph namespace until it asks for at
# least one active MDS and a pod carrying its rook_file_system label shows up
# as Running.
#
# The filesystem name is looked up once per iteration and used for both
# follow-up queries, so the check does not depend on the name "myfs".
#
# Globals:   ROOK_DEPLOY_TIMEOUT (read); retry, FS_NAME, ACTIVE_COUNT,
#            ACTIVE_COUNT_NUM (written)
# Outputs:   progress messages on stdout
# Returns:   0 once a Running pod was found, 1 when the retry counter passes
#            ROOK_DEPLOY_TIMEOUT
check_mds_stat() {
  for ((retry = 0; retry <= ROOK_DEPLOY_TIMEOUT; retry = retry + 5)); do
    FS_NAME=$(kubectl_retry -n rook-ceph get cephfilesystems.ceph.rook.io -ojsonpath='{.items[0].metadata.name}')
    echo "Checking MDS ($FS_NAME) stats... ${retry}s" && sleep 5

    ACTIVE_COUNT=$(kubectl_retry -n rook-ceph get cephfilesystems "${FS_NAME}" -ojsonpath='{.spec.metadataServer.activeCount}')

    # "+ 0" turns an empty answer into the number 0
    ACTIVE_COUNT_NUM=$((ACTIVE_COUNT + 0))
    echo "MDS ($FS_NAME) active_count: [$ACTIVE_COUNT_NUM]"
    if ((ACTIVE_COUNT_NUM < 1)); then
      continue
    fi

    if kubectl_retry -n rook-ceph get pod -l "rook_file_system=${FS_NAME}" | grep -q Running; then
      echo "Filesystem ($FS_NAME) is successfully created..."
      break
    fi
  done

  # the counter only passes the timeout when the loop ran out without a break
  if [ "$retry" -gt "$ROOK_DEPLOY_TIMEOUT" ]; then
    echo "[Timeout] Failed to get ceph filesystem pods"
    return 1
  fi
  echo ""
}
# Poll until "rbd pool stats" succeeds for a block pool when run inside the
# toolbox pod (label app=rook-ceph-tools) of the rook-ceph namespace.
#
# Globals:   ROOK_DEPLOY_TIMEOUT (read); retry, RBD_POOL_NAME, TOOLBOX_POD,
#            TOOLBOX_POD_STATUS (written)
# Arguments: $1 - pool name; when empty, the name of the first CephBlockPool
#                 is used
# Outputs:   progress messages on stdout
# Returns:   0 once the pool stats could be read, 1 when the retry counter
#            passes ROOK_DEPLOY_TIMEOUT
check_rbd_stat() {
  for ((retry = 0; retry <= ROOK_DEPLOY_TIMEOUT; retry += 5)); do
    if [[ -n "$1" ]]; then
      RBD_POOL_NAME=$1
    else
      RBD_POOL_NAME=$(kubectl_retry -n rook-ceph get cephblockpools -ojsonpath='{.items[0].metadata.name}')
    fi
    echo "Checking RBD ($RBD_POOL_NAME) stats... ${retry}s"
    sleep 5

    TOOLBOX_POD=$(kubectl_retry -n rook-ceph get pods -l app=rook-ceph-tools -o jsonpath='{.items[0].metadata.name}')
    TOOLBOX_POD_STATUS=$(kubectl_retry -n rook-ceph get pod "$TOOLBOX_POD" -ojsonpath='{.status.phase}')

    # the rbd command can only be executed once the toolbox is running
    if [[ "${TOOLBOX_POD_STATUS}" != "Running" ]]; then
      echo "Toolbox POD ($TOOLBOX_POD) status: [$TOOLBOX_POD_STATUS]"
      continue
    fi

    if kubectl_retry exec -n rook-ceph "$TOOLBOX_POD" -it -- rbd pool stats "$RBD_POOL_NAME" &>/dev/null; then
      echo "RBD ($RBD_POOL_NAME) is successfully created..."
      break
    fi
  done

  # the counter only passes the timeout when the loop ran out without a break
  if ((retry > ROOK_DEPLOY_TIMEOUT)); then
    echo "[Timeout] Failed to get RBD pool stats"
    return 1
  fi
  echo ""
}
# Entry point: run the sub-command named by the first argument. Any other value,
# including no argument at all, prints the usage text to stderr.
case "${1:-}" in
  deploy) deploy_rook ;;
  teardown) teardown_rook ;;
  create-block-pool) create_block_pool ;;
  delete-block-pool) delete_block_pool ;;
  *)
    cat >&2 <<EOF
 $0 [command]
Available Commands:
deploy Deploy a rook
teardown Teardown a rook
create-block-pool Create a rook block pool
delete-block-pool Delete a rook block pool

EOF
    ;;
esac