mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-01-05 11:39:29 +00:00
a4e4750fdc
This commit disables the mon, mgr and mds liveness probes, which on failing caused a `CrashLoopBackOff` state. Updates: #2094 Signed-off-by: Rakshith R <rar@redhat.com>
259 lines
8.3 KiB
Bash
Executable File
259 lines
8.3 KiB
Bash
Executable File
#!/bin/bash -E

# Enable errtrace explicitly as well: options given on the shebang line are
# lost when the script is started as "bash <script>", and without errtrace the
# ERR trap installed below is not inherited by shell functions, command
# substitutions and subshells.
set -E

# Rook release whose example manifests are used (overridable from the
# environment).
ROOK_VERSION=${ROOK_VERSION:-"v1.6.2"}
# Limit for the polling loops of the check_* functions; their counter advances
# by 5 for every 5 second sleep.
ROOK_DEPLOY_TIMEOUT=${ROOK_DEPLOY_TIMEOUT:-300}
# Base URL of the example manifests for the selected Rook release.
ROOK_URL="https://raw.githubusercontent.com/rook/rook/${ROOK_VERSION}/cluster/examples/kubernetes/ceph"
# Name given to the pool handled by create-block-pool / delete-block-pool.
ROOK_BLOCK_POOL_NAME=${ROOK_BLOCK_POOL_NAME:-"newrbdpool"}
# Number of attempts kubectl_retry makes before giving up.
KUBECTL_RETRY=5
# Seconds kubectl_retry sleeps between two attempts.
KUBECTL_RETRY_DELAY=10

# Run log_errors whenever a command fails outside of a condition.
trap log_errors ERR
|
|
|
|
# log_errors is the handler of the ERR trap installed near the top of this
# script. It prints the nodes of the cluster and the events, pod descriptions,
# operator logs and Ceph custom resources of the rook-ceph namespace, so that
# a failed deployment can be debugged, and then ends the script with exit
# status 1.
log_errors() {
    # common prefix of all namespaced queries below
    local -a in_ns=(kubectl -n rook-ceph)

    # trace the commands, so that the log shows which one produced what
    set -x

    kubectl get nodes
    "${in_ns[@]}" get events
    "${in_ns[@]}" describe pods
    "${in_ns[@]}" logs -l app=rook-ceph-operator
    "${in_ns[@]}" get CephClusters -oyaml
    "${in_ns[@]}" get CephFilesystems -oyaml
    "${in_ns[@]}" get CephBlockPools -oyaml

    # a fatal error was caught, never hand control back to the caller
    exit 1
}
|
|
|
|
# rook_version prints one dot-separated component of ROOK_VERSION after a
# leading "v" has been dropped. ${1} is the field selector handed to cut
# (1 selects the first component, 2 the second, ...).
rook_version() {
    local field="${1}"

    cut -d'.' -f"${field}" <<< "${ROOK_VERSION#v}"
}
|
|
|
|
# kubectl_retry runs "kubectl <arguments>" until it succeeds, making at most
# KUBECTL_RETRY attempts and sleeping KUBECTL_RETRY_DELAY seconds in between.
#
# Arguments: the complete kubectl command line. The first argument is taken
#            as the action; when it is "create", a failed attempt whose error
#            output consists only of lines with "AlreadyExists" counts as
#            success.
# Outputs:   stdout and stderr of the last kubectl attempt are replayed on the
#            matching stream. Errors of earlier attempts and the retry notice
#            go to stderr, so that stdout only carries kubectl output that
#            callers can consume.
# Returns:   0 on success, 1 when all attempts failed.
kubectl_retry() {
    local retries=0 action="${1}" ret=0 stdout stderr
    shift

    # temporary files for kubectl output
    stdout=$(mktemp rook-kubectl-stdout.XXXXXXXX)
    stderr=$(mktemp rook-kubectl-stderr.XXXXXXXX)

    while ! kubectl "${action}" "${@}" 2>"${stderr}" 1>"${stdout}"; do
        # in case of a failure when running "create", ignore errors with
        # "AlreadyExists"
        if [ "${action}" == 'create' ]; then
            # Count lines in stderr that do not have "AlreadyExists". grep
            # exits with status 1 when it selected no line, which is the
            # success case here; "|| true" prevents that status from
            # triggering the ERR trap of the script.
            ret=$(grep -cvw 'AlreadyExists' "${stderr}" || true)
            if [ "${ret}" -eq 0 ]; then
                # Success! stderr is empty after removing all
                # "AlreadyExists" lines.
                break
            fi
        fi

        retries=$((retries + 1))
        if [ "${retries}" -eq "${KUBECTL_RETRY}" ]; then
            ret=1
            break
        fi

        # log stderr and empty the tmpfile
        cat "${stderr}" >&2
        true > "${stderr}"
        # the notice is a diagnostic, keep it out of the consumable stdout
        echo "kubectl_retry ${*} failed, will retry in ${KUBECTL_RETRY_DELAY} seconds" >&2

        sleep "${KUBECTL_RETRY_DELAY}"

        # reset ret so that a next working kubectl does not cause a non-zero
        # return of the function
        ret=0
    done

    # Write output so that calling functions can consume it. The existing
    # descriptors are used instead of reopening /dev/stdout and /dev/stderr,
    # which could truncate a file that the streams are redirected to.
    cat "${stdout}"
    cat "${stderr}" >&2

    rm -f "${stdout}" "${stderr}"

    return "${ret}"
}
|
|
|
|
# deploy_rook creates the Rook example resources from ROOK_URL (common, CRDs,
# operator, test cluster, toolbox, test filesystem, test pool) and then waits
# for every kind of Ceph resource that exists to become usable.
#
# Globals:  ROOK_URL (read), ROOK_VERSION (read through rook_version),
#           ROOK_CEPH_CLUSTER_IMAGE (read, optional): Ceph image that replaces
#           the one of the example cluster manifest.
# Outputs:  the patched cluster manifest on stdout when
#           ROOK_CEPH_CLUSTER_IMAGE is set, plus the output of the helpers.
function deploy_rook() {
    local rook_major rook_minor image_line temp_dir manifest

    kubectl_retry create -f "${ROOK_URL}/common.yaml"

    # crds.yaml is only applied for Rook 1.x releases with x >= 5
    rook_major=$(rook_version 1)
    rook_minor=$(rook_version 2)
    if [ "${rook_major}" -eq 1 ] && [ "${rook_minor}" -ge 5 ]; then
        kubectl_retry create -f "${ROOK_URL}/crds.yaml"
    fi
    kubectl_retry create -f "${ROOK_URL}/operator.yaml"

    # Override the ceph version which rook installs by default.
    if [ -z "${ROOK_CEPH_CLUSTER_IMAGE:-}" ]; then
        kubectl_retry create -f "${ROOK_URL}/cluster-test.yaml"
    else
        image_line="image: ${ROOK_CEPH_CLUSTER_IMAGE}"
        temp_dir="$(mktemp -d)"
        manifest="${temp_dir}/cluster-test.yaml"

        # --fail: do not store an HTTP error page as the manifest
        curl --fail -o "${manifest}" "${ROOK_URL}/cluster-test.yaml"
        sed -i "s|image.*|${image_line}|g" "${manifest}"
        # The inserted lines must be indented deeper than the key they are
        # attached to, otherwise the result is not valid YAML. The widths
        # assume that "config: |" and "healthCheck:" are indented by two
        # spaces as in the Rook example manifest; verify this when changing
        # ROOK_VERSION.
        sed -i "s/config: |/config: |\n    \[mon\]\n    mon_warn_on_insecure_global_id_reclaim_allowed = false/g" "${manifest}"
        # disable the mon, mgr and mds liveness probes
        sed -i "s/healthCheck:/healthCheck:\n    livenessProbe:\n      mon:\n        disabled: true\n      mgr:\n        disabled: true\n      mds:\n        disabled: true/g" "${manifest}"
        # show what gets deployed
        cat "${manifest}"
        kubectl_retry create -f "${manifest}"
        rm -rf "${temp_dir}"
    fi

    kubectl_retry create -f "${ROOK_URL}/toolbox.yaml"
    kubectl_retry create -f "${ROOK_URL}/filesystem-test.yaml"
    kubectl_retry create -f "${ROOK_URL}/pool-test.yaml"

    # Check if CephCluster is empty
    if ! kubectl_retry -n rook-ceph get cephclusters -oyaml | grep 'items: \[\]' &>/dev/null; then
        check_ceph_cluster_health
    fi

    # Check if CephFileSystem is empty
    if ! kubectl_retry -n rook-ceph get cephfilesystems -oyaml | grep 'items: \[\]' &>/dev/null; then
        check_mds_stat
    fi

    # Check if CephBlockPool is empty
    if ! kubectl_retry -n rook-ceph get cephblockpools -oyaml | grep 'items: \[\]' &>/dev/null; then
        check_rbd_stat ""
    fi
}
|
|
|
|
# teardown_rook deletes the resources of the Rook example manifests found
# under ROOK_URL, in the reverse order of their creation by deploy_rook.
# crds.yaml is only deleted for Rook 1.x releases with x >= 5.
# ROOK_MAJOR and ROOK_MINOR are set as a side effect.
teardown_rook() {
    local manifest

    for manifest in pool-test filesystem-test toolbox cluster-test operator; do
        kubectl delete -f "${ROOK_URL}/${manifest}.yaml"
    done

    ROOK_MAJOR=$(rook_version 1)
    ROOK_MINOR=$(rook_version 2)
    if [ "${ROOK_MAJOR}" -eq 1 ]; then
        if [ "${ROOK_MINOR}" -ge 5 ]; then
            kubectl delete -f "${ROOK_URL}/crds.yaml"
        fi
    fi

    kubectl delete -f "${ROOK_URL}/common.yaml"
}
|
|
|
|
# create_block_pool downloads the example pool manifest from ROOK_URL,
# replaces the pool name "replicapool" with ROOK_BLOCK_POOL_NAME, creates the
# pool and waits until check_rbd_stat can query it.
#
# The manifest is edited in a private temporary directory, so that a file
# called newpool.yaml in the current directory is neither overwritten nor
# removed.
function create_block_pool() {
    local temp_dir manifest

    temp_dir="$(mktemp -d)"
    manifest="${temp_dir}/newpool.yaml"

    # --fail: do not store an HTTP error page as the manifest
    curl --fail -o "${manifest}" "${ROOK_URL}/pool-test.yaml"
    sed -i "s/replicapool/${ROOK_BLOCK_POOL_NAME}/g" "${manifest}"
    kubectl_retry create -f "${manifest}"
    rm -rf -- "${temp_dir}"

    check_rbd_stat "${ROOK_BLOCK_POOL_NAME}"
}
|
|
|
|
# delete_block_pool downloads the example pool manifest from ROOK_URL,
# replaces the pool name "replicapool" with ROOK_BLOCK_POOL_NAME and deletes
# the resources described by the result.
#
# The manifest is edited in a private temporary directory, so that a file
# called newpool.yaml in the current directory is neither overwritten nor
# removed.
function delete_block_pool() {
    local temp_dir manifest

    temp_dir="$(mktemp -d)"
    manifest="${temp_dir}/newpool.yaml"

    # --fail: do not store an HTTP error page as the manifest
    curl --fail -o "${manifest}" "${ROOK_URL}/pool-test.yaml"
    sed -i "s/replicapool/${ROOK_BLOCK_POOL_NAME}/g" "${manifest}"
    kubectl delete -f "${manifest}"
    rm -rf -- "${temp_dir}"
}
|
|
|
|
# check_ceph_cluster_health polls the first CephCluster of the rook-ceph
# namespace every 5 seconds until its state is "Created" and its Ceph health
# is "HEALTH_OK". It returns 1 after printing a timeout message once the poll
# counter exceeds ROOK_DEPLOY_TIMEOUT.
# The variables retry, CEPH_STATE and CEPH_HEALTH are not local.
check_ceph_cluster_health() {
    retry=0
    while ((retry <= ROOK_DEPLOY_TIMEOUT)); do
        echo "Wait for rook deploy... ${retry}s" && sleep 5

        CEPH_STATE=$(kubectl_retry -n rook-ceph get cephclusters -o jsonpath='{.items[0].status.state}')
        CEPH_HEALTH=$(kubectl_retry -n rook-ceph get cephclusters -o jsonpath='{.items[0].status.ceph.health}')
        echo "Checking CEPH cluster state: [$CEPH_STATE]"

        # both conditions have to hold before the cluster counts as ready
        if [ "$CEPH_STATE" = "Created" ] && [ "$CEPH_HEALTH" = "HEALTH_OK" ]; then
            echo "Creating CEPH cluster is done. [$CEPH_HEALTH]"
            break
        fi

        retry=$((retry + 5))
    done

    # the counter only passes the limit when the loop ended without a break
    if [ "$retry" -gt "$ROOK_DEPLOY_TIMEOUT" ]; then
        echo "[Timeout] CEPH cluster not in a healthy state (timeout)"
        return 1
    fi
    echo ""
}
|
|
|
|
# check_mds_stat polls every 5 seconds until the first CephFilesystem of the
# rook-ceph namespace requests at least one active metadata server and a pod
# labelled with that filesystem is listed as Running. It returns 1 after
# printing a timeout message once the poll counter exceeds
# ROOK_DEPLOY_TIMEOUT.
#
# All queries use the name that was looked up, so the status that is checked
# always belongs to the filesystem named in the messages.
function check_mds_stat() {
    local retry fs_name active_count active_count_num

    for ((retry = 0; retry <= ROOK_DEPLOY_TIMEOUT; retry = retry + 5)); do
        fs_name=$(kubectl_retry -n rook-ceph get cephfilesystems.ceph.rook.io -ojsonpath='{.items[0].metadata.name}')
        echo "Checking MDS ($fs_name) stats... ${retry}s" && sleep 5

        active_count=$(kubectl_retry -n rook-ceph get cephfilesystems "$fs_name" -ojsonpath='{.spec.metadataServer.activeCount}')

        # an empty answer is counted as 0
        active_count_num=$((active_count + 0))
        echo "MDS ($fs_name) active_count: [$active_count_num]"
        if ((active_count_num < 1)); then
            continue
        fi

        if kubectl_retry -n rook-ceph get pod -l "rook_file_system=${fs_name}" | grep Running &>/dev/null; then
            echo "Filesystem ($fs_name) is successfully created..."
            break
        fi
    done

    # the counter only passes the limit when the loop ended without a break
    if [ "$retry" -gt "$ROOK_DEPLOY_TIMEOUT" ]; then
        echo "[Timeout] Failed to get ceph filesystem pods"
        return 1
    fi
    echo ""
}
|
|
|
|
# check_rbd_stat polls every 5 seconds until "rbd pool stats" succeeds for a
# pool when executed in the toolbox pod of the rook-ceph namespace.
# ${1} is the name of the pool; when it is empty the name of the first
# CephBlockPool is looked up instead. It returns 1 after printing a timeout
# message once the poll counter exceeds ROOK_DEPLOY_TIMEOUT.
# The variables retry, RBD_POOL_NAME, TOOLBOX_POD and TOOLBOX_POD_STATUS are
# not local.
check_rbd_stat() {
    for ((retry = 0; retry <= ROOK_DEPLOY_TIMEOUT; retry += 5)); do
        if [ -n "$1" ]; then
            RBD_POOL_NAME=$1
        else
            RBD_POOL_NAME=$(kubectl_retry -n rook-ceph get cephblockpools -ojsonpath='{.items[0].metadata.name}')
        fi
        echo "Checking RBD ($RBD_POOL_NAME) stats... ${retry}s" && sleep 5

        TOOLBOX_POD=$(kubectl_retry -n rook-ceph get pods -l app=rook-ceph-tools -o jsonpath='{.items[0].metadata.name}')
        TOOLBOX_POD_STATUS=$(kubectl_retry -n rook-ceph get pod "$TOOLBOX_POD" -ojsonpath='{.status.phase}')

        # the rbd command can only be executed once the toolbox is running
        if [[ "$TOOLBOX_POD_STATUS" != "Running" ]]; then
            echo "Toolbox POD ($TOOLBOX_POD) status: [$TOOLBOX_POD_STATUS]"
            continue
        fi

        if kubectl_retry exec -n rook-ceph "$TOOLBOX_POD" -it -- rbd pool stats "$RBD_POOL_NAME" >/dev/null 2>&1; then
            echo "RBD ($RBD_POOL_NAME) is successfully created..."
            break
        fi
    done

    # the counter only passes the limit when the loop ended without a break
    if [ "$retry" -gt "$ROOK_DEPLOY_TIMEOUT" ]; then
        echo "[Timeout] Failed to get RBD pool stats"
        return 1
    fi
    echo ""
}
|
|
|
|
# Entry point: the first argument selects the operation to run. Any other
# value, including no argument at all, prints the usage text to stderr.
case "${1:-}" in
    deploy) deploy_rook ;;
    teardown) teardown_rook ;;
    create-block-pool) create_block_pool ;;
    delete-block-pool) delete_block_pool ;;
    *)
        # one argument per output line, the empty one yields the final
        # blank line
        printf '%s\n' \
            " $0 [command]" \
            "Available Commands:" \
            "deploy Deploy a rook" \
            "teardown Teardown a rook" \
            "create-block-pool Create a rook block pool" \
            "delete-block-pool Delete a rook block pool" \
            "" >&2
        ;;
esac
|