mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-01-25 14:19:29 +00:00
c0fbaf4276
There can be spurious failures in the CI when running kubectl create. On occasion, the command returns with an error, but the api-server did receive and process the request. This causes a 2nd create action to fail with messages like: cephcluster.ceph.rook.io/my-cluster created Error from server: error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": etcdserver: request timed out Error from server (AlreadyExists): error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": configmaps "rook-config-override" already exists Error from server (AlreadyExists): error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": cephclusters.ceph.rook.io "my-cluster" already exists Error from server (AlreadyExists): error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": configmaps "rook-config-override" already exists Error from server (AlreadyExists): error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": cephclusters.ceph.rook.io "my-cluster" already exists Error from server (AlreadyExists): error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": configmaps "rook-config-override" already exists Error from server (AlreadyExists): error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": cephclusters.ceph.rook.io "my-cluster" already exists Error from server (AlreadyExists): error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": configmaps "rook-config-override" already exists Error from server (AlreadyExists): error when creating "/tmp/tmp.Ur1ZPG85o9/cluster-test.yaml": cephclusters.ceph.rook.io "my-cluster" already exists By handling the create action differently, and checking for the AlreadyExists word in the stderr output, it is possible to detect repeated creates that are not needed. Signed-off-by: Niels de Vos <ndevos@redhat.com>
225 lines
7.1 KiB
Bash
Executable File
225 lines
7.1 KiB
Bash
Executable File
#!/bin/bash
#
# Deploys and tears down a Rook managed Ceph cluster with kubectl.
# Run without arguments to see the list of available commands.

# Abort on the first unhandled error. The option is set here instead of on
# the shebang line so that it also applies when the script is started as
# "bash <script>", in which case shebang options are ignored.
set -e

# Rook release whose example manifests get deployed (overridable from the
# environment).
ROOK_VERSION=${ROOK_VERSION:-"v1.2.7"}
# Upper bound for the polling loops in the check_* functions; the loops
# advance their counter by 5 for every 5 second sleep (overridable).
ROOK_DEPLOY_TIMEOUT=${ROOK_DEPLOY_TIMEOUT:-300}
# Base URL of the Rook example manifests for the selected release.
ROOK_URL="https://raw.githubusercontent.com/rook/rook/${ROOK_VERSION}/cluster/examples/kubernetes/ceph"
# Name of the additional block pool handled by create-block-pool and
# delete-block-pool (overridable).
ROOK_BLOCK_POOL_NAME=${ROOK_BLOCK_POOL_NAME:-"newrbdpool"}
# Ceph image tag that deploy_rook writes into cluster-test.yaml for
# Rook 1.2 and older.
ROOK_CEPH_CLUSTER_VERSION="v14.2.10"
# Maximum number of attempts made by kubectl_retry.
KUBECTL_RETRY=5
# Seconds kubectl_retry sleeps between two attempts.
KUBECTL_RETRY_DELAY=10
|
|
|
|
# rook_version prints one dot-separated field of ROOK_VERSION after a
# leading "v" has been stripped.
#
# Arguments: $1 - 1-based field number (deploy_rook uses 1 for the major
#                 and 2 for the minor version)
# Outputs:   the selected field on stdout
# Returns:   non-zero when the field number is missing or cut fails
rook_version() {
    # without a field number cut only reports a generic usage error, so
    # fail early with a message that names this function
    if [ -z "${1:-}" ]; then
        echo "rook_version: missing field number" >&2
        return 1
    fi

    printf '%s\n' "${ROOK_VERSION#v}" | cut -d'.' -f"${1}"
}
|
|
|
|
# kubectl_retry runs "kubectl <arguments>" and repeats the command when it
# fails.
#
# Arguments: all arguments are passed to kubectl unchanged; the first one is
#            additionally inspected and enables the "AlreadyExists"
#            handling when it is "create".
# Globals:   KUBECTL_RETRY (maximum number of attempts, read),
#            KUBECTL_RETRY_DELAY (seconds between attempts, read)
# Outputs:   stdout and stderr of the last kubectl attempt are replayed on
#            the stdout and stderr of this function.
# Returns:   0 when an attempt succeeded, or when a "create" failed only
#            with "AlreadyExists" errors; 1 when all attempts failed or the
#            temporary files could not be created.
kubectl_retry() {
    local retries=0 action="${1}" ret=0 stdout stderr
    shift

    # temporary files for kubectl output
    stdout=$(mktemp rook-kubectl-stdout.XXXXXXXX) || return 1
    stderr=$(mktemp rook-kubectl-stderr.XXXXXXXX) || {
        rm -f "${stdout}"
        return 1
    }

    while ! kubectl "${action}" "${@}" 2>"${stderr}" 1>"${stdout}"
    do
        # In case of a failure when running "create", ignore errors with
        # "AlreadyExists": an earlier attempt did create the objects even
        # though kubectl reported an error.
        #
        # The check runs as an "if" condition on purpose. grep exits
        # non-zero when it selects no lines, which is the success case
        # here, and that must not abort a script running with "set -e".
        # A failure without any message on stderr is not a success.
        if [ "${action}" == 'create' ] && [ -s "${stderr}" ] &&
            ! grep -qvw 'AlreadyExists' "${stderr}"
        then
            # Success! Every line on stderr is an "AlreadyExists" error.
            break
        fi

        retries=$((retries+1))
        if [ "${retries}" -eq "${KUBECTL_RETRY}" ]
        then
            # ret is only ever set here, so a successful retry after an
            # earlier failure still returns 0
            ret=1
            break
        fi

        sleep "${KUBECTL_RETRY_DELAY}"
    done

    # Write output so that calling functions can consume it. The inherited
    # descriptors are used instead of re-opening /dev/stdout and
    # /dev/stderr, re-opening truncates a log file that the script output
    # was redirected to.
    cat "${stdout}"
    cat "${stderr}" >&2

    rm -f "${stdout}" "${stderr}"

    return "${ret}"
}
|
|
|
|
# deploy_rook creates the Rook manifests found below ROOK_URL and waits for
# the resulting cluster, filesystem and block pool.
#
# Globals:  ROOK_URL, ROOK_CEPH_CLUSTER_VERSION (read); ROOK_MAJOR,
#           ROOK_MINOR, TEMP_DIR and, for Rook 1.2 and older,
#           ROOK_CEPH_CLUSTER_VERSION_IMAGE_PATH (written)
# Outputs:  kubectl and curl output; the patched cluster-test.yaml is
#           printed for Rook 1.2 and older
deploy_rook() {
    local manifest cluster_yaml

    for manifest in common operator; do
        kubectl_retry create -f "${ROOK_URL}/${manifest}.yaml"
    done

    # find out the rook version to decide how cluster-test.yaml is patched
    ROOK_MAJOR=$(rook_version 1)
    ROOK_MINOR=$(rook_version 2)

    # cluster-test.yaml is downloaded so that it can be edited before it is
    # passed to kubectl
    TEMP_DIR="$(mktemp -d)"
    cluster_yaml="${TEMP_DIR}/cluster-test.yaml"
    curl -o "${cluster_yaml}" "${ROOK_URL}/cluster-test.yaml"

    if [ "${ROOK_MAJOR}" -eq 1 ] && [ "${ROOK_MINOR}" -le 2 ]; then
        # upgrade ceph cluster version to 14.2.10 to support CephFS snapshot
        # functionalities: every "image..." text is replaced with the
        # selected ceph image
        ROOK_CEPH_CLUSTER_VERSION_IMAGE_PATH="image: ceph/ceph:${ROOK_CEPH_CLUSTER_VERSION}"
        sed -i "s|image.*|${ROOK_CEPH_CLUSTER_VERSION_IMAGE_PATH}|g" "${cluster_yaml}"
        cat "${cluster_yaml}"
    elif ! grep -q mon_warn_on_pool_no_redundancy "${cluster_yaml}"; then
        # add "mon_warn_on_pool_no_redundancy = false" to ceph.conf if missing
        # see https://github.com/rook/rook/pull/5925 for upstream status
        sed -i '/osd_pool_default_size =/a \ mon_warn_on_pool_no_redundancy = false' "${cluster_yaml}"
    fi

    kubectl_retry create -f "${cluster_yaml}"
    rm -rf "${TEMP_DIR}"

    for manifest in toolbox filesystem-test pool-test; do
        kubectl_retry create -f "${ROOK_URL}/${manifest}.yaml"
    done

    # Each check is skipped when the matching resource list is empty, which
    # kubectl prints as "items: []".
    if ! kubectl_retry -n rook-ceph get cephclusters -oyaml | grep 'items: \[\]' &>/dev/null; then
        check_ceph_cluster_health
    fi

    if ! kubectl_retry -n rook-ceph get cephfilesystems -oyaml | grep 'items: \[\]' &>/dev/null; then
        check_mds_stat
    fi

    if ! kubectl_retry -n rook-ceph get cephblockpools -oyaml | grep 'items: \[\]' &>/dev/null; then
        check_rbd_stat ""
    fi
}
|
|
|
|
# teardown_rook deletes the objects of the Rook manifests below ROOK_URL,
# in the reverse order of their creation by deploy_rook. Plain kubectl is
# used here, the deletes are not retried.
teardown_rook() {
    local manifest

    for manifest in pool-test filesystem-test toolbox cluster-test operator common; do
        kubectl delete -f "${ROOK_URL}/${manifest}.yaml"
    done
}
|
|
|
|
# create_block_pool downloads pool-test.yaml into the current directory,
# renames the pool "replicapool" to ROOK_BLOCK_POOL_NAME, creates it and
# waits until "rbd pool stats" succeeds for the new pool.
create_block_pool() {
    local manifest="newpool.yaml"

    curl -o "${manifest}" "${ROOK_URL}/pool-test.yaml"
    sed -i "s/replicapool/$ROOK_BLOCK_POOL_NAME/g" "${manifest}"
    kubectl_retry create -f "./${manifest}"
    rm -f "./${manifest}"

    check_rbd_stat "$ROOK_BLOCK_POOL_NAME"
}
|
|
|
|
# delete_block_pool downloads pool-test.yaml into the current directory,
# renames the pool "replicapool" to ROOK_BLOCK_POOL_NAME and deletes the
# objects described by the result. The delete is not retried.
delete_block_pool() {
    local manifest="newpool.yaml"

    curl -o "${manifest}" "${ROOK_URL}/pool-test.yaml"
    sed -i "s/replicapool/$ROOK_BLOCK_POOL_NAME/g" "${manifest}"
    kubectl delete -f "./${manifest}"
    rm -f "./${manifest}"
}
|
|
|
|
# check_ceph_cluster_health polls the first CephCluster in the rook-ceph
# namespace until its state is "Created" and its health is "HEALTH_OK".
#
# Globals:  ROOK_DEPLOY_TIMEOUT (read); retry, CEPH_STATE, CEPH_HEALTH
#           (written)
# Exits the script with status 1 when the retry counter, which grows by 5
# for every 5 second sleep, passes ROOK_DEPLOY_TIMEOUT.
check_ceph_cluster_health() {
    for ((retry = 0; retry <= ROOK_DEPLOY_TIMEOUT; retry += 5)); do
        echo "Wait for rook deploy... ${retry}s" && sleep 5

        CEPH_STATE=$(kubectl_retry -n rook-ceph get cephclusters -o jsonpath='{.items[0].status.state}')
        CEPH_HEALTH=$(kubectl_retry -n rook-ceph get cephclusters -o jsonpath='{.items[0].status.ceph.health}')
        echo "Checking CEPH cluster state: [$CEPH_STATE]"

        # both conditions have to hold before the wait is over
        if [ "$CEPH_STATE" = "Created" ] && [ "$CEPH_HEALTH" = "HEALTH_OK" ]; then
            echo "Creating CEPH cluster is done. [$CEPH_HEALTH]"
            break
        fi
    done

    # the counter only passes the limit when the loop ended without a break
    if [ "$retry" -gt "$ROOK_DEPLOY_TIMEOUT" ]; then
        echo "[Timeout] CEPH cluster not in a healthy state (timeout)"
        exit 1
    fi
    echo ""
}
|
|
|
|
# check_mds_stat polls until the CephFilesystem "myfs" has an activeCount of
# at least 1 in its spec and the pod listing for rook_file_system=myfs
# contains "Running".
#
# Globals:  ROOK_DEPLOY_TIMEOUT (read); retry, FS_NAME, ACTIVE_COUNT,
#           ACTIVE_COUNT_NUM (written)
# Exits the script with status 1 when the retry counter, which grows by 5
# for every 5 second sleep, passes ROOK_DEPLOY_TIMEOUT.
#
# NOTE(review): FS_NAME is only used in the log messages, the queries
# hard-code "myfs" - confirm that this is intended.
check_mds_stat() {
    for ((retry = 0; retry <= ROOK_DEPLOY_TIMEOUT; retry += 5)); do
        FS_NAME=$(kubectl_retry -n rook-ceph get cephfilesystems.ceph.rook.io -ojsonpath='{.items[0].metadata.name}')
        echo "Checking MDS ($FS_NAME) stats... ${retry}s" && sleep 5

        ACTIVE_COUNT=$(kubectl_retry -n rook-ceph get cephfilesystems myfs -ojsonpath='{.spec.metadataServer.activeCount}')

        # arithmetic expansion turns an empty result into 0
        ACTIVE_COUNT_NUM=$((ACTIVE_COUNT + 0))
        echo "MDS ($FS_NAME) active_count: [$ACTIVE_COUNT_NUM]"

        # the pods are only inspected once an active MDS is configured
        if ((ACTIVE_COUNT_NUM >= 1)) &&
            kubectl_retry -n rook-ceph get pod -l rook_file_system=myfs | grep Running &>/dev/null; then
            echo "Filesystem ($FS_NAME) is successfully created..."
            break
        fi
    done

    # the counter only passes the limit when the loop ended without a break
    if [ "$retry" -gt "$ROOK_DEPLOY_TIMEOUT" ]; then
        echo "[Timeout] Failed to get ceph filesystem pods"
        exit 1
    fi
    echo ""
}
|
|
|
|
# check_rbd_stat polls until "rbd pool stats" succeeds for a pool when it is
# executed in the toolbox pod (label app=rook-ceph-tools).
#
# Arguments: $1 - name of the pool; when empty, the name of the first
#                 CephBlockPool in the rook-ceph namespace is used
# Globals:   ROOK_DEPLOY_TIMEOUT (read); retry, RBD_POOL_NAME, TOOLBOX_POD,
#            TOOLBOX_POD_STATUS (written)
# Exits the script with status 1 when the retry counter, which grows by 5
# for every 5 second sleep, passes ROOK_DEPLOY_TIMEOUT.
check_rbd_stat() {
    for ((retry = 0; retry <= ROOK_DEPLOY_TIMEOUT; retry += 5)); do
        if [ -n "$1" ]; then
            RBD_POOL_NAME=$1
        else
            RBD_POOL_NAME=$(kubectl_retry -n rook-ceph get cephblockpools -ojsonpath='{.items[0].metadata.name}')
        fi
        echo "Checking RBD ($RBD_POOL_NAME) stats... ${retry}s" && sleep 5

        TOOLBOX_POD=$(kubectl_retry -n rook-ceph get pods -l app=rook-ceph-tools -o jsonpath='{.items[0].metadata.name}')
        TOOLBOX_POD_STATUS=$(kubectl_retry -n rook-ceph get pod "$TOOLBOX_POD" -ojsonpath='{.status.phase}')

        # rbd can only be executed once the toolbox pod is running
        if [[ "$TOOLBOX_POD_STATUS" != "Running" ]]; then
            echo "Toolbox POD ($TOOLBOX_POD) status: [$TOOLBOX_POD_STATUS]"
            continue
        fi

        if kubectl_retry exec -n rook-ceph "$TOOLBOX_POD" -it -- rbd pool stats "$RBD_POOL_NAME" &>/dev/null; then
            echo "RBD ($RBD_POOL_NAME) is successfully created..."
            break
        fi
    done

    # the counter only passes the limit when the loop ended without a break
    if [ "$retry" -gt "$ROOK_DEPLOY_TIMEOUT" ]; then
        echo "[Timeout] Failed to get RBD pool stats"
        exit 1
    fi
    echo ""
}
|
|
|
|
# Run the function that belongs to the command given as first argument.
# Without a command, or with an unknown one, the usage text is written to
# stderr.
case "${1:-}" in
deploy) deploy_rook ;;
teardown) teardown_rook ;;
create-block-pool) create_block_pool ;;
delete-block-pool) delete_block_pool ;;
*)
    cat >&2 <<EOF
 $0 [command]
Available Commands:
deploy Deploy a rook
teardown Teardown a rook
create-block-pool Create a rook block pool
delete-block-pool Delete a rook block pool

EOF
    ;;
esac
|