From 57d3183cb132215265793c47123bf87d65e3f841 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Thu, 17 Jun 2021 11:27:11 +0200 Subject: [PATCH] rbd: restart thick-provisioned PVC cloning after aborting In case cloning a thick-PVC failed during DeepCopy(), the image will exist, but have partial contents. Only when the image has the thick-provisioned metadata set, it has completed DeepCopy(). When the metadata is missing, the image is deleted, and an error is returned to the caller. Kubernetes will automatically retry provisioning on the ABORTED error, and the cloning will get restarted from the beginning. Signed-off-by: Niels de Vos --- internal/rbd/clone.go | 2 -- internal/rbd/controllerserver.go | 28 ++++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/internal/rbd/clone.go b/internal/rbd/clone.go index 88d987468..d7e77ba08 100644 --- a/internal/rbd/clone.go +++ b/internal/rbd/clone.go @@ -152,8 +152,6 @@ func (rv *rbdVolume) createCloneFromImage(ctx context.Context, parentVol *rbdVol } defer j.Destroy() - // TODO: if rv exists, delete the image and start over? - err = rv.doSnapClone(ctx, parentVol) if err != nil { return status.Error(codes.Internal, err.Error()) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 9c6481744..b20c439de 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -272,7 +272,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol if err != nil { return nil, getGRPCErrorForCreateVolume(err) } else if found { - return cs.repairExistingVolume(ctx, req, cr, rbdVol, rbdSnap) + return cs.repairExistingVolume(ctx, req, cr, rbdVol, parentVol, rbdSnap) } err = checkValidCreateVolumeRequest(rbdVol, parentVol, rbdSnap, cr) @@ -335,7 +335,7 @@ func flattenParentImage(ctx context.Context, rbdVol *rbdVolume, cr *util.Credent // that the state is corrected to what was requested. It is needed to call this // when the process of creating a volume was interrupted. func (cs *ControllerServer) repairExistingVolume(ctx context.Context, req *csi.CreateVolumeRequest, - cr *util.Credentials, rbdVol *rbdVolume, rbdSnap *rbdSnapshot) (*csi.CreateVolumeResponse, error) { + cr *util.Credentials, rbdVol, parentVol *rbdVolume, rbdSnap *rbdSnapshot) (*csi.CreateVolumeResponse, error) { vcs := req.GetVolumeContentSource() switch { @@ -364,9 +364,29 @@ func (cs *ControllerServer) repairExistingVolume(ctx context.Context, req *csi.C return nil, err } - // rbdVol is a clone from an other volume + // rbdVol is a clone from parentVol case vcs.GetVolume() != nil: - // TODO: is there really nothing to repair on image clone? + // When cloning into a thick-provisioned volume was happening, + // the image should be marked as thick-provisioned, unless it + // was aborted in flight. In order to restart the + // thick-cloning, delete the volume and let the caller retry + // from the start. + if isThickProvisionRequest(req.GetParameters()) { + thick, err := rbdVol.isThickProvisioned() + if err != nil { + return nil, status.Errorf(codes.Aborted, "failed to verify thick-provisioned volume %q: %s", rbdVol, err) + } else if !thick { + err = cleanUpSnapshot(ctx, parentVol, rbdSnap, rbdVol, cr) + if err != nil { + return nil, status.Errorf(codes.Aborted, "failed to remove partially cloned volume %q: %s", rbdVol, err) + } + err = undoVolReservation(ctx, rbdVol, cr) + if err != nil { + return nil, status.Errorf(codes.Aborted, "failed to remove volume %q from journal: %s", rbdVol, err) + } + return nil, status.Errorf(codes.Aborted, "cloning thick-provisioned volume %q has been interrupted, please retry", rbdVol) + } + } } return buildCreateVolumeResponse(req, rbdVol), nil