rbd: restart thick-provisioned PVC cloning after aborting

In case cloning a thick-PVC failed during DeepCopy(), the image will
exist, but have partial contents. Only when the image has the
thick-provisioned metadata set, it has completed DeepCopy().

When the metadata is missing, the image is deleted, and an error is
returned to the caller. Kubernetes will automatically retry provisioning
on the ABORTED error, and the cloning will get restarted from the
beginning.

Signed-off-by: Niels de Vos <ndevos@redhat.com>
This commit is contained in:
Niels de Vos 2021-06-17 11:27:11 +02:00 committed by mergify[bot]
parent b1045364d9
commit 57d3183cb1
2 changed files with 24 additions and 6 deletions

View File

@ -152,8 +152,6 @@ func (rv *rbdVolume) createCloneFromImage(ctx context.Context, parentVol *rbdVol
} }
defer j.Destroy() defer j.Destroy()
// TODO: if rv exists, delete the image and start over?
err = rv.doSnapClone(ctx, parentVol) err = rv.doSnapClone(ctx, parentVol)
if err != nil { if err != nil {
return status.Error(codes.Internal, err.Error()) return status.Error(codes.Internal, err.Error())

View File

@ -272,7 +272,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
if err != nil { if err != nil {
return nil, getGRPCErrorForCreateVolume(err) return nil, getGRPCErrorForCreateVolume(err)
} else if found { } else if found {
return cs.repairExistingVolume(ctx, req, cr, rbdVol, rbdSnap) return cs.repairExistingVolume(ctx, req, cr, rbdVol, parentVol, rbdSnap)
} }
err = checkValidCreateVolumeRequest(rbdVol, parentVol, rbdSnap, cr) err = checkValidCreateVolumeRequest(rbdVol, parentVol, rbdSnap, cr)
@ -335,7 +335,7 @@ func flattenParentImage(ctx context.Context, rbdVol *rbdVolume, cr *util.Credent
// that the state is corrected to what was requested. It is needed to call this // that the state is corrected to what was requested. It is needed to call this
// when the process of creating a volume was interrupted. // when the process of creating a volume was interrupted.
func (cs *ControllerServer) repairExistingVolume(ctx context.Context, req *csi.CreateVolumeRequest, func (cs *ControllerServer) repairExistingVolume(ctx context.Context, req *csi.CreateVolumeRequest,
cr *util.Credentials, rbdVol *rbdVolume, rbdSnap *rbdSnapshot) (*csi.CreateVolumeResponse, error) { cr *util.Credentials, rbdVol, parentVol *rbdVolume, rbdSnap *rbdSnapshot) (*csi.CreateVolumeResponse, error) {
vcs := req.GetVolumeContentSource() vcs := req.GetVolumeContentSource()
switch { switch {
@ -364,9 +364,29 @@ func (cs *ControllerServer) repairExistingVolume(ctx context.Context, req *csi.C
return nil, err return nil, err
} }
// rbdVol is a clone from an other volume // rbdVol is a clone from parentVol
case vcs.GetVolume() != nil: case vcs.GetVolume() != nil:
// TODO: is there really nothing to repair on image clone? // When cloning into a thick-provisioned volume was happening,
// the image should be marked as thick-provisioned, unless it
// was aborted in flight. In order to restart the
// thick-cloning, delete the volume and let the caller retry
// from the start.
if isThickProvisionRequest(req.GetParameters()) {
thick, err := rbdVol.isThickProvisioned()
if err != nil {
return nil, status.Errorf(codes.Aborted, "failed to verify thick-provisioned volume %q: %s", rbdVol, err)
} else if !thick {
err = cleanUpSnapshot(ctx, parentVol, rbdSnap, rbdVol, cr)
if err != nil {
return nil, status.Errorf(codes.Aborted, "failed to remove partially cloned volume %q: %s", rbdVol, err)
}
err = undoVolReservation(ctx, rbdVol, cr)
if err != nil {
return nil, status.Errorf(codes.Aborted, "failed to remove volume %q from journal: %s", rbdVol, err)
}
return nil, status.Errorf(codes.Aborted, "cloning thick-provisioned volume %q has been interrupted, please retry", rbdVol)
}
}
} }
return buildCreateVolumeResponse(req, rbdVol), nil return buildCreateVolumeResponse(req, rbdVol), nil