Refactor voljournal to aid reuse with CephFS

and to also inmprove the code reuse in rbd itself.

Signed-off-by: ShyamsundarR <srangana@redhat.com>
This commit is contained in:
ShyamsundarR 2019-05-14 15:15:01 -04:00 committed by mergify[bot]
parent b6b7cf2c3d
commit 1406f29dcd
11 changed files with 767 additions and 686 deletions

View File

@ -146,7 +146,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
}
defer func() {
if err != nil {
errDefer := unreserveVol(rbdVol, req.GetSecrets())
errDefer := undoVolReservation(rbdVol, req.GetSecrets())
if errDefer != nil {
klog.Warningf("failed undoing reservation of volume: %s (%s)", req.GetName(), errDefer)
}
@ -257,7 +257,7 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
}
}()
if err := unreserveVol(rbdVol, req.GetSecrets()); err != nil {
if err := undoVolReservation(rbdVol, req.GetSecrets()); err != nil {
return nil, status.Error(codes.Internal, err.Error())
}
return &csi.DeleteVolumeResponse{}, nil
@ -347,7 +347,7 @@ func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateS
// check for the requested source volume id and already allocated source volume id
found, err := checkSnapExists(rbdSnap, req.GetSecrets())
if err != nil {
if _, ok := err.(ErrSnapNameConflict); ok {
if _, ok := err.(util.ErrSnapNameConflict); ok {
return nil, status.Error(codes.AlreadyExists, err.Error())
}
@ -371,7 +371,7 @@ func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateS
}
defer func() {
if err != nil {
errDefer := unreserveSnap(rbdSnap, req.GetSecrets())
errDefer := undoSnapReservation(rbdSnap, req.GetSecrets())
if errDefer != nil {
klog.Warningf("failed undoing reservation of snapshot: %s %v", req.GetName(), errDefer)
}
@ -483,7 +483,7 @@ func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteS
if _, ok := err.(ErrSnapNotFound); !ok {
return nil, status.Error(codes.Internal, err.Error())
}
if err := unreserveSnap(rbdSnap, req.GetSecrets()); err != nil {
if err := undoSnapReservation(rbdSnap, req.GetSecrets()); err != nil {
return nil, status.Error(codes.Internal, err.Error())
}
return &csi.DeleteSnapshotResponse{}, nil

View File

@ -37,17 +37,6 @@ func (e ErrSnapNotFound) Error() string {
return e.err.Error()
}
// ErrSnapNameConflict is generated when a requested CSI snap name already exists on RBD but with
// different properties, and hence is in conflict with the passed in CSI volume name
type ErrSnapNameConflict struct {
requestName string
err error
}
func (e ErrSnapNameConflict) Error() string {
return e.err.Error()
}
// ErrVolNameConflict is generated when a requested CSI volume name already exists on RBD but with
// different properties, and hence is in conflict with the passed in CSI volume name
type ErrVolNameConflict struct {

View File

@ -123,7 +123,7 @@ func (ns *NodeServer) getVolumeName(req *csi.NodePublishVolumeRequest) (string,
return "", status.Error(codes.InvalidArgument, err.Error())
}
return rbdImgNamePrefix + vi.ObjectUUID, nil
return volJournal.NamingPrefix() + vi.ObjectUUID, nil
}
func (ns *NodeServer) mountVolume(req *csi.NodePublishVolumeRequest, devicePath string) error {

View File

@ -27,70 +27,6 @@ import (
"k8s.io/utils/nsenter"
)
/*
RADOS omaps usage:
This note details how we preserve idempotent nature of create requests and retain the relationship
between orchestrator (CO) generated Names and plugin generated names for images and snapshots
The implementation uses Ceph RADOS omaps to preserve the relationship between request name and
generated image (or snapshot) name. There are 4 types of omaps in use,
- A "csi.volumes.[csi-id]" (or "csi.volumes"+.+CSIInstanceID), we call this the csiVolsDirectory
- stores keys named using the CO generated names for volume requests
- keys are named "csi.volume."+[CO generated VolName]
- Key value contains the RBD image uuid that is created or will be created, for the CO provided
name
- A "csi.snaps.[csi-id]" (or "csi.snaps"+.+CSIInstanceID), we refer to this as the csiSnapsDirectory
- stores keys named using the CO generated names for snapshot requests
- keys are named "csi.snap."+[CO generated SnapName]
- Key value contains the RBD snapshot uuid that is created or will be created, for the CO
provided name
- A per image omap named "rbd.csi.volume."+[RBD image uuid], we refer to this as the rbdImageOMap
- stores a single key named "csi.volname", that has the value of the CO generated VolName that
this image refers to
- A per snapshot omap named "rbd.csi.snap."+[RBD snapshot uuid], we refer to this as the snapOMap
- stores a key named "csi.snapname", that has the value of the CO generated SnapName that this
snapshot refers to
- also stores another key named "csi.source", that has the value of the image name that is the
source of the snapshot
Creation of omaps:
When a volume create request is received (or a snapshot create, the snapshot is not detailed in this
comment further as the process is similar),
- The csiVolsDirectory is consulted to find if there is already a key with the CO VolName, and if present,
it is used to read its references to reach the RBD image that backs this VolName, to check if the
RBD image can satisfy the requirements for the request
- If during the process of checking the same, it is found that some linking information is stale
or missing, the corresponding keys upto the key in the csiVolsDirectory is cleaned up, to start afresh
- If the key with the CO VolName is not found, or was cleaned up, the request is treated as a
new create request, and an rbdImageOMap is created first with a generated uuid, this ensures that we
do not use a uuid that is already in use
- Next, a key with the VolName is created in the csiVolsDirectory, and its value is updated to store the
generated uuid
- This is followed by updating the rbdImageOMap with the VolName in the rbdImageCSIVolNameKey
- Finally, the image is created (or promoted from a snapshot, if content source was provided) using
the uuid and a corresponding image name prefix (rbdImgNamePrefix or rbdSnapNamePrefix)
The entire operation is locked based on VolName hash, to ensure there is only ever a single entity
modifying the related omaps for a given VolName.
This ensures idempotent nature of creates, as the same CO generated VolName would attempt to use
the same RBD image name to serve the request, as the relations are saved in the respective omaps.
Deletion of omaps:
Delete requests would not contain the VolName, hence deletion uses the volume ID, which is encoded
with the image name in it, to find the image and the rbdImageOMap. The rbdImageOMap is read to get
the VolName that this image points to. This VolName can be further used to read and delete the key
from the csiVolsDirectory.
As we trace back and find the VolName, we also take a hash based lock on the VolName before
proceeding with deleting the image and the related omap entries, to ensure there is only ever a
single entity modifying the related omaps for a given VolName.
*/
const (
// volIDVersion is the version number of volume ID encoding scheme
volIDVersion uint16 = 1
@ -99,34 +35,8 @@ const (
// csiConfigFile is the location of the CSI config file
csiConfigFile = "/etc/ceph-csi-config/config.json"
// CSI volume-name keyname prefix, for key in csiVolsDirectory, suffix is the CSI passed volume name
csiVolNameKeyPrefix = "csi.volume."
// Per RBD image object map name prefix, suffix is the RBD image uuid
rbdImageOMapPrefix = "csi.volume."
// CSI volume-name key in per RBD image object map, containing CSI volume-name for which the
// image was created
rbdImageCSIVolNameKey = "csi.volname"
// RBD image name prefix, suffix is a uuid generated per image
rbdImgNamePrefix = "csi-vol-"
//CSI snap-name keyname prefix, for key in csiSnapsDirectory, suffix is the CSI passed snapshot name
csiSnapNameKeyPrefix = "csi.snap."
// Per RBD snapshot object map name prefix, suffix is the RBD image uuid
rbdSnapOMapPrefix = "csi.snap."
// CSI snap-name key in per RBD snapshot object map, containing CSI snapshot-name for which the
// snapshot was created
rbdSnapCSISnapNameKey = "csi.snapname"
// source image name key in per RBD snapshot object map, containing RBD source image name for
// which the snapshot was created
rbdSnapSourceImageKey = "csi.source"
// RBD snapshot name prefix, suffix is a uuid generated per snapshot
rbdSnapNamePrefix = "csi-snap-"
)
// PluginFolder defines the location of ceph plugin
var PluginFolder = "/var/lib/kubelet/plugins/"
// Driver contains the default identity,node and controller struct
type Driver struct {
cd *csicommon.CSIDriver
@ -138,14 +48,18 @@ type Driver struct {
var (
version = "1.0.0"
// PluginFolder defines the location of ceph plugin
PluginFolder = "/var/lib/kubelet/plugins/"
// CSIInstanceID is the instance ID that is unique to an instance of CSI, used when sharing
// ceph clusters across CSI instances, to differentiate omap names per CSI instance
CSIInstanceID = "default"
// csiVolsDirectory is the name of the CSI volumes object map that contains CSI volume-name
// based keys
csiVolsDirectory = "csi.volumes"
// csiSnapsDirectory is the name of the CSI snapshots object map that contains CSI snapshot-name based keys
csiSnapsDirectory = "csi.snaps"
// volJournal and snapJournal are used to maintain RADOS based journals for CO generated
// VolumeName to backing RBD images
volJournal *util.CSIJournal
snapJournal *util.CSIJournal
)
// NewDriver returns new rbd driver
@ -199,8 +113,14 @@ func (r *Driver) Run(driverName, nodeID, endpoint, instanceID string, containeri
if instanceID != "" {
CSIInstanceID = instanceID
}
csiVolsDirectory = csiVolsDirectory + "." + CSIInstanceID
csiSnapsDirectory = csiSnapsDirectory + "." + CSIInstanceID
// Get an instance of the volume and snapshot journal keys
volJournal = util.NewCSIVolumeJournal()
snapJournal = util.NewCSISnapshotJournal()
// Update keys with CSI instance suffix
volJournal.SetCSIDirectorySuffix(CSIInstanceID)
snapJournal.SetCSIDirectorySuffix(CSIInstanceID)
// Initialize default library driver
r.cd = csicommon.NewCSIDriver(driverName, version, nodeID)

307
pkg/rbd/rbd_journal.go Normal file
View File

@ -0,0 +1,307 @@
/*
Copyright 2019 The Ceph-CSI Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rbd
import (
"fmt"
"github.com/ceph/ceph-csi/pkg/util"
"github.com/pkg/errors"
"k8s.io/klog"
)
func validateNonEmptyField(field, fieldName, structName string) error {
if field == "" {
return fmt.Errorf("value '%s' in '%s' structure cannot be empty", fieldName, structName)
}
return nil
}
func validateRbdSnap(rbdSnap *rbdSnapshot) error {
var err error
if err = validateNonEmptyField(rbdSnap.RequestName, "RequestName", "rbdSnapshot"); err != nil {
return err
}
if err = validateNonEmptyField(rbdSnap.Monitors, "Monitors", "rbdSnapshot"); err != nil {
return err
}
if err = validateNonEmptyField(rbdSnap.AdminID, "AdminID", "rbdSnapshot"); err != nil {
return err
}
if err = validateNonEmptyField(rbdSnap.Pool, "Pool", "rbdSnapshot"); err != nil {
return err
}
if err = validateNonEmptyField(rbdSnap.RbdImageName, "RbdImageName", "rbdSnapshot"); err != nil {
return err
}
if err = validateNonEmptyField(rbdSnap.ClusterID, "ClusterID", "rbdSnapshot"); err != nil {
return err
}
return err
}
func validateRbdVol(rbdVol *rbdVolume) error {
var err error
if err = validateNonEmptyField(rbdVol.RequestName, "RequestName", "rbdVolume"); err != nil {
return err
}
if err = validateNonEmptyField(rbdVol.Monitors, "Monitors", "rbdVolume"); err != nil {
return err
}
if err = validateNonEmptyField(rbdVol.AdminID, "AdminID", "rbdVolume"); err != nil {
return err
}
if err = validateNonEmptyField(rbdVol.Pool, "Pool", "rbdVolume"); err != nil {
return err
}
if err = validateNonEmptyField(rbdVol.ClusterID, "ClusterID", "rbdVolume"); err != nil {
return err
}
if rbdVol.VolSize == 0 {
return errors.New("value 'VolSize' in 'rbdVolume' structure cannot be 0")
}
return err
}
/*
checkSnapExists, and its counterpart checkVolExists, function checks if the passed in rbdSnapshot
or rbdVolume exists on the backend.
**NOTE:** These functions manipulate the rados omaps that hold information regarding
volume names as requested by the CSI drivers. Hence, these need to be invoked only when the
respective CSI driver generated snapshot or volume name based locks are held, as otherwise racy
access to these omaps may end up leaving them in an inconsistent state.
These functions need enough information about cluster and pool (ie, Monitors, Pool, IDs filled in)
to operate. They further require that the RequestName element of the structure have a valid value
to operate on and determine if the said RequestName already exists on the backend.
These functions populate the snapshot or the image name, its attributes and the CSI snapshot/volume
ID for the same when successful.
These functions also cleanup omap reservations that are stale. I.e when omap entries exist and
backing images or snapshots are missing, or one of the omaps exist and the next is missing. This is
because, the order of omap creation and deletion are inverse of each other, and protected by the
request name lock, and hence any stale omaps are leftovers from incomplete transactions and are
hence safe to garbage collect.
*/
func checkSnapExists(rbdSnap *rbdSnapshot, credentials map[string]string) (bool, error) {
err := validateRbdSnap(rbdSnap)
if err != nil {
return false, err
}
key, err := getKey(rbdSnap.AdminID, credentials)
if err != nil {
return false, err
}
snapUUID, err := snapJournal.CheckReservation(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnap.RequestName, rbdSnap.RbdImageName)
if err != nil {
return false, err
}
if snapUUID == "" {
return false, nil
}
rbdSnap.RbdSnapName = snapJournal.NamingPrefix() + snapUUID
// Fetch on-disk image attributes
err = updateSnapWithImageInfo(rbdSnap, credentials)
if err != nil {
if _, ok := err.(ErrSnapNotFound); ok {
err = snapJournal.UndoReservation(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnap.RbdSnapName, rbdSnap.RequestName)
return false, err
}
return false, err
}
// found a snapshot already available, process and return its information
rbdSnap.SnapID, err = util.GenerateVolID(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnap.ClusterID, snapUUID, volIDVersion)
if err != nil {
return false, err
}
klog.V(4).Infof("found existing snap (%s) with snap name (%s) for request (%s)",
rbdSnap.SnapID, rbdSnap.RbdSnapName, rbdSnap.RequestName)
return true, nil
}
/*
Check comment on checkSnapExists, to understand how this function behaves
**NOTE:** These functions manipulate the rados omaps that hold information regarding
volume names as requested by the CSI drivers. Hence, these need to be invoked only when the
respective CSI snapshot or volume name based locks are held, as otherwise racy access to these
omaps may end up leaving the omaps in an inconsistent state.
*/
func checkVolExists(rbdVol *rbdVolume, credentials map[string]string) (bool, error) {
err := validateRbdVol(rbdVol)
if err != nil {
return false, err
}
key, err := getKey(rbdVol.AdminID, credentials)
if err != nil {
return false, err
}
imageUUID, err := volJournal.CheckReservation(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
rbdVol.RequestName, "")
if err != nil {
return false, err
}
if imageUUID == "" {
return false, nil
}
rbdVol.RbdImageName = volJournal.NamingPrefix() + imageUUID
// NOTE: Return volsize should be on-disk volsize, not request vol size, so
// save it for size checks before fetching image data
requestSize := rbdVol.VolSize
// Fetch on-disk image attributes and compare against request
err = updateVolWithImageInfo(rbdVol, credentials)
if err != nil {
if _, ok := err.(ErrImageNotFound); ok {
err = volJournal.UndoReservation(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
rbdVol.RbdImageName, rbdVol.RequestName)
return false, err
}
return false, err
}
// size checks
if rbdVol.VolSize < requestSize {
err = fmt.Errorf("image with the same name (%s) but with different size already exists",
rbdVol.RbdImageName)
return false, ErrVolNameConflict{rbdVol.RbdImageName, err}
}
// TODO: We should also ensure image features and format is the same
// found a volume already available, process and return it!
rbdVol.VolID, err = util.GenerateVolID(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
rbdVol.ClusterID, imageUUID, volIDVersion)
if err != nil {
return false, err
}
klog.V(4).Infof("found existng volume (%s) with image name (%s) for request (%s)",
rbdVol.VolID, rbdVol.RbdImageName, rbdVol.RequestName)
return true, nil
}
// reserveSnap is a helper routine to request a rbdSnapshot name reservation and generate the
// volume ID for the generated name
func reserveSnap(rbdSnap *rbdSnapshot, credentials map[string]string) error {
key, err := getKey(rbdSnap.AdminID, credentials)
if err != nil {
return err
}
snapUUID, err := snapJournal.ReserveName(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnap.RequestName, rbdSnap.RbdImageName)
if err != nil {
return err
}
rbdSnap.SnapID, err = util.GenerateVolID(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnap.ClusterID, snapUUID, volIDVersion)
if err != nil {
return err
}
rbdSnap.RbdSnapName = snapJournal.NamingPrefix() + snapUUID
klog.V(4).Infof("generated Volume ID (%s) and image name (%s) for request name (%s)",
rbdSnap.SnapID, rbdSnap.RbdImageName, rbdSnap.RequestName)
return nil
}
// reserveVol is a helper routine to request a rbdVolume name reservation and generate the
// volume ID for the generated name
func reserveVol(rbdVol *rbdVolume, credentials map[string]string) error {
key, err := getKey(rbdVol.AdminID, credentials)
if err != nil {
return err
}
imageUUID, err := volJournal.ReserveName(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
rbdVol.RequestName, "")
if err != nil {
return err
}
rbdVol.VolID, err = util.GenerateVolID(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
rbdVol.ClusterID, imageUUID, volIDVersion)
if err != nil {
return err
}
rbdVol.RbdImageName = volJournal.NamingPrefix() + imageUUID
klog.V(4).Infof("generated Volume ID (%s) and image name (%s) for request name (%s)",
rbdVol.VolID, rbdVol.RbdImageName, rbdVol.RequestName)
return nil
}
// undoSnapReservation is a helper routine to undo a name reservation for rbdSnapshot
func undoSnapReservation(rbdSnap *rbdSnapshot, credentials map[string]string) error {
key, err := getKey(rbdSnap.AdminID, credentials)
if err != nil {
return err
}
err = snapJournal.UndoReservation(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnap.RbdSnapName, rbdSnap.RequestName)
return err
}
// undoVolReservation is a helper routine to undo a name reservation for rbdVolume
func undoVolReservation(rbdVol *rbdVolume, credentials map[string]string) error {
key, err := getKey(rbdVol.AdminID, credentials)
if err != nil {
return err
}
err = volJournal.UndoReservation(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
rbdVol.RbdImageName, rbdVol.RequestName)
return err
}

View File

@ -211,7 +211,7 @@ func deleteImage(pOpts *rbdVolume, adminID string, credentials map[string]string
return err
}
err = unreserveVol(pOpts, credentials)
err = undoVolReservation(pOpts, credentials)
if err != nil {
klog.Errorf("failed to remove reservation for volume (%s) with backing image (%s) (%s)",
pOpts.RequestName, pOpts.RbdImageName, err)
@ -292,7 +292,7 @@ func genSnapFromSnapID(rbdSnap *rbdSnapshot, snapshotID string, credentials map[
rbdSnap.ClusterID = vi.ClusterID
options["clusterID"] = rbdSnap.ClusterID
rbdSnap.RbdSnapName = rbdSnapNamePrefix + vi.ObjectUUID
rbdSnap.RbdSnapName = snapJournal.NamingPrefix() + vi.ObjectUUID
rbdSnap.Monitors, _, err = getMonsAndClusterID(options)
if err != nil {
@ -311,16 +311,8 @@ func genSnapFromSnapID(rbdSnap *rbdSnapshot, snapshotID string, credentials map[
return err
}
// TODO: fetch all omap vals in one call, than make multiple listomapvals
snapUUID := strings.TrimPrefix(rbdSnap.RbdSnapName, rbdSnapNamePrefix)
rbdSnap.RequestName, err = util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID,
key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID, rbdSnapCSISnapNameKey)
if err != nil {
return err
}
rbdSnap.RbdImageName, err = util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID,
key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID, rbdSnapSourceImageKey)
rbdSnap.RequestName, rbdSnap.RbdImageName, err = snapJournal.GetObjectUUIDData(rbdSnap.Monitors,
rbdSnap.AdminID, key, rbdSnap.Pool, vi.ObjectUUID, true)
if err != nil {
return err
}
@ -352,7 +344,7 @@ func genVolFromVolID(rbdVol *rbdVolume, volumeID string, credentials map[string]
rbdVol.ClusterID = vi.ClusterID
options["clusterID"] = rbdVol.ClusterID
rbdVol.RbdImageName = rbdImgNamePrefix + vi.ObjectUUID
rbdVol.RbdImageName = volJournal.NamingPrefix() + vi.ObjectUUID
rbdVol.Monitors, _, err = getMonsAndClusterID(options)
if err != nil {
@ -372,9 +364,8 @@ func genVolFromVolID(rbdVol *rbdVolume, volumeID string, credentials map[string]
return err
}
imageUUID := strings.TrimPrefix(rbdVol.RbdImageName, rbdImgNamePrefix)
rbdVol.RequestName, err = util.GetOMapValue(rbdVol.Monitors, rbdVol.AdminID,
key, rbdVol.Pool, rbdImageOMapPrefix+imageUUID, rbdImageCSIVolNameKey)
rbdVol.RequestName, _, err = volJournal.GetObjectUUIDData(rbdVol.Monitors,
rbdVol.AdminID, key, rbdVol.Pool, vi.ObjectUUID, false)
if err != nil {
return err
}
@ -608,7 +599,7 @@ func deleteSnapshot(pOpts *rbdSnapshot, adminID string, credentials map[string]s
return errors.Wrapf(err, "failed to delete snapshot, command output: %s", string(output))
}
if err := unreserveSnap(pOpts, credentials); err != nil {
if err := undoSnapReservation(pOpts, credentials); err != nil {
klog.Errorf("failed to remove reservation for snapname (%s) with backing snap (%s) on image (%s) (%s)",
pOpts.RequestName, pOpts.RbdSnapName, pOpts.RbdImageName, err)
}

View File

@ -1,554 +0,0 @@
/*
Copyright 2019 The Ceph-CSI Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package rbd
import (
"fmt"
"strings"
"github.com/ceph/ceph-csi/pkg/util"
"github.com/pborman/uuid"
"github.com/pkg/errors"
"k8s.io/klog"
)
func validateNonEmptyField(field, fieldName, structName string) error {
if field == "" {
return fmt.Errorf("value '%s' in '%s' structure cannot be empty", fieldName, structName)
}
return nil
}
func validateRbdSnap(rbdSnap *rbdSnapshot) error {
if err := validateNonEmptyField(rbdSnap.RequestName, "RequestName", "rbdSnapshot"); err != nil {
return err
}
if err := validateNonEmptyField(rbdSnap.Monitors, "Monitors", "rbdSnapshot"); err != nil {
return err
}
if err := validateNonEmptyField(rbdSnap.AdminID, "AdminID", "rbdSnapshot"); err != nil {
return err
}
if err := validateNonEmptyField(rbdSnap.Pool, "Pool", "rbdSnapshot"); err != nil {
return err
}
if err := validateNonEmptyField(rbdSnap.RbdImageName, "RbdImageName", "rbdSnapshot"); err != nil {
return err
}
if err := validateNonEmptyField(rbdSnap.ClusterID, "ClusterID", "rbdSnapshot"); err != nil {
return err
}
return nil
}
func validateRbdVol(rbdVol *rbdVolume) error {
if err := validateNonEmptyField(rbdVol.RequestName, "RequestName", "rbdVolume"); err != nil {
return err
}
if err := validateNonEmptyField(rbdVol.Monitors, "Monitors", "rbdVolume"); err != nil {
return err
}
if err := validateNonEmptyField(rbdVol.AdminID, "AdminID", "rbdVolume"); err != nil {
return err
}
if err := validateNonEmptyField(rbdVol.Pool, "Pool", "rbdVolume"); err != nil {
return err
}
if err := validateNonEmptyField(rbdVol.ClusterID, "ClusterID", "rbdVolume"); err != nil {
return err
}
if rbdVol.VolSize == 0 {
return errors.New("value 'VolSize' in 'rbdVolume' structure cannot be 0")
}
return nil
}
/*
checkSnapExists, and its counterpart checkVolExists, function as checks to determine if passed
in rbdSnapshot or rbdVolume exists on the backend.
**NOTE:** These functions manipulate the rados omaps that hold information regarding
volume names as requested by the CSI drivers. Hence, these need to be invoked only when the
respective CSI driver generated snapshot or volume name based locks are held, as otherwise racy
access to these omaps may end up leaving them in an inconsistent state.
These functions need enough information about cluster and pool (ie, Monitors, Pool, IDs filled in)
to operate. They further require that the RequestName element of the structure have a valid value
to operate on and determine if the said RequestName already exists on the backend.
These functions populate the snapshot or the image name, its attributes and the CSI snapshot/volume
ID for the same when succesful.
These functions also cleanup omap reservations that are stale. I.e when omap entries exist and
backing images or snapshots are missing, or one of the omaps exist and the next is missing. This is
because, the order of omap creation and deletion are inverse of each other, and protected by the
request name lock, and hence any stale omaps are leftovers from incomplete transactions and are
hence safe to garbage collect.
*/
func checkSnapExists(rbdSnap *rbdSnapshot, credentials map[string]string) (found bool, err error) {
if err = validateRbdSnap(rbdSnap); err != nil {
return false, err
}
key, err := getKey(rbdSnap.AdminID, credentials)
if err != nil {
return false, err
}
// check if request name is already part of the snaps omap
snapUUID, err := util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID,
key, rbdSnap.Pool, csiSnapsDirectory, csiSnapNameKeyPrefix+rbdSnap.RequestName)
if err != nil {
// error should specifically be not found, for image to be absent, any other error
// is not conclusive, and we should not proceed
if _, ok := err.(util.ErrKeyNotFound); ok {
return false, nil
}
return false, err
}
rbdSnap.RbdSnapName = rbdSnapNamePrefix + snapUUID
// TODO: use listomapvals to dump all keys instead of reading them one-by-one
// check if the snapshot image omap is present
savedSnapName, err := util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID,
key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID, rbdSnapCSISnapNameKey)
if err != nil {
if _, ok := err.(util.ErrKeyNotFound); ok {
err = unreserveSnap(rbdSnap, credentials)
}
return false, err
}
// check if snapshot image omap points back to the request name
if savedSnapName != rbdSnap.RequestName {
// NOTE: This should never be possible, hence no cleanup, but log error
// and return, as cleanup may need to occur manually!
return false, fmt.Errorf("internal state inconsistent, omap snap"+
" names disagree, request name (%s) snap name (%s) image omap"+
" snap name (%s)", rbdSnap.RequestName, rbdSnap.RbdSnapName, savedSnapName)
}
// check if the snapshot source image omap is present
savedVolName, err := util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID,
key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID, rbdSnapSourceImageKey)
if err != nil {
if _, ok := err.(util.ErrKeyNotFound); ok {
err = unreserveSnap(rbdSnap, credentials)
}
return false, err
}
// check if snapshot source image omap points back to the source volume passed in
if savedVolName != rbdSnap.RbdImageName {
// NOTE: This can happen if there is a snapname conflict, and we alerady have a snapshot
// with the same name pointing to a different RBD image as the source
err = fmt.Errorf("snapname points to different image, request name (%s)"+
" image name (%s) image omap"+" volume name (%s)",
rbdSnap.RequestName, rbdSnap.RbdImageName, savedVolName)
return false, ErrSnapNameConflict{rbdSnap.RequestName, err}
}
// Fetch on-disk image attributes
err = updateSnapWithImageInfo(rbdSnap, credentials)
if err != nil {
if _, ok := err.(ErrSnapNotFound); ok {
err = unreserveSnap(rbdSnap, credentials)
return false, err
}
return false, err
}
// found a snapshot already available, process and return its information
poolID, err := util.GetPoolID(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool)
if err != nil {
return false, err
}
vi := util.CSIIdentifier{
PoolID: poolID,
EncodingVersion: volIDVersion,
ClusterID: rbdSnap.ClusterID,
ObjectUUID: snapUUID,
}
rbdSnap.SnapID, err = vi.ComposeCSIID()
if err != nil {
return false, err
}
klog.V(4).Infof("Found existing snap (%s) with snap name (%s) for request (%s)",
rbdSnap.SnapID, rbdSnap.RbdSnapName, rbdSnap.RequestName)
return true, nil
}
/*
Check comment on checkSnapExists, to understand how this function behaves
**NOTE:** These functions manipulate the rados omaps that hold information regarding
volume names as requested by the CSI drivers. Hence, these need to be invoked only when the
respective CSI snapshot or volume name based locks are held, as otherwise racy access to these
omaps may end up leaving the omaps in an inconsistent state.
*/
func checkVolExists(rbdVol *rbdVolume, credentials map[string]string) (found bool, err error) {
var vi util.CSIIdentifier
if err = validateRbdVol(rbdVol); err != nil {
return false, err
}
key, err := getKey(rbdVol.AdminID, credentials)
if err != nil {
return false, err
}
// check if request name is already part of the volumes omap
imageUUID, err := util.GetOMapValue(rbdVol.Monitors, rbdVol.AdminID,
key, rbdVol.Pool, csiVolsDirectory, csiVolNameKeyPrefix+rbdVol.RequestName)
if err != nil {
// error should specifically be not found, for image to be absent, any other error
// is not conclusive, and we should not proceed
if _, ok := err.(util.ErrKeyNotFound); ok {
return false, nil
}
return false, err
}
rbdVol.RbdImageName = rbdImgNamePrefix + imageUUID
// check if the image omap is present
savedVolName, err := util.GetOMapValue(rbdVol.Monitors, rbdVol.AdminID,
key, rbdVol.Pool, rbdImageOMapPrefix+imageUUID, rbdImageCSIVolNameKey)
if err != nil {
if _, ok := err.(util.ErrKeyNotFound); ok {
err = unreserveVol(rbdVol, credentials)
}
return false, err
}
// check if image omap points back to the request name
if savedVolName != rbdVol.RequestName {
// NOTE: This should never be possible, hence no cleanup, but log error
// and return, as cleanup may need to occur manually!
return false, fmt.Errorf("internal state inconsistent, omap volume"+
" names disagree, request name (%s) image name (%s) image omap"+
" volume name (%s)", rbdVol.RequestName, rbdVol.RbdImageName, savedVolName)
}
// NOTE: Return volsize should be on-disk volsize, not request vol size, so
// save it for size checks before fetching image data
requestSize := rbdVol.VolSize
// Fetch on-disk image attributes and compare against request
err = updateVolWithImageInfo(rbdVol, credentials)
if err != nil {
if _, ok := err.(ErrImageNotFound); ok {
err = unreserveVol(rbdVol, credentials)
return false, err
}
return false, err
}
// size checks
if rbdVol.VolSize < requestSize {
err = fmt.Errorf("image with the same name (%s) but with different size already exists",
rbdVol.RbdImageName)
return false, ErrVolNameConflict{rbdVol.RbdImageName, err}
}
// TODO: We should also ensure image features and format is the same
// found a volume already available, process and return it!
poolID, err := util.GetPoolID(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool)
if err != nil {
return false, err
}
vi = util.CSIIdentifier{
PoolID: poolID,
EncodingVersion: volIDVersion,
ClusterID: rbdVol.ClusterID,
ObjectUUID: imageUUID,
}
rbdVol.VolID, err = vi.ComposeCSIID()
if err != nil {
return false, err
}
klog.V(4).Infof("Found existng volume (%s) with image name (%s) for request (%s)",
rbdVol.VolID, rbdVol.RbdImageName, rbdVol.RequestName)
return true, nil
}
/*
unreserveSnap and unreserveVol remove omaps associated with the snapshot and the image name,
and also remove the corresponding request name key in the snaps or volumes omaps respectively.
This is performed within the request name lock, to ensure that requests with the same name do not
manipulate the omap entries concurrently.
*/
func unreserveSnap(rbdSnap *rbdSnapshot, credentials map[string]string) error {
key, err := getKey(rbdSnap.AdminID, credentials)
if err != nil {
return err
}
// delete snap image omap (first, inverse of create order)
snapUUID := strings.TrimPrefix(rbdSnap.RbdSnapName, rbdSnapNamePrefix)
err = util.RemoveObject(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID)
if err != nil {
if _, ok := err.(util.ErrObjectNotFound); !ok {
klog.Errorf("failed removing oMap %s (%s)", rbdSnapOMapPrefix+snapUUID, err)
return err
}
}
// delete the request name omap key (last, inverse of create order)
err = util.RemoveOMapKey(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
csiSnapsDirectory, csiSnapNameKeyPrefix+rbdSnap.RequestName)
if err != nil {
klog.Errorf("failed removing oMap key %s (%s)", csiSnapNameKeyPrefix+rbdSnap.RequestName, err)
return err
}
return nil
}
func unreserveVol(rbdVol *rbdVolume, credentials map[string]string) error {
key, err := getKey(rbdVol.AdminID, credentials)
if err != nil {
return err
}
// delete image omap (first, inverse of create order)
imageUUID := strings.TrimPrefix(rbdVol.RbdImageName, rbdImgNamePrefix)
err = util.RemoveObject(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool, rbdImageOMapPrefix+imageUUID)
if err != nil {
if _, ok := err.(util.ErrObjectNotFound); !ok {
klog.Errorf("failed removing oMap %s (%s)", rbdImageOMapPrefix+imageUUID, err)
return err
}
}
// delete the request name omap key (last, inverse of create order)
err = util.RemoveOMapKey(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
csiVolsDirectory, csiVolNameKeyPrefix+rbdVol.RequestName)
if err != nil {
klog.Errorf("failed removing oMap key %s (%s)", csiVolNameKeyPrefix+rbdVol.RequestName, err)
return err
}
return nil
}
// reserveOMapName creates an omap with passed in oMapNamePrefix and a generated <uuid>.
// It ensures generated omap name does not already exist and if conflicts are detected, a set
// number of retires with newer uuids are attempted before returning an error
func reserveOMapName(monitors, adminID, key, poolName, oMapNamePrefix string) (string, error) {
var iterUUID string
maxAttempts := 5
attempt := 1
for attempt <= maxAttempts {
// generate a uuid for the image name
iterUUID = uuid.NewUUID().String()
err := util.CreateObject(monitors, adminID, key, poolName, oMapNamePrefix+iterUUID)
if err != nil {
if _, ok := err.(util.ErrObjectExists); ok {
attempt++
// try again with a different uuid, for maxAttempts tries
klog.V(4).Infof("uuid (%s) conflict detected, retrying (attempt %d of %d)",
iterUUID, attempt, maxAttempts)
continue
}
return "", err
}
break
}
if attempt > maxAttempts {
return "", errors.New("uuid conflicts exceeds retry threshold")
}
return iterUUID, nil
}
/*
reserveSnap and reserveVol add respective entries to the volumes and snapshots omaps, post
generating a target snapshot or image name for use. Further, these functions create the snapshot or
image name omaps, to store back pointers to the CSI generated request names.
This is performed within the request name lock, to ensure that requests with the same name do not
manipulate the omap entries concurrently.
*/
func reserveSnap(rbdSnap *rbdSnapshot, credentials map[string]string) error {
var vi util.CSIIdentifier
key, err := getKey(rbdSnap.AdminID, credentials)
if err != nil {
return err
}
poolID, err := util.GetPoolID(rbdSnap.Monitors, rbdSnap.AdminID, key,
rbdSnap.Pool)
if err != nil {
return err
}
// Create the snapUUID based omap first, to reserve the same and avoid conflicts
// NOTE: If any service loss occurs post creation of the snap omap, and before
// setting the omap key (rbdSnapCSISnapNameKey) to point back to the snaps omap, the
// snap omap key will leak
snapUUID, err := reserveOMapName(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnapOMapPrefix)
if err != nil {
return err
}
// Create request snapUUID key in csi snaps omap and store the uuid based
// snap name into it
err = util.SetOMapKeyValue(rbdSnap.Monitors, rbdSnap.AdminID, key,
rbdSnap.Pool, csiSnapsDirectory, csiSnapNameKeyPrefix+rbdSnap.RequestName, snapUUID)
if err != nil {
return err
}
defer func() {
if err != nil {
klog.Warningf("reservation failed for volume: %s", rbdSnap.RequestName)
errDefer := unreserveSnap(rbdSnap, credentials)
if errDefer != nil {
klog.Warningf("failed undoing reservation of snapshot: %s (%v)",
rbdSnap.RequestName, errDefer)
}
}
}()
// Create snap name based omap and store CSI request name key and source information
err = util.SetOMapKeyValue(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnapOMapPrefix+snapUUID, rbdSnapCSISnapNameKey, rbdSnap.RequestName)
if err != nil {
return err
}
err = util.SetOMapKeyValue(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
rbdSnapOMapPrefix+snapUUID, rbdSnapSourceImageKey, rbdSnap.RbdImageName)
if err != nil {
return err
}
// generate the volume ID to return to the CO system
vi = util.CSIIdentifier{
PoolID: poolID,
EncodingVersion: volIDVersion,
ClusterID: rbdSnap.ClusterID,
ObjectUUID: snapUUID,
}
rbdSnap.SnapID, err = vi.ComposeCSIID()
if err != nil {
return err
}
rbdSnap.RbdSnapName = rbdSnapNamePrefix + snapUUID
klog.V(4).Infof("Generated Volume ID (%s) and image name (%s) for request name (%s)",
rbdSnap.SnapID, rbdSnap.RbdImageName, rbdSnap.RequestName)
return nil
}
func reserveVol(rbdVol *rbdVolume, credentials map[string]string) error {
var vi util.CSIIdentifier
key, err := getKey(rbdVol.AdminID, credentials)
if err != nil {
return err
}
poolID, err := util.GetPoolID(rbdVol.Monitors, rbdVol.AdminID, key,
rbdVol.Pool)
if err != nil {
return err
}
// Create the imageUUID based omap first, to reserve the same and avoid conflicts
// NOTE: If any service loss occurs post creation of the image omap, and before
// setting the omap key (rbdImageCSIVolNameKey) to point back to the volumes omap,
// the image omap key will leak
imageUUID, err := reserveOMapName(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool, rbdImageOMapPrefix)
if err != nil {
return err
}
// Create request volName key in csi volumes omap and store the uuid based
// image name into it
err = util.SetOMapKeyValue(rbdVol.Monitors, rbdVol.AdminID, key,
rbdVol.Pool, csiVolsDirectory, csiVolNameKeyPrefix+rbdVol.RequestName, imageUUID)
if err != nil {
return err
}
defer func() {
if err != nil {
klog.Warningf("reservation failed for volume: %s", rbdVol.RequestName)
errDefer := unreserveVol(rbdVol, credentials)
if errDefer != nil {
klog.Warningf("failed undoing reservation of volume: %s (%v)",
rbdVol.RequestName, errDefer)
}
}
}()
// Create image name based omap and store CSI request volume name key and data
err = util.SetOMapKeyValue(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
rbdImageOMapPrefix+imageUUID, rbdImageCSIVolNameKey, rbdVol.RequestName)
if err != nil {
return err
}
// generate the volume ID to return to the CO system
vi = util.CSIIdentifier{
PoolID: poolID,
EncodingVersion: volIDVersion,
ClusterID: rbdVol.ClusterID,
ObjectUUID: imageUUID,
}
rbdVol.VolID, err = vi.ComposeCSIID()
if err != nil {
return err
}
rbdVol.RbdImageName = rbdImgNamePrefix + imageUUID
klog.V(4).Infof("Generated Volume ID (%s) and image name (%s) for request name (%s)",
rbdVol.VolID, rbdVol.RbdImageName, rbdVol.RequestName)
return nil
}

View File

@ -164,7 +164,8 @@ func GetOMapValue(monitors, adminID, key, poolName, oMapName, oMapKey string) (s
"-p", poolName,
"getomapval", oMapName, oMapKey, tmpFile.Name())
if err != nil {
// no logs, as attempting to check for key/value is done even on regular call sequences
// no logs, as attempting to check for non-existent key/value is done even on
// regular call sequences
stdoutanderr := strings.Join([]string{string(stdout), string(stderr)}, " ")
if strings.Contains(stdoutanderr, "No such key: "+poolName+"/"+oMapName+"/"+oMapKey) {
return "", ErrKeyNotFound{poolName + "/" + oMapName + "/" + oMapKey, err}
@ -175,6 +176,10 @@ func GetOMapValue(monitors, adminID, key, poolName, oMapName, oMapKey string) (s
return "", ErrKeyNotFound{poolName + "/" + oMapName + "/" + oMapKey, err}
}
// log other errors for troubleshooting assistance
klog.Errorf("failed getting omap value for key (%s) from omap (%s) in pool (%s): (%v)",
oMapKey, oMapName, poolName, err)
return "", fmt.Errorf("error (%v) occured, command output streams is (%s)",
err.Error(), stdoutanderr)
}

View File

@ -45,3 +45,14 @@ type ErrObjectNotFound struct {
func (e ErrObjectNotFound) Error() string {
return e.err.Error()
}
// ErrSnapNameConflict is generated when a requested CSI snap name already exists on RBD but with
// different properties, and hence is in conflict with the passed in CSI volume name
type ErrSnapNameConflict struct {
requestName string
err error
}
func (e ErrSnapNameConflict) Error() string {
return e.err.Error()
}

View File

@ -91,3 +91,24 @@ func ValidateDriverName(driverName string) error {
}
return err
}
// GenerateVolID generates a volume ID based on passed in parameters and version, to be returned
// to the CO system
func GenerateVolID(monitors, id, key, pool, clusterID, objUUID string, volIDVersion uint16) (string, error) {
poolID, err := GetPoolID(monitors, id, key, pool)
if err != nil {
return "", err
}
// generate the volume ID to return to the CO system
vi := CSIIdentifier{
PoolID: poolID,
EncodingVersion: volIDVersion,
ClusterID: clusterID,
ObjectUUID: objUUID,
}
volID, err := vi.ComposeCSIID()
return volID, err
}

391
pkg/util/voljournal.go Normal file
View File

@ -0,0 +1,391 @@
/*
Copyright 2019 The Ceph-CSI Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"fmt"
"strings"
"github.com/pborman/uuid"
"github.com/pkg/errors"
"k8s.io/klog"
)
/*
RADOS omaps usage:
This note details how we preserve idempotent nature of create requests and retain the relationship
between orchestrator (CO) generated names and plugin generated names for volumes and snapshots.
NOTE: volume denotes an rbd image or a CephFS subvolume
The implementation uses Ceph RADOS omaps to preserve the relationship between request name and
generated volume (or snapshot) name. There are 4 types of omaps in use,
- A "csi.volumes.[csi-id]" (or "csi.volumes"+.+CSIInstanceID), (referred to using csiDirectory variable)
- stores keys named using the CO generated names for volume requests (prefixed with csiNameKeyPrefix)
- keys are named "csi.volume."+[CO generated VolName]
- Key value contains the volume uuid that is created, for the CO provided name
- A "csi.snaps.[csi-id]" (or "csi.snaps"+.+CSIInstanceID), (referred to using csiDirectory variable)
- stores keys named using the CO generated names for snapshot requests (prefixed with csiNameKeyPrefix)
- keys are named "csi.snap."+[CO generated SnapName]
- Key value contains the snapshot uuid that is created, for the CO provided name
- A per volume omap named "csi.volume."+[volume uuid], (referred to as CephUUIDDirectory)
- stores a single key named "csi.volname", that has the value of the CO generated VolName that
this volume refers to (referred to using csiNameKey value)
- A per snapshot omap named "rbd.csi.snap."+[RBD snapshot uuid], (referred to as CephUUIDDirectory)
- stores a key named "csi.snapname", that has the value of the CO generated SnapName that this
snapshot refers to (referred to using csiNameKey value)
- also stores another key named "csi.source", that has the value of the volume name that is the
source of the snapshot (referred to using cephSnapSourceKey value)
Creation of omaps:
When a volume create request is received (or a snapshot create, the snapshot is not detailed in this
comment further as the process is similar),
- The csiDirectory is consulted to find if there is already a key with the CO VolName, and if present,
it is used to read its references to reach the UUID that backs this VolName, to check if the
UUID based volume can satisfy the requirements for the request
- If during the process of checking the same, it is found that some linking information is stale
or missing, the corresponding keys upto the key in the csiDirectory is cleaned up, to start afresh
- If the key with the CO VolName is not found, or was cleaned up, the request is treated as a
new create request, and an CephUUIDDirectory is created first with a generated uuid, this ensures
that we do not use a uuid that is already in use
- Next, a key with the VolName is created in the csiDirectory, and its value is updated to store the
generated uuid
- This is followed by updating the CephUUIDDirectory with the VolName in the csiNameKey
- Finally, the volume is created (or promoted from a snapshot, if content source was provided),
using the uuid and a corresponding name prefix (namingPrefix) as the volume name
The entire operation is locked based on VolName hash, to ensure there is only ever a single entity
modifying the related omaps for a given VolName.
This ensures idempotent nature of creates, as the same CO generated VolName would attempt to use
the same volume uuid to serve the request, as the relations are saved in the respective omaps.
Deletion of omaps:
Delete requests would not contain the VolName, hence deletion uses the volume ID, which is encoded
with the volume uuid in it, to find the volume and the CephUUIDDirectory. The CephUUIDDirectory is
read to get the VolName that this image points to. This VolName can be further used to read and
delete the key from the csiDirectory.
As we trace back and find the VolName, we also take a hash based lock on the VolName before
proceeding with deleting the volume and the related omap entries, to ensure there is only ever a
single entity modifying the related omaps for a given VolName.
*/
type CSIJournal struct {
// csiDirectory is the name of the CSI volumes object map that contains CSI volume-name (or
// snapshot name) based keys
csiDirectory string
// CSI volume-name keyname prefix, for key in csiDirectory, suffix is the CSI passed volume name
csiNameKeyPrefix string
// Per Ceph volume (RBD/FS-subvolume) object map name prefix, suffix is the generated volume uuid
cephUUIDDirectoryPrefix string
// CSI volume-name key in per Ceph volume object map, containing CSI volume-name for which the
// Ceph volume was created
csiNameKey string
// source volume name key in per Ceph snapshot object map, containing Ceph source volume uuid
// for which the snapshot was created
cephSnapSourceKey string
// volume name prefix for naming on Ceph rbd or FS, suffix is a uuid generated per volume
namingPrefix string
}
// CSIVolumeJournal returns an instance of volume keys
func NewCSIVolumeJournal() *CSIJournal {
return &CSIJournal{
csiDirectory: "csi.volumes",
csiNameKeyPrefix: "csi.volume.",
cephUUIDDirectoryPrefix: "csi.volume.",
csiNameKey: "csi.volname",
namingPrefix: "csi-vol-",
cephSnapSourceKey: "",
}
}
// CSISnapshotSnapshot returns an instance of snapshot keys
func NewCSISnapshotJournal() *CSIJournal {
return &CSIJournal{
csiDirectory: "csi.snaps",
csiNameKeyPrefix: "csi.snap.",
cephUUIDDirectoryPrefix: "csi.snap.",
csiNameKey: "csi.snapname",
namingPrefix: "csi-snap-",
cephSnapSourceKey: "csi.source",
}
}
// NamingPrefix returns the value of naming prefix from the journal keys
func (cj *CSIJournal) NamingPrefix() string {
return cj.namingPrefix
}
// SetCSIDirectorySuffix sets the given suffix for the csiDirectory omap
func (cj *CSIJournal) SetCSIDirectorySuffix(suffix string) {
cj.csiDirectory = cj.csiDirectory + "." + suffix
}
/*
CheckReservation checks if given request name contains a valid reservation
- If there is a valid reservation, then the corresponding UUID for the volume/snapshot is returned
- If there is a reservation that is stale (or not fully cleaned up), it is garbage collected using
the UndoReservation call, as appropriate
- If a snapshot is being checked, then its source is matched to the parentName that is provided
NOTE: As the function manipulates omaps, it should be called with a lock against the request name
held, to prevent parallel operations from modifying the state of the omaps for this request name.
Return values:
- string: Contains the UUID that was reserved for the passed in reqName, empty if
there was no reservation found
- error: non-nil in case of any errors
*/
func (cj *CSIJournal) CheckReservation(monitors, id, key, pool, reqName, parentName string) (string, error) {
var snapSource bool
if parentName != "" {
if cj.cephSnapSourceKey == "" {
err := errors.New("invalid request, cephSnapSourceKey is nil")
return "", err
}
snapSource = true
}
// check if request name is already part of the directory omap
objUUID, err := GetOMapValue(monitors, id, key, pool, cj.csiDirectory,
cj.csiNameKeyPrefix+reqName)
if err != nil {
// error should specifically be not found, for volume to be absent, any other error
// is not conclusive, and we should not proceed
if _, ok := err.(ErrKeyNotFound); ok {
return "", nil
}
return "", err
}
savedReqName, savedReqParentName, err := cj.GetObjectUUIDData(monitors, id, key, pool,
objUUID, snapSource)
if err != nil {
// error should specifically be not found, for image to be absent, any other error
// is not conclusive, and we should not proceed
if _, ok := err.(ErrKeyNotFound); ok {
err = cj.UndoReservation(monitors, id, key, pool, cj.namingPrefix+objUUID, reqName)
}
return "", err
}
// check if UUID key points back to the request name
if savedReqName != reqName {
// NOTE: This should never be possible, hence no cleanup, but log error
// and return, as cleanup may need to occur manually!
return "", fmt.Errorf("internal state inconsistent, omap names mismatch,"+
" request name (%s) volume UUID (%s) volume omap name (%s)",
reqName, objUUID, savedReqName)
}
if snapSource {
// check if source UUID key points back to the parent volume passed in
if savedReqParentName != parentName {
// NOTE: This can happen if there is a snapname conflict, and we already have a snapshot
// with the same name pointing to a different UUID as the source
err = fmt.Errorf("snapname points to different volume, request name (%s)"+
" source name (%s) saved source name (%s)",
reqName, parentName, savedReqParentName)
return "", ErrSnapNameConflict{reqName, err}
}
}
return objUUID, nil
}
/*
UndoReservation undoes a reservation, in the reverse order of ReserveName
- The UUID directory is cleaned up before the VolName key in the csiDirectory is cleaned up
NOTE: Ensure that the Ceph volume (image or FS subvolume) backing the reservation is cleaned up
prior to cleaning up the reservation
NOTE: As the function manipulates omaps, it should be called with a lock against the request name
held, to prevent parallel operations from modifying the state of the omaps for this request name.
*/
func (cj *CSIJournal) UndoReservation(monitors, id, key, pool, volName, reqName string) error {
// delete volume UUID omap (first, inverse of create order)
// TODO: Check cases where volName can be empty, and we need to just cleanup the reqName
imageUUID := strings.TrimPrefix(volName, cj.namingPrefix)
err := RemoveObject(monitors, id, key, pool, cj.cephUUIDDirectoryPrefix+imageUUID)
if err != nil {
if _, ok := err.(ErrObjectNotFound); !ok {
klog.Errorf("failed removing oMap %s (%s)", cj.cephUUIDDirectoryPrefix+imageUUID, err)
return err
}
}
// delete the request name key (last, inverse of create order)
err = RemoveOMapKey(monitors, id, key, pool, cj.csiDirectory,
cj.csiNameKeyPrefix+reqName)
if err != nil {
klog.Errorf("failed removing oMap key %s (%s)", cj.csiNameKeyPrefix+reqName, err)
return err
}
return err
}
// reserveOMapName creates an omap with passed in oMapNamePrefix and a generated <uuid>.
// It ensures generated omap name does not already exist and if conflicts are detected, a set
// number of retires with newer uuids are attempted before returning an error
func reserveOMapName(monitors, id, key, pool, oMapNamePrefix string) (string, error) {
var iterUUID string
maxAttempts := 5
attempt := 1
for attempt <= maxAttempts {
// generate a uuid for the image name
iterUUID = uuid.NewUUID().String()
err := CreateObject(monitors, id, key, pool, oMapNamePrefix+iterUUID)
if err != nil {
if _, ok := err.(ErrObjectExists); ok {
attempt++
// try again with a different uuid, for maxAttempts tries
klog.V(4).Infof("uuid (%s) conflict detected, retrying (attempt %d of %d)",
iterUUID, attempt, maxAttempts)
continue
}
return "", err
}
return iterUUID, nil
}
return "", errors.New("uuid conflicts exceeds retry threshold")
}
/*
ReserveName adds respective entries to the csiDirectory omaps, post generating a target
UUIDDirectory for use. Further, these functions update the UUIDDirectory omaps, to store back
pointers to the CSI generated request names.
NOTE: As the function manipulates omaps, it should be called with a lock against the request name
held, to prevent parallel operations from modifying the state of the omaps for this request name.
Return values:
- string: Contains the UUID that was reserved for the passed in reqName
- error: non-nil in case of any errors
*/
func (cj *CSIJournal) ReserveName(monitors, id, key, pool, reqName, parentName string) (string, error) {
var snapSource bool
if parentName != "" {
if cj.cephSnapSourceKey == "" {
err := errors.New("invalid request, cephSnapSourceKey is nil")
return "", err
}
snapSource = true
}
// Create the UUID based omap first, to reserve the same and avoid conflicts
// NOTE: If any service loss occurs post creation of the UUID directory, and before
// setting the request name key (csiNameKey) to point back to the UUID directory, the
// UUID directory key will be leaked
volUUID, err := reserveOMapName(monitors, id, key, pool, cj.cephUUIDDirectoryPrefix)
if err != nil {
return "", err
}
// Create request name (csiNameKey) key in csiDirectory and store the UUId based
// volume name into it
err = SetOMapKeyValue(monitors, id, key, pool, cj.csiDirectory, cj.csiNameKeyPrefix+reqName,
volUUID)
if err != nil {
return "", err
}
defer func() {
if err != nil {
klog.Warningf("reservation failed for volume: %s", reqName)
errDefer := cj.UndoReservation(monitors, id, key, pool, cj.namingPrefix+volUUID,
reqName)
if errDefer != nil {
klog.Warningf("failed undoing reservation of volume: %s (%v)", reqName, errDefer)
}
}
}()
// Update UUID directory to store CSI request name
err = SetOMapKeyValue(monitors, id, key, pool, cj.cephUUIDDirectoryPrefix+volUUID,
cj.csiNameKey, reqName)
if err != nil {
return "", err
}
if snapSource {
// Update UUID directory to store source volume UUID in case of snapshots
err = SetOMapKeyValue(monitors, id, key, pool, cj.cephUUIDDirectoryPrefix+volUUID,
cj.cephSnapSourceKey, parentName)
if err != nil {
return "", err
}
}
return volUUID, nil
}
/*
GetObjectUUIDData fetches all keys from a UUID directory
Return values:
- string: Contains the request name for the passed in UUID
- string: Contains the parent image name for the passed in UUID, if it is a snapshot
- error: non-nil in case of any errors
*/
func (cj *CSIJournal) GetObjectUUIDData(monitors, id, key, pool, objectUUID string, snapSource bool) (string, string, error) {
var sourceName string
if snapSource && cj.cephSnapSourceKey == "" {
err := errors.New("invalid request, cephSnapSourceKey is nil")
return "", "", err
}
// TODO: fetch all omap vals in one call, than make multiple listomapvals
requestName, err := GetOMapValue(monitors, id, key, pool,
cj.cephUUIDDirectoryPrefix+objectUUID, cj.csiNameKey)
if err != nil {
return "", "", err
}
if snapSource {
sourceName, err = GetOMapValue(monitors, id, key, pool,
cj.cephUUIDDirectoryPrefix+objectUUID, cj.cephSnapSourceKey)
if err != nil {
return "", "", err
}
}
return requestName, sourceName, nil
}