Move from CPU-count-based lock buckets to granular per-identifier locks

As detailed in issue #279, the current lock scheme uses a number of
hash buckets equal to the CPU count. This causes a lot of contention
when parallel requests are made to the CSI plugin. To reduce
lock contention, this commit introduces granular locks per
identifier.

The commit also changes the timeout for gRPC requests to Create
and Delete volumes, as the current timeout is 10s (the Kubernetes
documentation says 15s, but the code defaults to 10s). A virtual
setup sometimes takes about 12-15s to complete a request, which leads
to unwanted retries of the same request; hence the timeout is
increased so that operations can complete with minimal retries.

Test results for creating and deleting PVCs before and after these
changes are as follows:

Before:
Default master code + sidecar provisioner --timeout option set
to 30 seconds

20 PVCs
Creation: 3 runs, 396/391/400 seconds
Deletion: 3 runs, 218/271/118 seconds
  - On one occasion a run stalled for more than 8 minutes and was cancelled

After:
Current commit + sidecar provisioner --timeout option set to 30 sec
20 PVCs
Creation: 3 runs, 42/59/65 seconds
Deletion: 3 runs, 32/32/31 seconds

Fixes: #279
Signed-off-by: ShyamsundarR <srangana@redhat.com>
This commit is contained in:
ShyamsundarR
2019-06-22 12:43:28 -04:00
committed by mergify[bot]
parent bc39c523b7
commit c4a3675cec
13 changed files with 173 additions and 104 deletions

View File

@ -62,18 +62,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
return nil, status.Error(codes.Internal, err.Error())
}
targetPathMutex.LockKey(targetPath)
defer func() {
if err = targetPathMutex.UnlockKey(targetPath); err != nil {
klog.Warningf("failed to unlock mutex targetpath:%s %v", targetPath, err)
}
}()
disableInUseChecks := false
idLk := targetPathLocker.Lock(targetPath)
defer targetPathLocker.Unlock(idLk, targetPath)
volName, err := ns.getVolumeName(req)
if err != nil {
return nil, err
}
disableInUseChecks := false
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check if that target path exists properly
@ -100,7 +92,13 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
if err != nil {
return nil, err
}
volName, err := ns.getVolumeName(req)
if err != nil {
return nil, err
}
volOptions.RbdImageName = volName
// Mapping RBD image
devicePath, err := attachRBDImage(volOptions, cr)
if err != nil {
@ -206,13 +204,8 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
return nil, status.Error(codes.InvalidArgument, "empty volume ID in request")
}
targetPathMutex.LockKey(targetPath)
defer func() {
if err := targetPathMutex.UnlockKey(targetPath); err != nil {
klog.Warningf("failed to unlock mutex targetpath:%s %v", targetPath, err)
}
}()
idLk := targetPathLocker.Lock(targetPath)
defer targetPathLocker.Unlock(idLk, targetPath)
notMnt, err := mount.IsNotMountPoint(ns.mounter, targetPath)
if err != nil {