mirror of
https://github.com/ceph/ceph-csi.git
synced 2024-12-18 02:50:30 +00:00
util: added logs for slow gRPC calls
This commit adds a gRPC middleware that logs calls that keep running after their deadline. Adds --logslowopinterval cmdline argument to pass the log rate. Signed-off-by: Robert Vasek <robert.vasek@clyso.com>
This commit is contained in:
parent
56d08e1b4d
commit
7a727c2a43
@ -120,6 +120,11 @@ func init() {
|
|||||||
"path of prometheus endpoint where metrics will be available")
|
"path of prometheus endpoint where metrics will be available")
|
||||||
flag.DurationVar(&conf.PollTime, "polltime", time.Second*pollTime, "time interval in seconds between each poll")
|
flag.DurationVar(&conf.PollTime, "polltime", time.Second*pollTime, "time interval in seconds between each poll")
|
||||||
flag.DurationVar(&conf.PoolTimeout, "timeout", time.Second*probeTimeout, "probe timeout in seconds")
|
flag.DurationVar(&conf.PoolTimeout, "timeout", time.Second*probeTimeout, "probe timeout in seconds")
|
||||||
|
flag.DurationVar(
|
||||||
|
&conf.LogSlowOpInterval,
|
||||||
|
"logslowopinterval",
|
||||||
|
time.Second*30,
|
||||||
|
"how often to inform about slow gRPC calls")
|
||||||
|
|
||||||
flag.UintVar(
|
flag.UintVar(
|
||||||
&conf.RbdHardMaxCloneDepth,
|
&conf.RbdHardMaxCloneDepth,
|
||||||
|
@ -199,7 +199,9 @@ func (fs *Driver) Run(conf *util.Config) {
|
|||||||
NS: fs.ns,
|
NS: fs.ns,
|
||||||
GS: fs.cs,
|
GS: fs.cs,
|
||||||
}
|
}
|
||||||
server.Start(conf.Endpoint, srv)
|
server.Start(conf.Endpoint, srv, csicommon.MiddlewareServerOptionConfig{
|
||||||
|
LogSlowOpInterval: conf.LogSlowOpInterval,
|
||||||
|
})
|
||||||
|
|
||||||
if conf.EnableProfiling {
|
if conf.EnableProfiling {
|
||||||
go util.StartMetricsServer(conf)
|
go util.StartMetricsServer(conf)
|
||||||
@ -230,7 +232,9 @@ func (fs *Driver) setupCSIAddonsServer(conf *util.Config) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// start the server, this does not block, it runs a new go-routine
|
// start the server, this does not block, it runs a new go-routine
|
||||||
err = fs.cas.Start()
|
err = fs.cas.Start(csicommon.MiddlewareServerOptionConfig{
|
||||||
|
LogSlowOpInterval: conf.LogSlowOpInterval,
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to start CSI-Addons server: %w", err)
|
return fmt.Errorf("failed to start CSI-Addons server: %w", err)
|
||||||
}
|
}
|
||||||
|
@ -85,9 +85,9 @@ func (cas *CSIAddonsServer) RegisterService(svc CSIAddonsService) {
|
|||||||
// Start creates the internal gRPC server, and registers the CSIAddonsServices.
|
// Start creates the internal gRPC server, and registers the CSIAddonsServices.
|
||||||
// The internal gRPC server is started in it's own go-routine when no error is
|
// The internal gRPC server is started in it's own go-routine when no error is
|
||||||
// returned.
|
// returned.
|
||||||
func (cas *CSIAddonsServer) Start() error {
|
func (cas *CSIAddonsServer) Start(middlewareConfig csicommon.MiddlewareServerOptionConfig) error {
|
||||||
// create the gRPC server and register services
|
// create the gRPC server and register services
|
||||||
cas.server = grpc.NewServer(csicommon.NewMiddlewareServerOption())
|
cas.server = grpc.NewServer(csicommon.NewMiddlewareServerOption(middlewareConfig))
|
||||||
|
|
||||||
for _, svc := range cas.services {
|
for _, svc := range cas.services {
|
||||||
svc.RegisterService(cas.server)
|
svc.RegisterService(cas.server)
|
||||||
|
@ -31,7 +31,7 @@ import (
|
|||||||
// NonBlockingGRPCServer defines Non blocking GRPC server interfaces.
|
// NonBlockingGRPCServer defines Non blocking GRPC server interfaces.
|
||||||
type NonBlockingGRPCServer interface {
|
type NonBlockingGRPCServer interface {
|
||||||
// Start services at the endpoint
|
// Start services at the endpoint
|
||||||
Start(endpoint string, srv Servers)
|
Start(endpoint string, srv Servers, middlewareConfig MiddlewareServerOptionConfig)
|
||||||
// Waits for the service to stop
|
// Waits for the service to stop
|
||||||
Wait()
|
Wait()
|
||||||
// Stops the service gracefully
|
// Stops the service gracefully
|
||||||
@ -60,9 +60,13 @@ type nonBlockingGRPCServer struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Start start service on endpoint.
|
// Start start service on endpoint.
|
||||||
func (s *nonBlockingGRPCServer) Start(endpoint string, srv Servers) {
|
func (s *nonBlockingGRPCServer) Start(
|
||||||
|
endpoint string,
|
||||||
|
srv Servers,
|
||||||
|
middlewareConfig MiddlewareServerOptionConfig,
|
||||||
|
) {
|
||||||
s.wg.Add(1)
|
s.wg.Add(1)
|
||||||
go s.serve(endpoint, srv)
|
go s.serve(endpoint, srv, middlewareConfig)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait blocks until the WaitGroup counter.
|
// Wait blocks until the WaitGroup counter.
|
||||||
@ -80,7 +84,11 @@ func (s *nonBlockingGRPCServer) ForceStop() {
|
|||||||
s.server.Stop()
|
s.server.Stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *nonBlockingGRPCServer) serve(endpoint string, srv Servers) {
|
func (s *nonBlockingGRPCServer) serve(
|
||||||
|
endpoint string,
|
||||||
|
srv Servers,
|
||||||
|
middlewareConfig MiddlewareServerOptionConfig,
|
||||||
|
) {
|
||||||
proto, addr, err := parseEndpoint(endpoint)
|
proto, addr, err := parseEndpoint(endpoint)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.Fatal(err.Error())
|
klog.Fatal(err.Error())
|
||||||
@ -98,7 +106,7 @@ func (s *nonBlockingGRPCServer) serve(endpoint string, srv Servers) {
|
|||||||
klog.Fatalf("Failed to listen: %v", err)
|
klog.Fatalf("Failed to listen: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
server := grpc.NewServer(NewMiddlewareServerOption())
|
server := grpc.NewServer(NewMiddlewareServerOption(middlewareConfig))
|
||||||
s.server = server
|
s.server = server
|
||||||
|
|
||||||
if srv.IS != nil {
|
if srv.IS != nil {
|
||||||
|
@ -23,6 +23,7 @@ import (
|
|||||||
"runtime/debug"
|
"runtime/debug"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/ceph/ceph-csi/internal/util"
|
"github.com/ceph/ceph-csi/internal/util"
|
||||||
"github.com/ceph/ceph-csi/internal/util/log"
|
"github.com/ceph/ceph-csi/internal/util/log"
|
||||||
@ -108,10 +109,35 @@ func NewGroupControllerServiceCapability(ctrlCap csi.GroupControllerServiceCapab
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MiddlewareServerOptionConfig contains configuration parameters
|
||||||
|
// that are passed to the respective middleware interceptors that
|
||||||
|
// are instantiated when starting gRPC servers.
|
||||||
|
type MiddlewareServerOptionConfig struct {
|
||||||
|
LogSlowOpInterval time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
// NewMiddlewareServerOption creates a new grpc.ServerOption that configures a
|
// NewMiddlewareServerOption creates a new grpc.ServerOption that configures a
|
||||||
// common format for log messages and other gRPC related handlers.
|
// common format for log messages and other gRPC related handlers.
|
||||||
func NewMiddlewareServerOption() grpc.ServerOption {
|
func NewMiddlewareServerOption(config MiddlewareServerOptionConfig) grpc.ServerOption {
|
||||||
middleWare := []grpc.UnaryServerInterceptor{contextIDInjector, logGRPC, panicHandler}
|
middleWare := []grpc.UnaryServerInterceptor{
|
||||||
|
contextIDInjector,
|
||||||
|
logGRPC,
|
||||||
|
}
|
||||||
|
|
||||||
|
if config.LogSlowOpInterval > 0 {
|
||||||
|
middleWare = append(middleWare, func(
|
||||||
|
ctx context.Context,
|
||||||
|
req interface{},
|
||||||
|
info *grpc.UnaryServerInfo,
|
||||||
|
handler grpc.UnaryHandler,
|
||||||
|
) (interface{}, error) {
|
||||||
|
return logSlowGRPC(
|
||||||
|
config.LogSlowOpInterval, ctx, req, info, handler,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
middleWare = append(middleWare, panicHandler)
|
||||||
|
|
||||||
return grpc.UnaryInterceptor(grpc_middleware.ChainUnaryServer(middleWare...))
|
return grpc.UnaryInterceptor(grpc_middleware.ChainUnaryServer(middleWare...))
|
||||||
}
|
}
|
||||||
@ -250,6 +276,53 @@ func logGRPC(
|
|||||||
return resp, err
|
return resp, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func logSlowGRPC(
|
||||||
|
logInterval time.Duration,
|
||||||
|
ctx context.Context,
|
||||||
|
req interface{},
|
||||||
|
info *grpc.UnaryServerInfo,
|
||||||
|
handler grpc.UnaryHandler,
|
||||||
|
) (interface{}, error) {
|
||||||
|
handlerFinished := make(chan struct{})
|
||||||
|
callStartTime := time.Now()
|
||||||
|
|
||||||
|
// Ticks at a logInterval rate and logs a slow-call message until handler finishes.
|
||||||
|
// This is called once the handler outlives its context, see below.
|
||||||
|
doLogSlowGRPC := func() {
|
||||||
|
ticker := time.NewTicker(logInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case t := <-ticker.C:
|
||||||
|
timePassed := t.Sub(callStartTime).Truncate(time.Second)
|
||||||
|
log.ExtendedLog(ctx,
|
||||||
|
"Slow GRPC call %s (%s)", info.FullMethod, timePassed)
|
||||||
|
log.TraceLog(ctx,
|
||||||
|
"Slow GRPC request: %s", protosanitizer.StripSecrets(req))
|
||||||
|
case <-handlerFinished:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
// The call (most likely) outlived its context. Start logging slow messages.
|
||||||
|
doLogSlowGRPC()
|
||||||
|
case <-handlerFinished:
|
||||||
|
// The call finished, exit.
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
resp, err := handler(ctx, req)
|
||||||
|
close(handlerFinished)
|
||||||
|
|
||||||
|
return resp, err
|
||||||
|
}
|
||||||
|
|
||||||
//nolint:nonamedreturns // named return used to send recovered panic error.
|
//nolint:nonamedreturns // named return used to send recovered panic error.
|
||||||
func panicHandler(
|
func panicHandler(
|
||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
|
@ -77,7 +77,10 @@ func (fs *Driver) Run(conf *util.Config) {
|
|||||||
srv.CS = controller.NewControllerServer(cd)
|
srv.CS = controller.NewControllerServer(cd)
|
||||||
}
|
}
|
||||||
|
|
||||||
server.Start(conf.Endpoint, srv)
|
server.Start(conf.Endpoint, srv, csicommon.MiddlewareServerOptionConfig{
|
||||||
|
LogSlowOpInterval: conf.LogSlowOpInterval,
|
||||||
|
})
|
||||||
|
|
||||||
if conf.EnableProfiling {
|
if conf.EnableProfiling {
|
||||||
go util.StartMetricsServer(conf)
|
go util.StartMetricsServer(conf)
|
||||||
log.DebugLogMsg("Registering profiling handler")
|
log.DebugLogMsg("Registering profiling handler")
|
||||||
|
@ -179,7 +179,9 @@ func (r *Driver) Run(conf *util.Config) {
|
|||||||
CS: r.cs,
|
CS: r.cs,
|
||||||
NS: r.ns,
|
NS: r.ns,
|
||||||
}
|
}
|
||||||
s.Start(conf.Endpoint, srv)
|
s.Start(conf.Endpoint, srv, csicommon.MiddlewareServerOptionConfig{
|
||||||
|
LogSlowOpInterval: conf.LogSlowOpInterval,
|
||||||
|
})
|
||||||
|
|
||||||
r.startProfiling(conf)
|
r.startProfiling(conf)
|
||||||
|
|
||||||
@ -233,7 +235,9 @@ func (r *Driver) setupCSIAddonsServer(conf *util.Config) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// start the server, this does not block, it runs a new go-routine
|
// start the server, this does not block, it runs a new go-routine
|
||||||
err = r.cas.Start()
|
err = r.cas.Start(csicommon.MiddlewareServerOptionConfig{
|
||||||
|
LogSlowOpInterval: conf.LogSlowOpInterval,
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to start CSI-Addons server: %w", err)
|
return fmt.Errorf("failed to start CSI-Addons server: %w", err)
|
||||||
}
|
}
|
||||||
|
@ -131,6 +131,9 @@ type Config struct {
|
|||||||
MetricsPort int // TCP port for liveness/grpc metrics requests
|
MetricsPort int // TCP port for liveness/grpc metrics requests
|
||||||
PollTime time.Duration // time interval in seconds between each poll
|
PollTime time.Duration // time interval in seconds between each poll
|
||||||
PoolTimeout time.Duration // probe timeout in seconds
|
PoolTimeout time.Duration // probe timeout in seconds
|
||||||
|
// Log interval for slow GRPC calls. Calls that outlive their context deadline
|
||||||
|
// are considered slow.
|
||||||
|
LogSlowOpInterval time.Duration
|
||||||
|
|
||||||
EnableProfiling bool // flag to enable profiling
|
EnableProfiling bool // flag to enable profiling
|
||||||
IsControllerServer bool // if set to true start provisioner server
|
IsControllerServer bool // if set to true start provisioner server
|
||||||
|
Loading…
Reference in New Issue
Block a user