build: move e2e dependencies into e2e/go.mod

Several packages are only used while running the e2e suite. These
packages are less important to update, as they cannot influence the
final executable that is part of the Ceph-CSI container-image.

By moving these dependencies out of the main Ceph-CSI go.mod, it is
easier to identify if a reported CVE affects Ceph-CSI, or only the
testing (like most of the Kubernetes CVEs).

Signed-off-by: Niels de Vos <ndevos@ibm.com>
This commit is contained in:
Niels de Vos
2025-03-04 08:57:28 +01:00
committed by mergify[bot]
parent 15da101b1b
commit bec6090996
8047 changed files with 1407827 additions and 3453 deletions

View File

@ -0,0 +1,564 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package client
import (
"context"
"errors"
"fmt"
"io"
"math/rand"
"net"
"sync"
"sync/atomic"
"time"
"google.golang.org/grpc"
"k8s.io/klog/v2"
"sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client/metrics"
commonmetrics "sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/common/metrics"
"sigs.k8s.io/apiserver-network-proxy/konnectivity-client/proto/client"
)
// Tunnel provides ability to dial a connection through a tunnel.
type Tunnel interface {
// DialContext connects to the address on the named network, similar to
// what net.Dial does. The only supported protocol is tcp.
DialContext(requestCtx context.Context, protocol, address string) (net.Conn, error)
// Done returns a channel that is closed when the tunnel is no longer serving any connections,
// and can no longer be used.
Done() <-chan struct{}
}
// dialResult is the outcome of a dial attempt. It is sent by serve() over a
// pendingDial's result channel to the goroutine waiting in dialContext().
type dialResult struct {
// err is non-nil when the dial failed.
err *dialFailure
// connid is the connection ID taken from the DIAL_RSP packet.
connid int64
}
// pendingDial holds the channels serve() uses to hand a dial result back to
// the dialContext() call that is waiting for it.
type pendingDial struct {
// resultCh is the channel to send the dial result to
resultCh chan<- dialResult
// cancelCh is the channel closed when resultCh no longer has a receiver
cancelCh <-chan struct{}
}
// pendingDialManager is a mutex-guarded map of in-flight dials keyed by dial ID.
//
// TODO: Replace with a generic implementation once it is safe to assume the client is built with go1.18+
type pendingDialManager struct {
	pendingDials map[int64]pendingDial
	mutex        sync.RWMutex
}

// add stores pd under dialID, replacing any entry already present.
func (m *pendingDialManager) add(dialID int64, pd pendingDial) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.pendingDials[dialID] = pd
}

// remove drops the entry for dialID; it is a no-op when there is none.
func (m *pendingDialManager) remove(dialID int64) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	delete(m.pendingDials, dialID)
}

// get looks up the pending dial for dialID. The boolean reports whether an
// entry was found; when it is false the returned pendingDial is the zero value.
func (m *pendingDialManager) get(dialID int64) (pendingDial, bool) {
	m.mutex.RLock()
	defer m.mutex.RUnlock()

	if found, present := m.pendingDials[dialID]; present {
		return found, true
	}
	return pendingDial{}, false
}
// connectionManager is a mutex-guarded map of established connections keyed
// by connection ID.
//
// TODO: Replace with a generic implementation once it is safe to assume the client is built with go1.18+
type connectionManager struct {
	conns map[int64]*conn
	mutex sync.RWMutex
}

// add stores c under connID, replacing any entry already present.
func (cm *connectionManager) add(connID int64, c *conn) {
	cm.mutex.Lock()
	defer cm.mutex.Unlock()

	cm.conns[connID] = c
}

// remove drops the entry for connID; it is a no-op when there is none.
func (cm *connectionManager) remove(connID int64) {
	cm.mutex.Lock()
	defer cm.mutex.Unlock()

	delete(cm.conns, connID)
}

// get looks up the connection for connID. The boolean reports whether an
// entry was found.
func (cm *connectionManager) get(connID int64) (*conn, bool) {
	cm.mutex.RLock()
	defer cm.mutex.RUnlock()

	if found, present := cm.conns[connID]; present {
		return found, true
	}
	return nil, false
}

// closeAll closes the read channel of every connection still in the map.
// The entries themselves are left in place, so calling it a second time
// would panic on the already-closed channels.
func (cm *connectionManager) closeAll() {
	cm.mutex.Lock()
	defer cm.mutex.Unlock()

	for connID := range cm.conns {
		close(cm.conns[connID].readCh)
	}
}
// grpcTunnel implements Tunnel
type grpcTunnel struct {
// stream is the ProxyService stream that packets are sent and received on.
stream client.ProxyService_ProxyClient
// sendLock serializes calls to stream.Send (see Send).
sendLock sync.Mutex
// recvLock serializes calls to stream.Recv (see Recv).
recvLock sync.Mutex
// grpcConn is the underlying connection; it is closed when the tunnel shuts down.
grpcConn clientConn
// pendingDial tracks dials that are still waiting for a DIAL_RSP or DIAL_CLS.
pendingDial pendingDialManager
// conns tracks established connections by connection ID.
conns connectionManager
// The tunnel will be closed if the caller fails to read via conn.Read()
// more than readTimeoutSeconds after a packet has been received.
readTimeoutSeconds int
// The done channel is closed after the tunnel has cleaned up all connections and is no longer
// serving.
done chan struct{}
// started is an atomic bool represented as a 0 or 1, and set to true when a single-use tunnel has been started (dialed).
// started should only be accessed through atomic methods.
// TODO: switch this to an atomic.Bool once the client is exclusively built with go1.19+
started uint32
// closing is an atomic bool represented as a 0 or 1, and set to true when the tunnel is being closed.
// closing should only be accessed through atomic methods.
// TODO: switch this to an atomic.Bool once the client is exclusively built with go1.19+
closing uint32
// Stores the current metrics.ClientConnectionStatus
prevStatus atomic.Value
}
// clientConn is the part of a gRPC client connection the tunnel needs: the
// ability to close it.
type clientConn interface {
	Close() error
}

// Compile-time check that *grpc.ClientConn satisfies clientConn.
var _ clientConn = (*grpc.ClientConn)(nil)
var (
// Metrics exposes the client metrics (an alias of metrics.Metrics) so that
// the client application can register them.
Metrics = metrics.Metrics
)
// CreateSingleUseGrpcTunnel creates a Tunnel to dial to a remote server through a
// gRPC based proxy service.
// Currently, a single tunnel supports a single connection, and the tunnel is
// closed when the connection is terminated.
// The DialContext() method of the returned tunnel should only be called once.
//
// Tunnel creation itself is performed with context.TODO(); tunnelCtx only
// bounds the lifetime of the tunnel.
//
// Deprecated: use CreateSingleUseGrpcTunnelWithContext (deprecated since 2022-06-07).
func CreateSingleUseGrpcTunnel(tunnelCtx context.Context, address string, opts ...grpc.DialOption) (Tunnel, error) {
	createCtx := context.TODO()
	return CreateSingleUseGrpcTunnelWithContext(createCtx, tunnelCtx, address, opts...)
}
// CreateSingleUseGrpcTunnelWithContext creates a Tunnel to dial to a remote server through a
// gRPC based proxy service.
// Currently, a single tunnel supports a single connection.
// The tunnel is normally closed when the connection is terminated.
// If createCtx is cancelled before tunnel creation, an error will be returned.
// If tunnelCtx is cancelled while the tunnel is still in use, the tunnel (and any in flight connections) will be closed.
// The DialContext() method of the returned tunnel should only be called once.
func CreateSingleUseGrpcTunnelWithContext(createCtx, tunnelCtx context.Context, address string, opts ...grpc.DialOption) (Tunnel, error) {
	grpcConn, dialErr := grpc.DialContext(createCtx, address, opts...)
	if dialErr != nil {
		return nil, dialErr
	}

	stream, streamErr := client.NewProxyServiceClient(grpcConn).Proxy(tunnelCtx)
	if streamErr != nil {
		// The connection is useless without a stream; release it.
		grpcConn.Close()
		return nil, streamErr
	}

	tunnel := newUnstartedTunnel(stream, grpcConn)

	// The receive loop runs until the tunnel is torn down.
	go tunnel.serve(tunnelCtx)

	return tunnel, nil
}
// newUnstartedTunnel builds a grpcTunnel around stream and c without starting
// its receive loop. The tunnel starts in the "created" connection status and
// the client_connections gauge for that status is incremented.
func newUnstartedTunnel(stream client.ProxyService_ProxyClient, c clientConn) *grpcTunnel {
	initial := metrics.ClientConnectionStatusCreated

	// started and closing are left at their zero value (false).
	t := &grpcTunnel{
		stream:             stream,
		grpcConn:           c,
		pendingDial:        pendingDialManager{pendingDials: make(map[int64]pendingDial)},
		conns:              connectionManager{conns: make(map[int64]*conn)},
		readTimeoutSeconds: 10,
		done:               make(chan struct{}),
	}

	t.prevStatus.Store(initial)
	metrics.Metrics.GetClientConnectionsMetric().WithLabelValues(string(initial)).Inc()

	return t
}
// updateMetric moves this tunnel from its previous connection status to
// status in the client_connections gauge. It does nothing once the tunnel is
// done.
func (t *grpcTunnel) updateMetric(status metrics.ClientConnectionStatus) {
	select {
	case <-t.Done():
		// Tunnel already finished; leave the gauge alone.
		return
	default:
	}

	gauge := metrics.Metrics.GetClientConnectionsMetric()
	previous := t.prevStatus.Swap(status).(metrics.ClientConnectionStatus)

	gauge.WithLabelValues(string(previous)).Dec()
	gauge.WithLabelValues(string(status)).Inc()
}
// closeMetric should be called exactly once to finalize the
// client_connections metric: it decrements the gauge for the tunnel's last
// recorded status. It does nothing once the tunnel is done.
func (t *grpcTunnel) closeMetric() {
	select {
	case <-t.Done():
		return
	default:
	}

	last := t.prevStatus.Load().(metrics.ClientConnectionStatus)
	gauge := metrics.Metrics.GetClientConnectionsMetric()
	gauge.WithLabelValues(string(last)).Dec()
}
// serve is the tunnel's receive loop. It reads packets from the stream and
// dispatches them by type until the stream ends or fails, the tunnel is
// marked as closing, or a packet is handled that ends the single-use tunnel.
//
// On return it closes the gRPC connection, closes the read channel of every
// connection still registered, finalizes the client_connections metric and
// finally closes t.done.
func (t *grpcTunnel) serve(tunnelCtx context.Context) {
defer func() {
t.grpcConn.Close()
// A connection in t.conns after serve() returns means
// we never received a CLOSE_RSP for it, so we need to
// close any channels remaining for these connections.
t.conns.closeAll()
// closeMetric must run before t.done is closed: it is a no-op once
// the tunnel reports done.
t.closeMetric()
close(t.done)
}()
for {
pkt, err := t.Recv()
if err == io.EOF {
return
}
isClosing := t.isClosing()
if err != nil || pkt == nil {
// A read failure while the tunnel is being closed is expected, so it
// is only logged when the tunnel is not closing.
if !isClosing {
klog.ErrorS(err, "stream read failure")
}
return
}
if isClosing {
return
}
klog.V(5).InfoS("[tracing] recv packet", "type", pkt.Type)
switch pkt.Type {
case client.PacketType_DIAL_RSP:
resp := pkt.GetDialResponse()
pendingDial, ok := t.pendingDial.get(resp.Random)
if !ok {
// If the DIAL_RSP does not match a pending dial, it means one of two things:
// 1. There was a second DIAL_RSP for the connection request (this is very unlikely but possible)
// 2. grpcTunnel.DialContext() returned early due to a dial timeout or the client canceling the context
//
// In either scenario, we should return here and close the tunnel as it is no longer needed.
kvs := []interface{}{"dialID", resp.Random, "connectionID", resp.ConnectID}
if resp.Error != "" {
kvs = append(kvs, "error", resp.Error)
}
klog.V(1).InfoS("DialResp not recognized; dropped", kvs...)
return
}
result := dialResult{connid: resp.ConnectID}
if resp.Error != "" {
result.err = &dialFailure{resp.Error, metrics.DialFailureEndpoint}
} else {
t.updateMetric(metrics.ClientConnectionStatusOk)
}
select {
// try to send to the result channel
case pendingDial.resultCh <- result:
// unblock if the cancel channel is closed
case <-pendingDial.cancelCh:
// Note: this condition can only be hit by a race condition where the
// DialContext() returns early (timeout) after the pendingDial is already
// fetched here, but before the result is sent.
klog.V(1).InfoS("Pending dial has been cancelled; dropped", "connectionID", resp.ConnectID, "dialID", resp.Random)
return
case <-tunnelCtx.Done():
klog.V(1).InfoS("Tunnel has been closed; dropped", "connectionID", resp.ConnectID, "dialID", resp.Random)
return
}
if resp.Error != "" {
// On dial error, avoid leaking serve goroutine.
return
}
case client.PacketType_DIAL_CLS:
resp := pkt.GetCloseDial()
pendingDial, ok := t.pendingDial.get(resp.Random)
if !ok {
// If the DIAL_CLS does not match a pending dial, it means one of two things:
// 1. There was a DIAL_CLS received after a DIAL_RSP (unlikely but possible)
// 2. grpcTunnel.DialContext() returned early due to a dial timeout or the client canceling the context
//
// In either scenario, we should return here and close the tunnel as it is no longer needed.
klog.V(1).InfoS("DIAL_CLS after dial finished", "dialID", resp.Random)
} else {
result := dialResult{
err: &dialFailure{"dial closed", metrics.DialFailureDialClosed},
}
select {
case pendingDial.resultCh <- result:
case <-pendingDial.cancelCh:
// Note: this condition can only be hit by a race condition where the
// DialContext() returns early (timeout) after the pendingDial is already
// fetched here, but before the result is sent.
case <-tunnelCtx.Done():
}
}
return // Stop serving & close the tunnel.
case client.PacketType_DATA:
resp := pkt.GetData()
if resp.ConnectID == 0 {
klog.ErrorS(nil, "Received packet missing ConnectID", "packetType", "DATA")
continue
}
// TODO: flow control
conn, ok := t.conns.get(resp.ConnectID)
if !ok {
klog.ErrorS(nil, "Connection not recognized", "connectionID", resp.ConnectID, "packetType", "DATA")
// Ask the proxy to close the unknown connection; the send error is ignored.
t.sendCloseRequest(resp.ConnectID)
continue
}
// Bound how long the hand-off to the reader may block.
timer := time.NewTimer((time.Duration)(t.readTimeoutSeconds) * time.Second)
select {
case conn.readCh <- resp.Data:
timer.Stop()
case <-timer.C:
klog.ErrorS(fmt.Errorf("timeout"), "readTimeout has been reached, the grpc connection to the proxy server will be closed", "connectionID", conn.connID, "readTimeoutSeconds", t.readTimeoutSeconds)
return
case <-tunnelCtx.Done():
// There is no return here: the loop goes on to the next t.Recv().
// NOTE(review): presumably Recv fails once tunnelCtx is cancelled,
// since the stream is created with that context - confirm.
klog.V(1).InfoS("Tunnel has been closed, the grpc connection to the proxy server will be closed", "connectionID", conn.connID)
}
case client.PacketType_CLOSE_RSP:
resp := pkt.GetCloseResponse()
conn, ok := t.conns.get(resp.ConnectID)
if !ok {
klog.V(1).InfoS("Connection not recognized", "connectionID", resp.ConnectID, "packetType", "CLOSE_RSP")
continue
}
// Closing readCh makes pending and future Read calls return io.EOF.
close(conn.readCh)
// Hand any error message to conn.Close(), then close the channel.
conn.closeCh <- resp.Error
close(conn.closeCh)
// Removing the connection keeps the deferred closeAll() from closing
// readCh a second time.
t.conns.remove(resp.ConnectID)
return
}
}
}
// DialContext connects to the address on the named network, similar to
// what net.Dial does. The only supported protocol is tcp.
//
// Every failed dial is recorded in the dial-failure metric, using the reason
// extracted from the error (or "unknown" when the error carries none).
func (t *grpcTunnel) DialContext(requestCtx context.Context, protocol, address string) (net.Conn, error) {
	conn, err := t.dialContext(requestCtx, protocol, address)
	if err == nil {
		return conn, nil
	}

	_, reason := GetDialFailureReason(err)
	metrics.Metrics.ObserveDialFailure(reason)
	return conn, err
}
// dialContext performs the one dial a single-use tunnel supports.
//
// It marks the tunnel as started, registers a pending dial under a random
// dial ID, sends a DIAL_REQ and then waits for serve() to deliver the result.
// The wait ends when a result arrives, the 30 second backstop expires,
// requestCtx is done, or the tunnel is done. On timeout or context
// cancellation a best-effort DIAL_CLS is sent and the tunnel is closed in the
// background.
//
// It returns the established connection, or an error. Most failures are
// *dialFailure values carrying a metrics reason; a closed tunnel, an
// unsupported protocol, and a failed send are returned as plain errors.
func (t *grpcTunnel) dialContext(requestCtx context.Context, protocol, address string) (net.Conn, error) {
	prevStarted := atomic.SwapUint32(&t.started, 1)
	if prevStarted != 0 {
		return nil, &dialFailure{"single-use dialer already dialed", metrics.DialFailureAlreadyStarted}
	}

	select {
	case <-t.done:
		return nil, errors.New("tunnel is closed")
	default: // Tunnel is open, carry on.
	}

	if protocol != "tcp" {
		return nil, errors.New("protocol not supported")
	}

	t.updateMetric(metrics.ClientConnectionStatusDialing)

	random := rand.Int63() /* #nosec G404 */

	// This channel is closed once we're returning and no longer waiting on resultCh
	cancelCh := make(chan struct{})
	defer close(cancelCh)

	// This channel MUST NOT be buffered. The sender needs to know when we are not receiving things, so they can abort.
	resCh := make(chan dialResult)

	t.pendingDial.add(random, pendingDial{resultCh: resCh, cancelCh: cancelCh})
	defer t.pendingDial.remove(random)

	req := &client.Packet{
		Type: client.PacketType_DIAL_REQ,
		Payload: &client.Packet_DialRequest{
			DialRequest: &client.DialRequest{
				Protocol: protocol,
				Address:  address,
				Random:   random,
			},
		},
	}
	klog.V(5).InfoS("[tracing] send packet", "type", req.Type)

	err := t.Send(req)
	if err != nil {
		return nil, err
	}

	klog.V(5).Infoln("DIAL_REQ sent to proxy server")

	c := &conn{
		tunnel: t,
		random: random,
	}

	// abortDial tells the proxy to give up on this dial and then closes the
	// tunnel. It runs in the background so the caller is not blocked on the
	// send; the DIAL_CLS is best-effort, so its error is ignored.
	abortDial := func() {
		go func() {
			defer t.closeTunnel()
			t.sendDialClose(random)
		}()
	}

	// Use an explicit timer instead of time.After so the backstop is released
	// as soon as the dial completes rather than lingering for the full 30s.
	backstop := time.NewTimer(30 * time.Second)
	defer backstop.Stop()

	select {
	case res := <-resCh:
		if res.err != nil {
			return nil, res.err
		}
		c.connID = res.connid
		c.readCh = make(chan []byte, 10)
		c.closeCh = make(chan string, 1)
		t.conns.add(res.connid, c)
	case <-backstop.C:
		klog.V(5).InfoS("Timed out waiting for DialResp", "dialID", random)
		abortDial()
		return nil, &dialFailure{"dial timeout, backstop", metrics.DialFailureTimeout}
	case <-requestCtx.Done():
		klog.V(5).InfoS("Context canceled waiting for DialResp", "ctxErr", requestCtx.Err(), "dialID", random)
		abortDial()
		return nil, &dialFailure{"dial timeout, context", metrics.DialFailureContext}
	case <-t.done:
		klog.V(5).InfoS("Tunnel closed while waiting for DialResp", "dialID", random)
		return nil, &dialFailure{"tunnel closed", metrics.DialFailureTunnelClosed}
	}

	return c, nil
}
// Done returns the channel that serve() closes once it has finished cleaning
// up, i.e. when the tunnel is no longer serving.
func (t *grpcTunnel) Done() <-chan struct{} {
return t.done
}
// sendCloseRequest sends a CLOSE_REQ packet for the given connection ID and
// returns the result of the send.
func (t *grpcTunnel) sendCloseRequest(connID int64) error {
	closeReq := &client.CloseRequest{ConnectID: connID}
	req := &client.Packet{
		Type:    client.PacketType_CLOSE_REQ,
		Payload: &client.Packet_CloseRequest{CloseRequest: closeReq},
	}

	klog.V(5).InfoS("[tracing] send req", "type", req.Type)

	return t.Send(req)
}
// sendDialClose sends a DIAL_CLS packet for the given dial ID and returns the
// result of the send.
func (t *grpcTunnel) sendDialClose(dialID int64) error {
	closeDial := &client.CloseDial{Random: dialID}
	req := &client.Packet{
		Type:    client.PacketType_DIAL_CLS,
		Payload: &client.Packet_CloseDial{CloseDial: closeDial},
	}

	klog.V(5).InfoS("[tracing] send req", "type", req.Type)

	return t.Send(req)
}
// closeTunnel marks the tunnel as closing and closes the underlying gRPC
// connection. The flag is set before the connection is closed so that
// serve() sees isClosing() == true and does not log the resulting read
// failure as an error.
func (t *grpcTunnel) closeTunnel() {
atomic.StoreUint32(&t.closing, 1)
t.grpcConn.Close()
}
// isClosing reports whether closeTunnel() has been called.
func (t *grpcTunnel) isClosing() bool {
return atomic.LoadUint32(&t.closing) != 0
}
// Send writes pkt to the stream. Sends are serialized by sendLock. The packet
// is counted in the stream-packets metric before it is sent, and any send
// error other than io.EOF is recorded in the stream-errors metric. The send
// error, if any, is returned unchanged.
func (t *grpcTunnel) Send(pkt *client.Packet) error {
	const segment = commonmetrics.SegmentFromClient

	t.sendLock.Lock()
	defer t.sendLock.Unlock()

	metrics.Metrics.ObservePacket(segment, pkt.Type)

	sendErr := t.stream.Send(pkt)
	if sendErr == nil || sendErr == io.EOF {
		return sendErr
	}
	metrics.Metrics.ObserveStreamError(segment, sendErr, pkt.Type)
	return sendErr
}
// Recv reads the next packet from the stream. Receives are serialized by
// recvLock. A received packet is counted in the stream-packets metric; a
// receive error other than io.EOF is recorded in the stream-errors metric.
// On error the returned packet is nil.
func (t *grpcTunnel) Recv() (*client.Packet, error) {
	const segment = commonmetrics.SegmentToClient

	t.recvLock.Lock()
	defer t.recvLock.Unlock()

	pkt, recvErr := t.stream.Recv()
	if recvErr == nil {
		metrics.Metrics.ObservePacket(segment, pkt.Type)
		return pkt, nil
	}

	if recvErr != io.EOF {
		metrics.Metrics.ObserveStreamErrorNoPacket(segment, recvErr)
	}
	return nil, recvErr
}
// GetDialFailureReason reports whether err is (or wraps) a dial failure
// produced by this package and, if so, its reason. For any other error it
// returns false and metrics.DialFailureUnknown.
func GetDialFailureReason(err error) (isDialFailure bool, reason metrics.DialFailureReason) {
	var failure *dialFailure
	if !errors.As(err, &failure) {
		return false, metrics.DialFailureUnknown
	}
	return true, failure.reason
}
// dialFailure is the error type used for dial failures that carry a metrics
// reason; GetDialFailureReason extracts that reason.
type dialFailure struct {
// msg is the text returned by Error().
msg string
// reason classifies the failure for the dial-failure metric.
reason metrics.DialFailureReason
}
// Error implements the error interface and returns the failure message.
func (df *dialFailure) Error() string {
return df.msg
}

View File

@ -0,0 +1,157 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package client
import (
"errors"
"io"
"net"
"sync/atomic"
"time"
"k8s.io/klog/v2"
"sigs.k8s.io/apiserver-network-proxy/konnectivity-client/proto/client"
)
// CloseTimeout is how long conn.Close waits for a CLOSE_RSP packet after
// sending its close request before giving up.
const CloseTimeout = 10 * time.Second
// Sentinel errors returned by conn.Close.
var (
	// errConnTunnelClosed is returned when the tunnel finished before a
	// close response was received.
	errConnTunnelClosed = errors.New("tunnel closed")
	// errConnCloseTimeout is returned when no close response arrived within
	// CloseTimeout.
	errConnCloseTimeout = errors.New("close timeout")
)
// conn is an implementation of net.Conn, where the data is transported
// over an established tunnel defined by a gRPC service ProxyService.
type conn struct {
// tunnel is the tunnel this connection sends its packets through.
tunnel *grpcTunnel
// connID is set when a successful DIAL_RSP is received
connID int64
// random (dialID) is always initialized
random int64
// readCh delivers the payload of received DATA packets to Read; it is
// closed when the connection or the tunnel is torn down.
readCh chan []byte
// On receiving CLOSE_RSP, closeCh will be sent any error message and closed.
closeCh chan string
// rdata holds the part of the last received payload that did not fit into
// the caller's buffer on the previous Read.
rdata []byte
// closing is an atomic bool represented as a 0 or 1, and set to true when the connection is being closed.
// closing should only be accessed through atomic methods.
// TODO: switch this to an atomic.Bool once the client is exclusively built with go1.19+
closing uint32
}
// Compile-time check that *conn satisfies net.Conn.
var _ net.Conn = (*conn)(nil)
// Write sends data as a single DATA packet for this connection over the
// proxy service. It returns len(data) on success and 0 with the send error
// otherwise.
func (c *conn) Write(data []byte) (n int, err error) {
	payload := &client.Data{
		ConnectID: c.connID,
		Data:      data,
	}
	req := &client.Packet{
		Type:    client.PacketType_DATA,
		Payload: &client.Packet_Data{Data: payload},
	}

	klog.V(5).InfoS("[tracing] send req", "type", req.Type)

	if err = c.tunnel.Send(req); err != nil {
		return 0, err
	}
	return len(data), nil
}
// Read receives data from the connection over proxy service.
//
// Bytes left over from an earlier payload are served first; otherwise Read
// blocks until the next payload arrives on readCh. A nil payload (which is
// what a closed readCh yields) is reported as io.EOF. If the payload is
// larger than b, the remainder is kept for the next call.
func (c *conn) Read(b []byte) (n int, err error) {
	pending := c.rdata
	if pending == nil {
		pending = <-c.readCh
	}
	if pending == nil {
		return 0, io.EOF
	}

	// copy transfers min(len(b), len(pending)) bytes.
	n = copy(b, pending)
	if n < len(pending) {
		c.rdata = pending[n:]
	} else {
		c.rdata = nil
	}
	return n, nil
}
// LocalAddr is part of net.Conn. No local address is tracked for a tunneled
// connection, so it always returns nil.
func (c *conn) LocalAddr() net.Addr {
return nil
}
// RemoteAddr is part of net.Conn. No remote address is tracked for a
// tunneled connection, so it always returns nil.
func (c *conn) RemoteAddr() net.Addr {
return nil
}
// SetDeadline is part of net.Conn. Deadlines are not supported; it always
// returns a "not implemented" error.
func (c *conn) SetDeadline(t time.Time) error {
return errors.New("not implemented")
}
// SetReadDeadline is part of net.Conn. Read deadlines are not supported; it
// always returns a "not implemented" error.
func (c *conn) SetReadDeadline(t time.Time) error {
return errors.New("not implemented")
}
// SetWriteDeadline is part of net.Conn. Write deadlines are not supported;
// it always returns a "not implemented" error.
func (c *conn) SetWriteDeadline(t time.Time) error {
return errors.New("not implemented")
}
// Close closes the connection, sends a best-effort close signal to the proxy
// service, and closes the tunnel.
//
// Only the first call does any work; later calls return nil immediately.
// After sending the close request, Close waits for whichever happens first:
//   - a close response arrives: its error message, if any, is returned as an
//     error, otherwise nil;
//   - the tunnel finishes: errConnTunnelClosed is returned;
//   - CloseTimeout elapses: errConnCloseTimeout is returned.
//
// The tunnel is closed when Close returns, whatever the outcome.
func (c *conn) Close() error {
	if old := atomic.SwapUint32(&c.closing, 1); old != 0 {
		// prevent duplicate messages
		return nil
	}

	// Structured logging, consistent with the rest of the package; the
	// arguments are key/value pairs.
	klog.V(4).InfoS("closing connection", "dialID", c.random, "connectionID", c.connID)

	defer c.tunnel.closeTunnel()

	// The close signal is best-effort, so send errors are deliberately ignored.
	if c.connID != 0 {
		c.tunnel.sendCloseRequest(c.connID)
	} else {
		// Never received a DIAL response so no connection ID.
		c.tunnel.sendDialClose(c.random)
	}

	// An explicit timer (instead of time.After) is released as soon as Close
	// returns rather than lingering for the full CloseTimeout.
	timeout := time.NewTimer(CloseTimeout)
	defer timeout.Stop()

	select {
	case errMsg := <-c.closeCh:
		if errMsg != "" {
			return errors.New(errMsg)
		}
		return nil
	case <-c.tunnel.Done():
		return errConnTunnelClosed
	case <-timeout.C:
		return errConnCloseTimeout
	}
}

View File

@ -0,0 +1,164 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
commonmetrics "sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/common/metrics"
"sigs.k8s.io/apiserver-network-proxy/konnectivity-client/proto/client"
)
const (
// Namespace is the metric namespace used for all client metrics.
Namespace = "konnectivity_network_proxy"
// Subsystem is the metric subsystem used for all client metrics.
Subsystem = "client"
)
var (
// Metrics provides access to all client metrics. The client
// application is responsible for registering them (via
// Metrics.RegisterMetrics or Metrics.LegacyRegisterMetrics).
Metrics = newMetrics()
)
// ClientMetrics includes all the metrics of the konnectivity-client.
type ClientMetrics struct {
// registerOnce ensures the collectors are registered at most once; it is
// shared by RegisterMetrics and LegacyRegisterMetrics.
registerOnce sync.Once
// streamPackets counts packets by segment and packet type.
streamPackets *prometheus.CounterVec
// streamErrors counts gRPC stream errors by segment, code and packet type.
streamErrors *prometheus.CounterVec
// dialFailures counts dial failures by reason.
dialFailures *prometheus.CounterVec
// clientConns tracks the number of client connections by status.
clientConns *prometheus.GaugeVec
}
// DialFailureReason is the value of the "reason" label on the dial failure metric.
type DialFailureReason string
const (
// DialFailureUnknown is used when an error carries no dial failure reason.
DialFailureUnknown DialFailureReason = "unknown"
// DialFailureTimeout indicates the hard 30 second timeout was hit.
DialFailureTimeout DialFailureReason = "timeout"
// DialFailureContext indicates that the context was cancelled or reached its deadline before
// the dial response was returned.
DialFailureContext DialFailureReason = "context"
// DialFailureEndpoint indicates that the konnectivity-agent was unable to reach the backend endpoint.
DialFailureEndpoint DialFailureReason = "endpoint"
// DialFailureDialClosed indicates that the client received a CloseDial response, indicating the
// connection was closed before the dial could complete.
DialFailureDialClosed DialFailureReason = "dialclosed"
// DialFailureTunnelClosed indicates that the client connection was closed before the dial could
// complete.
DialFailureTunnelClosed DialFailureReason = "tunnelclosed"
// DialFailureAlreadyStarted indicates that a single-use tunnel dialer was already used once.
DialFailureAlreadyStarted DialFailureReason = "tunnelstarted"
)
// ClientConnectionStatus is the value of the "status" label on the
// client_connections metric.
type ClientConnectionStatus string
const (
// The connection is created but has not yet been dialed.
ClientConnectionStatusCreated ClientConnectionStatus = "created"
// The connection is pending dial response.
ClientConnectionStatusDialing ClientConnectionStatus = "dialing"
// The connection is established.
ClientConnectionStatusOk ClientConnectionStatus = "ok"
// The connection is closing.
ClientConnectionStatusClosing ClientConnectionStatus = "closing"
)
// newMetrics constructs the full set of client metrics. Nothing is
// registered here; registration is left to the client application.
func newMetrics() *ClientMetrics {
	m := &ClientMetrics{
		streamPackets: commonmetrics.MakeStreamPacketsTotalMetric(Namespace, Subsystem),
		streamErrors:  commonmetrics.MakeStreamErrorsTotalMetric(Namespace, Subsystem),
	}

	// The denominator (total dials started) for both
	// dial_failure_total and dial_duration_seconds is the
	// stream_packets_total (common metric), where segment is
	// "from_client" and packet_type is "DIAL_REQ".
	dialFailureOpts := prometheus.CounterOpts{
		Namespace: Namespace,
		Subsystem: Subsystem,
		Name:      "dial_failure_total",
		Help:      "Number of dial failures observed, by reason (example: remote endpoint error)",
	}
	m.dialFailures = prometheus.NewCounterVec(dialFailureOpts, []string{"reason"})

	clientConnOpts := prometheus.GaugeOpts{
		Namespace: Namespace,
		Subsystem: Subsystem,
		Name:      "client_connections",
		Help:      "Number of open client connections, by status (Example: dialing)",
	}
	m.clientConns = prometheus.NewGaugeVec(clientConnOpts, []string{"status"})

	return m
}
// RegisterMetrics registers all metrics with the client application's
// registerer r. Registration happens at most once per ClientMetrics; the
// guard is shared with LegacyRegisterMetrics, so later calls to either are
// no-ops.
func (c *ClientMetrics) RegisterMetrics(r prometheus.Registerer) {
	c.registerOnce.Do(func() {
		collectors := []prometheus.Collector{
			c.streamPackets,
			c.streamErrors,
			c.dialFailures,
			c.clientConns,
		}
		for _, collector := range collectors {
			r.MustRegister(collector)
		}
	})
}
// LegacyRegisterMetrics registers all metrics via the supplied MustRegister
// func, calling it once per collector. Registration happens at most once per
// ClientMetrics; the guard is shared with RegisterMetrics.
// TODO: remove this once https://github.com/kubernetes/kubernetes/pull/114293 is available.
func (c *ClientMetrics) LegacyRegisterMetrics(mustRegisterFn func(...prometheus.Collector)) {
	c.registerOnce.Do(func() {
		collectors := []prometheus.Collector{
			c.streamPackets,
			c.streamErrors,
			c.dialFailures,
			c.clientConns,
		}
		for _, collector := range collectors {
			mustRegisterFn(collector)
		}
	})
}
// Reset resets every metric vector held by c.
func (c *ClientMetrics) Reset() {
	vectors := []interface{ Reset() }{
		c.streamPackets,
		c.streamErrors,
		c.dialFailures,
		c.clientConns,
	}
	for _, vec := range vectors {
		vec.Reset()
	}
}
// ObserveDialFailure increments the dial failure counter for the given reason.
func (c *ClientMetrics) ObserveDialFailure(reason DialFailureReason) {
c.dialFailures.WithLabelValues(string(reason)).Inc()
}
// GetClientConnectionsMetric returns the client_connections gauge vector,
// which is labelled by connection status.
func (c *ClientMetrics) GetClientConnectionsMetric() *prometheus.GaugeVec {
return c.clientConns
}
// ObservePacket counts one packet of the given type on the given segment in
// the stream packets metric.
func (c *ClientMetrics) ObservePacket(segment commonmetrics.Segment, packetType client.PacketType) {
commonmetrics.ObservePacket(c.streamPackets, segment, packetType)
}
// ObserveStreamErrorNoPacket counts a stream error on the given segment for
// which no packet (and hence no packet type) is available.
func (c *ClientMetrics) ObserveStreamErrorNoPacket(segment commonmetrics.Segment, err error) {
commonmetrics.ObserveStreamErrorNoPacket(c.streamErrors, segment, err)
}
// ObserveStreamError counts a stream error on the given segment that occurred
// while handling a packet of the given type.
func (c *ClientMetrics) ObserveStreamError(segment commonmetrics.Segment, err error, packetType client.PacketType) {
commonmetrics.ObserveStreamError(c.streamErrors, segment, err, packetType)
}

View File

@ -0,0 +1,78 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package metrics provides metric definitions and helpers used
// across konnectivity client, server, and agent.
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/grpc/status"
"sigs.k8s.io/apiserver-network-proxy/konnectivity-client/proto/client"
)
// Segment identifies one of four tunnel segments (e.g. from server to agent).
// It is used as the value of the "segment" metric label.
type Segment string
const (
// SegmentFromClient indicates a packet from client to server.
SegmentFromClient Segment = "from_client"
// SegmentToClient indicates a packet from server to client.
SegmentToClient Segment = "to_client"
// SegmentFromAgent indicates a packet from agent to server.
SegmentFromAgent Segment = "from_agent"
// SegmentToAgent indicates a packet from server to agent.
SegmentToAgent Segment = "to_agent"
)
// MakeStreamPacketsTotalMetric creates the stream_packets_total counter
// vector under the given namespace and subsystem, labelled by segment and
// packet type. The returned vector is not registered.
func MakeStreamPacketsTotalMetric(namespace, subsystem string) *prometheus.CounterVec {
	opts := prometheus.CounterOpts{
		Namespace: namespace,
		Subsystem: subsystem,
		Name:      "stream_packets_total",
		Help:      "Count of packets processed, by segment and packet type (example: from_client, DIAL_REQ)",
	}
	labels := []string{"segment", "packet_type"}
	return prometheus.NewCounterVec(opts, labels)
}
// MakeStreamErrorsTotalMetric creates the stream_errors_total counter vector
// under the given namespace and subsystem, labelled by segment, gRPC code and
// packet type. The returned vector is not registered.
func MakeStreamErrorsTotalMetric(namespace, subsystem string) *prometheus.CounterVec {
	opts := prometheus.CounterOpts{
		Namespace: namespace,
		Subsystem: subsystem,
		Name:      "stream_errors_total",
		Help:      "Count of gRPC stream errors, by segment, grpc Code, packet type. (example: from_agent, Code.Unavailable, DIAL_RSP)",
	}
	labels := []string{"segment", "code", "packet_type"}
	return prometheus.NewCounterVec(opts, labels)
}
// ObservePacket increments m for the given segment and packet type. m is
// expected to carry the labels created by MakeStreamPacketsTotalMetric.
func ObservePacket(m *prometheus.CounterVec, segment Segment, packetType client.PacketType) {
m.WithLabelValues(string(segment), packetType.String()).Inc()
}
// ObserveStreamErrorNoPacket increments m for a stream error on the given
// segment when no packet is available. The "code" label is the gRPC status
// code derived from err and the packet type label is "Unknown".
func ObserveStreamErrorNoPacket(m *prometheus.CounterVec, segment Segment, err error) {
	m.WithLabelValues(
		string(segment),
		status.Code(err).String(),
		"Unknown",
	).Inc()
}
// ObserveStreamError increments m for a stream error on the given segment
// that occurred while handling a packet of the given type. The "code" label
// is the gRPC status code derived from err.
func ObserveStreamError(m *prometheus.CounterVec, segment Segment, err error, packetType client.PacketType) {
	m.WithLabelValues(
		string(segment),
		status.Code(err).String(),
		packetType.String(),
	).Inc()
}