2020-10-21 05:49:41 +00:00
|
|
|
/*
|
|
|
|
Copyright 2018 The Kubernetes Authors.
|
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package manager
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2021-06-25 05:02:01 +00:00
|
|
|
"errors"
|
2020-10-21 05:49:41 +00:00
|
|
|
"fmt"
|
|
|
|
"net"
|
|
|
|
"net/http"
|
2023-06-01 17:01:19 +00:00
|
|
|
"net/http/pprof"
|
2020-10-21 05:49:41 +00:00
|
|
|
"sync"
|
2021-12-08 13:50:47 +00:00
|
|
|
"sync/atomic"
|
2020-10-21 05:49:41 +00:00
|
|
|
"time"
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
"github.com/go-logr/logr"
|
2020-10-21 05:49:41 +00:00
|
|
|
"k8s.io/apimachinery/pkg/api/meta"
|
|
|
|
"k8s.io/apimachinery/pkg/runtime"
|
2021-06-25 05:02:01 +00:00
|
|
|
kerrors "k8s.io/apimachinery/pkg/util/errors"
|
2020-10-21 05:49:41 +00:00
|
|
|
"k8s.io/client-go/rest"
|
|
|
|
"k8s.io/client-go/tools/leaderelection"
|
|
|
|
"k8s.io/client-go/tools/leaderelection/resourcelock"
|
|
|
|
"k8s.io/client-go/tools/record"
|
|
|
|
|
|
|
|
"sigs.k8s.io/controller-runtime/pkg/cache"
|
|
|
|
"sigs.k8s.io/controller-runtime/pkg/client"
|
2021-06-25 05:02:01 +00:00
|
|
|
"sigs.k8s.io/controller-runtime/pkg/cluster"
|
2023-06-01 17:01:19 +00:00
|
|
|
"sigs.k8s.io/controller-runtime/pkg/config"
|
2020-10-21 05:49:41 +00:00
|
|
|
"sigs.k8s.io/controller-runtime/pkg/healthz"
|
2021-12-08 13:50:47 +00:00
|
|
|
"sigs.k8s.io/controller-runtime/pkg/internal/httpserver"
|
2021-06-25 05:02:01 +00:00
|
|
|
intrec "sigs.k8s.io/controller-runtime/pkg/internal/recorder"
|
2023-08-28 20:44:55 +00:00
|
|
|
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
|
2020-10-21 05:49:41 +00:00
|
|
|
"sigs.k8s.io/controller-runtime/pkg/webhook"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Timing defaults for leader election and graceful shutdown.
//
// Values taken from: https://github.com/kubernetes/component-base/blob/master/config/v1alpha1/defaults.go
const (
	defaultLeaseDuration          = 15 * time.Second
	defaultRenewDeadline          = 10 * time.Second
	defaultRetryPeriod            = 2 * time.Second
	defaultGracefulShutdownPeriod = 30 * time.Second
)

// Default URL paths of the health probe endpoints.
const (
	defaultReadinessEndpoint = "/readyz"
	defaultLivenessEndpoint  = "/healthz"
)
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// Compile-time check that *controllerManager satisfies Runnable.
var _ Runnable = (*controllerManager)(nil)
|
2020-10-21 05:49:41 +00:00
|
|
|
|
|
|
|
type controllerManager struct {
|
2021-12-08 13:50:47 +00:00
|
|
|
sync.Mutex
|
|
|
|
started bool
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
stopProcedureEngaged *int64
|
|
|
|
errChan chan error
|
|
|
|
runnables *runnables
|
2021-06-25 05:02:01 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// cluster holds a variety of methods to interact with a cluster. Required.
|
|
|
|
cluster cluster.Cluster
|
2020-10-21 05:49:41 +00:00
|
|
|
|
|
|
|
// recorderProvider is used to generate event recorders that will be injected into Controllers
|
|
|
|
// (and EventHandlers, Sources and Predicates).
|
2021-06-25 05:02:01 +00:00
|
|
|
recorderProvider *intrec.Provider
|
2020-10-21 05:49:41 +00:00
|
|
|
|
|
|
|
// resourceLock forms the basis for leader election
|
|
|
|
resourceLock resourcelock.Interface
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// leaderElectionReleaseOnCancel defines if the manager should step back from the leader lease
|
|
|
|
// on shutdown
|
|
|
|
leaderElectionReleaseOnCancel bool
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2023-08-28 20:44:55 +00:00
|
|
|
// metricsServer is used to serve prometheus metrics
|
|
|
|
metricsServer metricsserver.Server
|
2020-10-21 05:49:41 +00:00
|
|
|
|
|
|
|
// healthProbeListener is used to serve liveness probe
|
|
|
|
healthProbeListener net.Listener
|
|
|
|
|
|
|
|
// Readiness probe endpoint name
|
|
|
|
readinessEndpointName string
|
|
|
|
|
|
|
|
// Liveness probe endpoint name
|
|
|
|
livenessEndpointName string
|
|
|
|
|
|
|
|
// Readyz probe handler
|
|
|
|
readyzHandler *healthz.Handler
|
|
|
|
|
|
|
|
// Healthz probe handler
|
|
|
|
healthzHandler *healthz.Handler
|
|
|
|
|
2023-06-01 17:01:19 +00:00
|
|
|
// pprofListener is used to serve pprof
|
|
|
|
pprofListener net.Listener
|
|
|
|
|
|
|
|
// controllerConfig are the global controller options.
|
|
|
|
controllerConfig config.Controller
|
2021-06-25 05:02:01 +00:00
|
|
|
|
|
|
|
// Logger is the logger that should be used by this manager.
|
|
|
|
// If none is set, it defaults to log.Log global logger.
|
|
|
|
logger logr.Logger
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// leaderElectionStopped is an internal channel used to signal the stopping procedure that the
|
|
|
|
// LeaderElection.Run(...) function has returned and the shutdown can proceed.
|
|
|
|
leaderElectionStopped chan struct{}
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// leaderElectionCancel is used to cancel the leader election. It is distinct from internalStopper,
|
|
|
|
// because for safety reasons we need to os.Exit() when we lose the leader election, meaning that
|
|
|
|
// it must be deferred until after gracefulShutdown is done.
|
|
|
|
leaderElectionCancel context.CancelFunc
|
2020-10-21 05:49:41 +00:00
|
|
|
|
|
|
|
// elected is closed when this manager becomes the leader of a group of
|
|
|
|
// managers, either because it won a leader election or because no leader
|
|
|
|
// election was configured.
|
|
|
|
elected chan struct{}
|
|
|
|
|
2023-06-01 17:01:19 +00:00
|
|
|
webhookServer webhook.Server
|
2021-06-25 05:02:01 +00:00
|
|
|
// webhookServerOnce will be called in GetWebhookServer() to optionally initialize
|
|
|
|
// webhookServer if unset, and Add() it to controllerManager.
|
|
|
|
webhookServerOnce sync.Once
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2023-02-01 17:06:36 +00:00
|
|
|
// leaderElectionID is the name of the resource that leader election
|
|
|
|
// will use for holding the leader lock.
|
|
|
|
leaderElectionID string
|
2020-10-21 05:49:41 +00:00
|
|
|
// leaseDuration is the duration that non-leader candidates will
|
|
|
|
// wait to force acquire leadership.
|
|
|
|
leaseDuration time.Duration
|
2021-06-25 05:02:01 +00:00
|
|
|
// renewDeadline is the duration that the acting controlplane will retry
|
2020-10-21 05:49:41 +00:00
|
|
|
// refreshing leadership before giving up.
|
|
|
|
renewDeadline time.Duration
|
|
|
|
// retryPeriod is the duration the LeaderElector clients should wait
|
|
|
|
// between tries of actions.
|
|
|
|
retryPeriod time.Duration
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// gracefulShutdownTimeout is the duration given to runnable to stop
|
|
|
|
// before the manager actually returns on stop.
|
|
|
|
gracefulShutdownTimeout time.Duration
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// onStoppedLeading is callled when the leader election lease is lost.
|
|
|
|
// It can be overridden for tests.
|
|
|
|
onStoppedLeading func()
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// shutdownCtx is the context that can be used during shutdown. It will be cancelled
|
|
|
|
// after the gracefulShutdownTimeout ended. It must not be accessed before internalStop
|
|
|
|
// is closed because it will be nil.
|
|
|
|
shutdownCtx context.Context
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
internalCtx context.Context
|
|
|
|
internalCancel context.CancelFunc
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// internalProceduresStop channel is used internally to the manager when coordinating
|
|
|
|
// the proper shutdown of servers. This channel is also used for dependency injection.
|
|
|
|
internalProceduresStop chan struct{}
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
type hasCache interface {
|
|
|
|
Runnable
|
|
|
|
GetCache() cache.Cache
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Add sets dependencies on i, and adds it to the list of Runnables to start.
|
|
|
|
func (cm *controllerManager) Add(r Runnable) error {
|
2021-12-08 13:50:47 +00:00
|
|
|
cm.Lock()
|
|
|
|
defer cm.Unlock()
|
|
|
|
return cm.add(r)
|
|
|
|
}
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
func (cm *controllerManager) add(r Runnable) error {
|
|
|
|
return cm.runnables.Add(r)
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
2024-05-13 20:57:03 +00:00
|
|
|
// AddMetricsServerExtraHandler adds extra handler served on path to the http server that serves metrics.
|
|
|
|
func (cm *controllerManager) AddMetricsServerExtraHandler(path string, handler http.Handler) error {
|
|
|
|
cm.Lock()
|
|
|
|
defer cm.Unlock()
|
|
|
|
if cm.started {
|
|
|
|
return fmt.Errorf("unable to add new metrics handler because metrics endpoint has already been created")
|
|
|
|
}
|
|
|
|
if cm.metricsServer == nil {
|
2024-08-12 20:39:28 +00:00
|
|
|
cm.GetLogger().Info("warn: metrics server is currently disabled, registering extra handler will be ignored", "path", path)
|
2024-05-13 20:57:03 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
if err := cm.metricsServer.AddExtraHandler(path, handler); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
cm.logger.V(2).Info("Registering metrics http server extra handler", "path", path)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// AddHealthzCheck allows you to add Healthz checker.
|
2020-10-21 05:49:41 +00:00
|
|
|
func (cm *controllerManager) AddHealthzCheck(name string, check healthz.Checker) error {
|
2021-12-08 13:50:47 +00:00
|
|
|
cm.Lock()
|
|
|
|
defer cm.Unlock()
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
if cm.started {
|
2020-10-21 05:49:41 +00:00
|
|
|
return fmt.Errorf("unable to add new checker because healthz endpoint has already been created")
|
|
|
|
}
|
|
|
|
|
|
|
|
if cm.healthzHandler == nil {
|
|
|
|
cm.healthzHandler = &healthz.Handler{Checks: map[string]healthz.Checker{}}
|
|
|
|
}
|
|
|
|
|
|
|
|
cm.healthzHandler.Checks[name] = check
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// AddReadyzCheck allows you to add Readyz checker.
|
2020-10-21 05:49:41 +00:00
|
|
|
func (cm *controllerManager) AddReadyzCheck(name string, check healthz.Checker) error {
|
2021-12-08 13:50:47 +00:00
|
|
|
cm.Lock()
|
|
|
|
defer cm.Unlock()
|
2021-06-25 05:02:01 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
if cm.started {
|
|
|
|
return fmt.Errorf("unable to add new checker because healthz endpoint has already been created")
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if cm.readyzHandler == nil {
|
|
|
|
cm.readyzHandler = &healthz.Handler{Checks: map[string]healthz.Checker{}}
|
|
|
|
}
|
|
|
|
|
|
|
|
cm.readyzHandler.Checks[name] = check
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2023-06-01 17:01:19 +00:00
|
|
|
func (cm *controllerManager) GetHTTPClient() *http.Client {
|
|
|
|
return cm.cluster.GetHTTPClient()
|
|
|
|
}
|
|
|
|
|
2020-10-21 05:49:41 +00:00
|
|
|
func (cm *controllerManager) GetConfig() *rest.Config {
|
2021-06-25 05:02:01 +00:00
|
|
|
return cm.cluster.GetConfig()
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) GetClient() client.Client {
|
2021-06-25 05:02:01 +00:00
|
|
|
return cm.cluster.GetClient()
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) GetScheme() *runtime.Scheme {
|
2021-06-25 05:02:01 +00:00
|
|
|
return cm.cluster.GetScheme()
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) GetFieldIndexer() client.FieldIndexer {
|
2021-06-25 05:02:01 +00:00
|
|
|
return cm.cluster.GetFieldIndexer()
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) GetCache() cache.Cache {
|
2021-06-25 05:02:01 +00:00
|
|
|
return cm.cluster.GetCache()
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) GetEventRecorderFor(name string) record.EventRecorder {
|
2021-06-25 05:02:01 +00:00
|
|
|
return cm.cluster.GetEventRecorderFor(name)
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) GetRESTMapper() meta.RESTMapper {
|
2021-06-25 05:02:01 +00:00
|
|
|
return cm.cluster.GetRESTMapper()
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) GetAPIReader() client.Reader {
|
2021-06-25 05:02:01 +00:00
|
|
|
return cm.cluster.GetAPIReader()
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
2023-06-01 17:01:19 +00:00
|
|
|
func (cm *controllerManager) GetWebhookServer() webhook.Server {
|
2021-06-25 05:02:01 +00:00
|
|
|
cm.webhookServerOnce.Do(func() {
|
|
|
|
if cm.webhookServer == nil {
|
2023-06-01 17:01:19 +00:00
|
|
|
panic("webhook should not be nil")
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
if err := cm.Add(cm.webhookServer); err != nil {
|
2021-12-08 13:50:47 +00:00
|
|
|
panic(fmt.Sprintf("unable to add webhook server to the controller manager: %s", err))
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
2021-06-25 05:02:01 +00:00
|
|
|
})
|
2020-10-21 05:49:41 +00:00
|
|
|
return cm.webhookServer
|
|
|
|
}
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
func (cm *controllerManager) GetLogger() logr.Logger {
|
|
|
|
return cm.logger
|
|
|
|
}
|
|
|
|
|
2023-06-01 17:01:19 +00:00
|
|
|
func (cm *controllerManager) GetControllerOptions() config.Controller {
|
|
|
|
return cm.controllerConfig
|
2021-06-25 05:02:01 +00:00
|
|
|
}
|
|
|
|
|
2023-08-28 20:44:55 +00:00
|
|
|
func (cm *controllerManager) addHealthProbeServer() error {
|
2023-06-01 17:01:19 +00:00
|
|
|
mux := http.NewServeMux()
|
|
|
|
srv := httpserver.New(mux)
|
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
if cm.readyzHandler != nil {
|
|
|
|
mux.Handle(cm.readinessEndpointName, http.StripPrefix(cm.readinessEndpointName, cm.readyzHandler))
|
|
|
|
// Append '/' suffix to handle subpaths
|
|
|
|
mux.Handle(cm.readinessEndpointName+"/", http.StripPrefix(cm.readinessEndpointName, cm.readyzHandler))
|
|
|
|
}
|
|
|
|
if cm.healthzHandler != nil {
|
|
|
|
mux.Handle(cm.livenessEndpointName, http.StripPrefix(cm.livenessEndpointName, cm.healthzHandler))
|
|
|
|
// Append '/' suffix to handle subpaths
|
|
|
|
mux.Handle(cm.livenessEndpointName+"/", http.StripPrefix(cm.livenessEndpointName, cm.healthzHandler))
|
|
|
|
}
|
2021-06-25 05:02:01 +00:00
|
|
|
|
2024-05-13 20:57:03 +00:00
|
|
|
return cm.add(&Server{
|
|
|
|
Name: "health probe",
|
2023-08-28 20:44:55 +00:00
|
|
|
Server: srv,
|
|
|
|
Listener: cm.healthProbeListener,
|
|
|
|
})
|
2021-12-08 13:50:47 +00:00
|
|
|
}
|
2021-06-25 05:02:01 +00:00
|
|
|
|
2023-06-01 17:01:19 +00:00
|
|
|
func (cm *controllerManager) addPprofServer() error {
|
|
|
|
mux := http.NewServeMux()
|
|
|
|
srv := httpserver.New(mux)
|
|
|
|
|
|
|
|
mux.HandleFunc("/debug/pprof/", pprof.Index)
|
|
|
|
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
|
|
|
|
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
|
|
|
|
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
|
|
|
|
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
|
|
|
|
|
2024-05-13 20:57:03 +00:00
|
|
|
return cm.add(&Server{
|
|
|
|
Name: "pprof",
|
2023-06-01 17:01:19 +00:00
|
|
|
Server: srv,
|
|
|
|
Listener: cm.pprofListener,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// Start starts the manager and waits indefinitely.
|
|
|
|
// There is only two ways to have start return:
|
|
|
|
// An error has occurred during in one of the internal operations,
|
|
|
|
// such as leader election, cache start, webhooks, and so on.
|
|
|
|
// Or, the context is cancelled.
|
2021-06-25 05:02:01 +00:00
|
|
|
func (cm *controllerManager) Start(ctx context.Context) (err error) {
|
2021-12-08 13:50:47 +00:00
|
|
|
cm.Lock()
|
|
|
|
if cm.started {
|
|
|
|
cm.Unlock()
|
|
|
|
return errors.New("manager already started")
|
2021-06-25 05:02:01 +00:00
|
|
|
}
|
2023-02-01 17:06:36 +00:00
|
|
|
cm.started = true
|
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
var ready bool
|
|
|
|
defer func() {
|
|
|
|
// Only unlock the manager if we haven't reached
|
|
|
|
// the internal readiness condition.
|
|
|
|
if !ready {
|
|
|
|
cm.Unlock()
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
// Initialize the internal context.
|
2021-06-25 05:02:01 +00:00
|
|
|
cm.internalCtx, cm.internalCancel = context.WithCancel(ctx)
|
|
|
|
|
2024-08-19 08:02:11 +00:00
|
|
|
// Leader elector must be created before defer that contains engageStopProcedure function
|
|
|
|
// https://github.com/kubernetes-sigs/controller-runtime/issues/2873
|
|
|
|
var leaderElector *leaderelection.LeaderElector
|
|
|
|
if cm.resourceLock != nil {
|
|
|
|
leaderElector, err = cm.initLeaderElector()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed during initialization leader election process: %w", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-25 05:02:01 +00:00
|
|
|
// This chan indicates that stop is complete, in other words all runnables have returned or timeout on stop request
|
|
|
|
stopComplete := make(chan struct{})
|
|
|
|
defer close(stopComplete)
|
|
|
|
// This must be deferred after closing stopComplete, otherwise we deadlock.
|
|
|
|
defer func() {
|
|
|
|
// https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/gettyimages-459889618-1533579787.jpg
|
|
|
|
stopErr := cm.engageStopProcedure(stopComplete)
|
|
|
|
if stopErr != nil {
|
|
|
|
if err != nil {
|
|
|
|
// Utilerrors.Aggregate allows to use errors.Is for all contained errors
|
|
|
|
// whereas fmt.Errorf allows wrapping at most one error which means the
|
|
|
|
// other one can not be found anymore.
|
|
|
|
err = kerrors.NewAggregate([]error{err, stopErr})
|
|
|
|
} else {
|
|
|
|
err = stopErr
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// Add the cluster runnable.
|
|
|
|
if err := cm.add(cm.cluster); err != nil {
|
|
|
|
return fmt.Errorf("failed to add cluster to runnables: %w", err)
|
|
|
|
}
|
2020-10-21 05:49:41 +00:00
|
|
|
|
|
|
|
// Metrics should be served whether the controller is leader or not.
|
|
|
|
// (If we don't serve metrics for non-leaders, prometheus will still scrape
|
2021-12-08 13:50:47 +00:00
|
|
|
// the pod but will get a connection refused).
|
2023-08-28 20:44:55 +00:00
|
|
|
if cm.metricsServer != nil {
|
|
|
|
// Note: We are adding the metrics server directly to HTTPServers here as matching on the
|
|
|
|
// metricsserver.Server interface in cm.runnables.Add would be very brittle.
|
|
|
|
if err := cm.runnables.HTTPServers.Add(cm.metricsServer, nil); err != nil {
|
2023-06-01 17:01:19 +00:00
|
|
|
return fmt.Errorf("failed to add metrics server: %w", err)
|
|
|
|
}
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// Serve health probes.
|
2020-10-21 05:49:41 +00:00
|
|
|
if cm.healthProbeListener != nil {
|
2023-08-28 20:44:55 +00:00
|
|
|
if err := cm.addHealthProbeServer(); err != nil {
|
|
|
|
return fmt.Errorf("failed to add health probe server: %w", err)
|
|
|
|
}
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
2023-06-01 17:01:19 +00:00
|
|
|
// Add pprof server
|
|
|
|
if cm.pprofListener != nil {
|
|
|
|
if err := cm.addPprofServer(); err != nil {
|
|
|
|
return fmt.Errorf("failed to add pprof server: %w", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-05-13 20:57:03 +00:00
|
|
|
// First start any HTTP servers, which includes health probes, metrics and profiling if enabled.
|
2023-08-28 20:44:55 +00:00
|
|
|
//
|
2024-05-13 20:57:03 +00:00
|
|
|
// WARNING: HTTPServers includes the health probes, which MUST start before any cache is populated, otherwise
|
|
|
|
// it would block conversion webhooks to be ready for serving which make the cache never get ready.
|
|
|
|
logCtx := logr.NewContext(cm.internalCtx, cm.logger)
|
|
|
|
if err := cm.runnables.HTTPServers.Start(logCtx); err != nil {
|
|
|
|
return fmt.Errorf("failed to start HTTP servers: %w", err)
|
2023-08-28 20:44:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Start any webhook servers, which includes conversion, validation, and defaulting
|
2021-12-08 13:50:47 +00:00
|
|
|
// webhooks that are registered.
|
|
|
|
//
|
|
|
|
// WARNING: Webhooks MUST start before any cache is populated, otherwise there is a race condition
|
2021-11-15 20:30:19 +00:00
|
|
|
// between conversion webhooks and the cache sync (usually initial list) which causes the webhooks
|
|
|
|
// to never start because no cache can be populated.
|
2021-12-08 13:50:47 +00:00
|
|
|
if err := cm.runnables.Webhooks.Start(cm.internalCtx); err != nil {
|
2024-05-13 20:57:03 +00:00
|
|
|
return fmt.Errorf("failed to start webhooks: %w", err)
|
2021-12-08 13:50:47 +00:00
|
|
|
}
|
2021-11-15 20:30:19 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// Start and wait for caches.
|
|
|
|
if err := cm.runnables.Caches.Start(cm.internalCtx); err != nil {
|
2024-05-13 20:57:03 +00:00
|
|
|
return fmt.Errorf("failed to start caches: %w", err)
|
2021-12-08 13:50:47 +00:00
|
|
|
}
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// Start the non-leaderelection Runnables after the cache has synced.
|
|
|
|
if err := cm.runnables.Others.Start(cm.internalCtx); err != nil {
|
2024-05-13 20:57:03 +00:00
|
|
|
return fmt.Errorf("failed to start other runnables: %w", err)
|
2021-12-08 13:50:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Start the leader election and all required runnables.
|
|
|
|
{
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
cm.leaderElectionCancel = cancel
|
2024-08-19 08:02:11 +00:00
|
|
|
if leaderElector != nil {
|
|
|
|
// Start the leader elector process
|
|
|
|
go func() {
|
|
|
|
leaderElector.Run(ctx)
|
|
|
|
<-ctx.Done()
|
|
|
|
close(cm.leaderElectionStopped)
|
|
|
|
}()
|
|
|
|
} else {
|
|
|
|
go func() {
|
2021-12-08 13:50:47 +00:00
|
|
|
// Treat not having leader election enabled the same as being elected.
|
|
|
|
if err := cm.startLeaderElectionRunnables(); err != nil {
|
|
|
|
cm.errChan <- err
|
|
|
|
}
|
|
|
|
close(cm.elected)
|
2024-08-19 08:02:11 +00:00
|
|
|
}()
|
|
|
|
}
|
2021-12-08 13:50:47 +00:00
|
|
|
}
|
2020-10-21 05:49:41 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
ready = true
|
|
|
|
cm.Unlock()
|
2020-10-21 05:49:41 +00:00
|
|
|
select {
|
2021-06-25 05:02:01 +00:00
|
|
|
case <-ctx.Done():
|
2020-10-21 05:49:41 +00:00
|
|
|
// We are done
|
|
|
|
return nil
|
2021-06-25 05:02:01 +00:00
|
|
|
case err := <-cm.errChan:
|
|
|
|
// Error starting or running a runnable
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// engageStopProcedure signals all runnables to stop, reads potential errors
|
|
|
|
// from the errChan and waits for them to end. It must not be called more than once.
|
|
|
|
func (cm *controllerManager) engageStopProcedure(stopComplete <-chan struct{}) error {
|
2021-12-08 13:50:47 +00:00
|
|
|
if !atomic.CompareAndSwapInt64(cm.stopProcedureEngaged, 0, 1) {
|
|
|
|
return errors.New("stop procedure already engaged")
|
2021-06-25 05:02:01 +00:00
|
|
|
}
|
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// Populate the shutdown context, this operation MUST be done before
|
|
|
|
// closing the internalProceduresStop channel.
|
|
|
|
//
|
|
|
|
// The shutdown context immediately expires if the gracefulShutdownTimeout is not set.
|
|
|
|
var shutdownCancel context.CancelFunc
|
2023-04-18 08:08:00 +00:00
|
|
|
if cm.gracefulShutdownTimeout < 0 {
|
|
|
|
// We want to wait forever for the runnables to stop.
|
|
|
|
cm.shutdownCtx, shutdownCancel = context.WithCancel(context.Background())
|
|
|
|
} else {
|
|
|
|
cm.shutdownCtx, shutdownCancel = context.WithTimeout(context.Background(), cm.gracefulShutdownTimeout)
|
|
|
|
}
|
2021-12-08 13:50:47 +00:00
|
|
|
defer shutdownCancel()
|
2021-06-25 05:02:01 +00:00
|
|
|
|
|
|
|
// Start draining the errors before acquiring the lock to make sure we don't deadlock
|
|
|
|
// if something that has the lock is blocked on trying to write into the unbuffered
|
|
|
|
// channel after something else already wrote into it.
|
2021-12-08 13:50:47 +00:00
|
|
|
var closeOnce sync.Once
|
2021-06-25 05:02:01 +00:00
|
|
|
go func() {
|
|
|
|
for {
|
2021-12-08 13:50:47 +00:00
|
|
|
// Closing in the for loop is required to avoid race conditions between
|
|
|
|
// the closure of all internal procedures and making sure to have a reader off the error channel.
|
|
|
|
closeOnce.Do(func() {
|
|
|
|
// Cancel the internal stop channel and wait for the procedures to stop and complete.
|
|
|
|
close(cm.internalProceduresStop)
|
|
|
|
cm.internalCancel()
|
|
|
|
})
|
2021-06-25 05:02:01 +00:00
|
|
|
select {
|
2024-08-19 08:02:11 +00:00
|
|
|
case err := <-cm.errChan:
|
|
|
|
if !errors.Is(err, context.Canceled) {
|
2021-06-25 05:02:01 +00:00
|
|
|
cm.logger.Error(err, "error received after stop sequence was engaged")
|
|
|
|
}
|
|
|
|
case <-stopComplete:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// We want to close this after the other runnables stop, because we don't
|
2021-06-25 05:02:01 +00:00
|
|
|
// want things like leader election to try and emit events on a closed
|
|
|
|
// channel
|
|
|
|
defer cm.recorderProvider.Stop(cm.shutdownCtx)
|
|
|
|
defer func() {
|
2021-12-08 13:50:47 +00:00
|
|
|
// Cancel leader election only after we waited. It will os.Exit() the app for safety.
|
|
|
|
if cm.resourceLock != nil {
|
2021-06-25 05:02:01 +00:00
|
|
|
// After asking the context to be cancelled, make sure
|
|
|
|
// we wait for the leader stopped channel to be closed, otherwise
|
|
|
|
// we might encounter race conditions between this code
|
|
|
|
// and the event recorder, which is used within leader election code.
|
|
|
|
cm.leaderElectionCancel()
|
|
|
|
<-cm.leaderElectionStopped
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
go func() {
|
2021-12-08 13:50:47 +00:00
|
|
|
// First stop the non-leader election runnables.
|
|
|
|
cm.logger.Info("Stopping and waiting for non leader election runnables")
|
|
|
|
cm.runnables.Others.StopAndWait(cm.shutdownCtx)
|
|
|
|
|
|
|
|
// Stop all the leader election runnables, which includes reconcilers.
|
|
|
|
cm.logger.Info("Stopping and waiting for leader election runnables")
|
2024-04-08 20:57:45 +00:00
|
|
|
// Prevent leader election when shutting down a non-elected manager
|
|
|
|
cm.runnables.LeaderElection.startOnce.Do(func() {})
|
2021-12-08 13:50:47 +00:00
|
|
|
cm.runnables.LeaderElection.StopAndWait(cm.shutdownCtx)
|
|
|
|
|
|
|
|
// Stop the caches before the leader election runnables, this is an important
|
|
|
|
// step to make sure that we don't race with the reconcilers by receiving more events
|
|
|
|
// from the API servers and enqueueing them.
|
|
|
|
cm.logger.Info("Stopping and waiting for caches")
|
|
|
|
cm.runnables.Caches.StopAndWait(cm.shutdownCtx)
|
|
|
|
|
2023-08-28 20:44:55 +00:00
|
|
|
// Webhooks and internal HTTP servers should come last, as they might be still serving some requests.
|
2021-12-08 13:50:47 +00:00
|
|
|
cm.logger.Info("Stopping and waiting for webhooks")
|
|
|
|
cm.runnables.Webhooks.StopAndWait(cm.shutdownCtx)
|
|
|
|
|
2023-08-28 20:44:55 +00:00
|
|
|
cm.logger.Info("Stopping and waiting for HTTP servers")
|
|
|
|
cm.runnables.HTTPServers.StopAndWait(cm.shutdownCtx)
|
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
// Proceed to close the manager and overall shutdown context.
|
|
|
|
cm.logger.Info("Wait completed, proceeding to shutdown the manager")
|
2021-06-25 05:02:01 +00:00
|
|
|
shutdownCancel()
|
|
|
|
}()
|
|
|
|
|
|
|
|
<-cm.shutdownCtx.Done()
|
2023-02-01 17:06:36 +00:00
|
|
|
if err := cm.shutdownCtx.Err(); err != nil && !errors.Is(err, context.Canceled) {
|
2021-12-08 13:50:47 +00:00
|
|
|
if errors.Is(err, context.DeadlineExceeded) {
|
|
|
|
if cm.gracefulShutdownTimeout > 0 {
|
|
|
|
return fmt.Errorf("failed waiting for all runnables to end within grace period of %s: %w", cm.gracefulShutdownTimeout, err)
|
|
|
|
}
|
|
|
|
return nil
|
2021-06-25 05:02:01 +00:00
|
|
|
}
|
2021-12-08 13:50:47 +00:00
|
|
|
// For any other error, return the error.
|
|
|
|
return err
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
2023-02-01 17:06:36 +00:00
|
|
|
|
2021-12-08 13:50:47 +00:00
|
|
|
return nil
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
2024-08-19 08:02:11 +00:00
|
|
|
func (cm *controllerManager) initLeaderElector() (*leaderelection.LeaderElector, error) {
|
|
|
|
leaderElector, err := leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{
|
2020-10-21 05:49:41 +00:00
|
|
|
Lock: cm.resourceLock,
|
|
|
|
LeaseDuration: cm.leaseDuration,
|
|
|
|
RenewDeadline: cm.renewDeadline,
|
|
|
|
RetryPeriod: cm.retryPeriod,
|
|
|
|
Callbacks: leaderelection.LeaderCallbacks{
|
|
|
|
OnStartedLeading: func(_ context.Context) {
|
2021-12-08 13:50:47 +00:00
|
|
|
if err := cm.startLeaderElectionRunnables(); err != nil {
|
|
|
|
cm.errChan <- err
|
|
|
|
return
|
|
|
|
}
|
2021-06-25 05:02:01 +00:00
|
|
|
close(cm.elected)
|
2020-10-21 05:49:41 +00:00
|
|
|
},
|
2021-12-08 13:50:47 +00:00
|
|
|
OnStoppedLeading: func() {
|
|
|
|
if cm.onStoppedLeading != nil {
|
|
|
|
cm.onStoppedLeading()
|
|
|
|
}
|
|
|
|
// Make sure graceful shutdown is skipped if we lost the leader lock without
|
|
|
|
// intending to.
|
|
|
|
cm.gracefulShutdownTimeout = time.Duration(0)
|
|
|
|
// Most implementations of leader election log.Fatal() here.
|
|
|
|
// Since Start is wrapped in log.Fatal when called, we can just return
|
|
|
|
// an error here which will cause the program to exit.
|
|
|
|
cm.errChan <- errors.New("leader election lost")
|
|
|
|
},
|
2020-10-21 05:49:41 +00:00
|
|
|
},
|
2021-06-25 05:02:01 +00:00
|
|
|
ReleaseOnCancel: cm.leaderElectionReleaseOnCancel,
|
2023-02-01 17:06:36 +00:00
|
|
|
Name: cm.leaderElectionID,
|
2020-10-21 05:49:41 +00:00
|
|
|
})
|
|
|
|
if err != nil {
|
2024-08-19 08:02:11 +00:00
|
|
|
return nil, err
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
2024-08-19 08:02:11 +00:00
|
|
|
return leaderElector, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) startLeaderElectionRunnables() error {
|
|
|
|
return cm.runnables.LeaderElection.Start(cm.internalCtx)
|
2020-10-21 05:49:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (cm *controllerManager) Elected() <-chan struct{} {
|
|
|
|
return cm.elected
|
|
|
|
}
|