mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-06-13 18:43:34 +00:00
Update to kube v1.17
Signed-off-by: Humble Chirammal <hchiramm@redhat.com>
This commit is contained in:
committed by
mergify[bot]
parent
327fcd1b1b
commit
3af1e26d7c
80
vendor/go.etcd.io/etcd/raft/bootstrap.go
generated
vendored
Normal file
80
vendor/go.etcd.io/etcd/raft/bootstrap.go
generated
vendored
Normal file
@ -0,0 +1,80 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
)
|
||||
|
||||
// Bootstrap initializes the RawNode for first use by appending configuration
|
||||
// changes for the supplied peers. This method returns an error if the Storage
|
||||
// is nonempty.
|
||||
//
|
||||
// It is recommended that instead of calling this method, applications bootstrap
|
||||
// their state manually by setting up a Storage that has a first index > 1 and
|
||||
// which stores the desired ConfState as its InitialState.
|
||||
func (rn *RawNode) Bootstrap(peers []Peer) error {
|
||||
if len(peers) == 0 {
|
||||
return errors.New("must provide at least one peer to Bootstrap")
|
||||
}
|
||||
lastIndex, err := rn.raft.raftLog.storage.LastIndex()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if lastIndex != 0 {
|
||||
return errors.New("can't bootstrap a nonempty Storage")
|
||||
}
|
||||
|
||||
// We've faked out initial entries above, but nothing has been
|
||||
// persisted. Start with an empty HardState (thus the first Ready will
|
||||
// emit a HardState update for the app to persist).
|
||||
rn.prevHardSt = emptyState
|
||||
|
||||
// TODO(tbg): remove StartNode and give the application the right tools to
|
||||
// bootstrap the initial membership in a cleaner way.
|
||||
rn.raft.becomeFollower(1, None)
|
||||
ents := make([]pb.Entry, len(peers))
|
||||
for i, peer := range peers {
|
||||
cc := pb.ConfChange{Type: pb.ConfChangeAddNode, NodeID: peer.ID, Context: peer.Context}
|
||||
data, err := cc.Marshal()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ents[i] = pb.Entry{Type: pb.EntryConfChange, Term: 1, Index: uint64(i + 1), Data: data}
|
||||
}
|
||||
rn.raft.raftLog.append(ents...)
|
||||
|
||||
// Now apply them, mainly so that the application can call Campaign
|
||||
// immediately after StartNode in tests. Note that these nodes will
|
||||
// be added to raft twice: here and when the application's Ready
|
||||
// loop calls ApplyConfChange. The calls to addNode must come after
|
||||
// all calls to raftLog.append so progress.next is set after these
|
||||
// bootstrapping entries (it is an error if we try to append these
|
||||
// entries since they have already been committed).
|
||||
// We do not set raftLog.applied so the application will be able
|
||||
// to observe all conf changes via Ready.CommittedEntries.
|
||||
//
|
||||
// TODO(bdarnell): These entries are still unstable; do we need to preserve
|
||||
// the invariant that committed < unstable?
|
||||
rn.raft.raftLog.committed = uint64(len(ents))
|
||||
for _, peer := range peers {
|
||||
rn.raft.applyConfChange(pb.ConfChange{NodeID: peer.ID, Type: pb.ConfChangeAddNode}.AsV2())
|
||||
}
|
||||
return nil
|
||||
}
|
425
vendor/go.etcd.io/etcd/raft/confchange/confchange.go
generated
vendored
Normal file
425
vendor/go.etcd.io/etcd/raft/confchange/confchange.go
generated
vendored
Normal file
@ -0,0 +1,425 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package confchange
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"go.etcd.io/etcd/raft/quorum"
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
"go.etcd.io/etcd/raft/tracker"
|
||||
)
|
||||
|
||||
// Changer facilitates configuration changes. It exposes methods to handle
|
||||
// simple and joint consensus while performing the proper validation that allows
|
||||
// refusing invalid configuration changes before they affect the active
|
||||
// configuration.
|
||||
type Changer struct {
|
||||
Tracker tracker.ProgressTracker
|
||||
LastIndex uint64
|
||||
}
|
||||
|
||||
// EnterJoint verifies that the outgoing (=right) majority config of the joint
|
||||
// config is empty and initializes it with a copy of the incoming (=left)
|
||||
// majority config. That is, it transitions from
|
||||
//
|
||||
// (1 2 3)&&()
|
||||
// to
|
||||
// (1 2 3)&&(1 2 3).
|
||||
//
|
||||
// The supplied changes are then applied to the incoming majority config,
|
||||
// resulting in a joint configuration that in terms of the Raft thesis[1]
|
||||
// (Section 4.3) corresponds to `C_{new,old}`.
|
||||
//
|
||||
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
|
||||
func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
|
||||
cfg, prs, err := c.checkAndCopy()
|
||||
if err != nil {
|
||||
return c.err(err)
|
||||
}
|
||||
if joint(cfg) {
|
||||
err := errors.New("config is already joint")
|
||||
return c.err(err)
|
||||
}
|
||||
if len(incoming(cfg.Voters)) == 0 {
|
||||
// We allow adding nodes to an empty config for convenience (testing and
|
||||
// bootstrap), but you can't enter a joint state.
|
||||
err := errors.New("can't make a zero-voter config joint")
|
||||
return c.err(err)
|
||||
}
|
||||
// Clear the outgoing config.
|
||||
*outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{}
|
||||
// Copy incoming to outgoing.
|
||||
for id := range incoming(cfg.Voters) {
|
||||
outgoing(cfg.Voters)[id] = struct{}{}
|
||||
}
|
||||
|
||||
if err := c.apply(&cfg, prs, ccs...); err != nil {
|
||||
return c.err(err)
|
||||
}
|
||||
cfg.AutoLeave = autoLeave
|
||||
return checkAndReturn(cfg, prs)
|
||||
}
|
||||
|
||||
// LeaveJoint transitions out of a joint configuration. It is an error to call
|
||||
// this method if the configuration is not joint, i.e. if the outgoing majority
|
||||
// config Voters[1] is empty.
|
||||
//
|
||||
// The outgoing majority config of the joint configuration will be removed,
|
||||
// that is, the incoming config is promoted as the sole decision maker. In the
|
||||
// notation of the Raft thesis[1] (Section 4.3), this method transitions from
|
||||
// `C_{new,old}` into `C_new`.
|
||||
//
|
||||
// At the same time, any staged learners (LearnersNext) the addition of which
|
||||
// was held back by an overlapping voter in the former outgoing config will be
|
||||
// inserted into Learners.
|
||||
//
|
||||
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
|
||||
func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) {
|
||||
cfg, prs, err := c.checkAndCopy()
|
||||
if err != nil {
|
||||
return c.err(err)
|
||||
}
|
||||
if !joint(cfg) {
|
||||
err := errors.New("can't leave a non-joint config")
|
||||
return c.err(err)
|
||||
}
|
||||
if len(outgoing(cfg.Voters)) == 0 {
|
||||
err := fmt.Errorf("configuration is not joint: %v", cfg)
|
||||
return c.err(err)
|
||||
}
|
||||
for id := range cfg.LearnersNext {
|
||||
nilAwareAdd(&cfg.Learners, id)
|
||||
prs[id].IsLearner = true
|
||||
}
|
||||
cfg.LearnersNext = nil
|
||||
|
||||
for id := range outgoing(cfg.Voters) {
|
||||
_, isVoter := incoming(cfg.Voters)[id]
|
||||
_, isLearner := cfg.Learners[id]
|
||||
|
||||
if !isVoter && !isLearner {
|
||||
delete(prs, id)
|
||||
}
|
||||
}
|
||||
*outgoingPtr(&cfg.Voters) = nil
|
||||
cfg.AutoLeave = false
|
||||
|
||||
return checkAndReturn(cfg, prs)
|
||||
}
|
||||
|
||||
// Simple carries out a series of configuration changes that (in aggregate)
|
||||
// mutates the incoming majority config Voters[0] by at most one. This method
|
||||
// will return an error if that is not the case, if the resulting quorum is
|
||||
// zero, or if the configuration is in a joint state (i.e. if there is an
|
||||
// outgoing configuration).
|
||||
func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
|
||||
cfg, prs, err := c.checkAndCopy()
|
||||
if err != nil {
|
||||
return c.err(err)
|
||||
}
|
||||
if joint(cfg) {
|
||||
err := errors.New("can't apply simple config change in joint config")
|
||||
return c.err(err)
|
||||
}
|
||||
if err := c.apply(&cfg, prs, ccs...); err != nil {
|
||||
return c.err(err)
|
||||
}
|
||||
if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 {
|
||||
return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config")
|
||||
}
|
||||
if err := checkInvariants(cfg, prs); err != nil {
|
||||
return tracker.Config{}, tracker.ProgressMap{}, nil
|
||||
}
|
||||
|
||||
return checkAndReturn(cfg, prs)
|
||||
}
|
||||
|
||||
// apply a change to the configuration. By convention, changes to voters are
|
||||
// always made to the incoming majority config Voters[0]. Voters[1] is either
|
||||
// empty or preserves the outgoing majority configuration while in a joint state.
|
||||
func (c Changer) apply(cfg *tracker.Config, prs tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error {
|
||||
for _, cc := range ccs {
|
||||
if cc.NodeID == 0 {
|
||||
// etcd replaces the NodeID with zero if it decides (downstream of
|
||||
// raft) to not apply a change, so we have to have explicit code
|
||||
// here to ignore these.
|
||||
continue
|
||||
}
|
||||
switch cc.Type {
|
||||
case pb.ConfChangeAddNode:
|
||||
c.makeVoter(cfg, prs, cc.NodeID)
|
||||
case pb.ConfChangeAddLearnerNode:
|
||||
c.makeLearner(cfg, prs, cc.NodeID)
|
||||
case pb.ConfChangeRemoveNode:
|
||||
c.remove(cfg, prs, cc.NodeID)
|
||||
case pb.ConfChangeUpdateNode:
|
||||
default:
|
||||
return fmt.Errorf("unexpected conf type %d", cc.Type)
|
||||
}
|
||||
}
|
||||
if len(incoming(cfg.Voters)) == 0 {
|
||||
return errors.New("removed all voters")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// makeVoter adds or promotes the given ID to be a voter in the incoming
|
||||
// majority config.
|
||||
func (c Changer) makeVoter(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
|
||||
pr := prs[id]
|
||||
if pr == nil {
|
||||
c.initProgress(cfg, prs, id, false /* isLearner */)
|
||||
return
|
||||
}
|
||||
|
||||
pr.IsLearner = false
|
||||
nilAwareDelete(&cfg.Learners, id)
|
||||
nilAwareDelete(&cfg.LearnersNext, id)
|
||||
incoming(cfg.Voters)[id] = struct{}{}
|
||||
return
|
||||
}
|
||||
|
||||
// makeLearner makes the given ID a learner or stages it to be a learner once
|
||||
// an active joint configuration is exited.
|
||||
//
|
||||
// The former happens when the peer is not a part of the outgoing config, in
|
||||
// which case we either add a new learner or demote a voter in the incoming
|
||||
// config.
|
||||
//
|
||||
// The latter case occurs when the configuration is joint and the peer is a
|
||||
// voter in the outgoing config. In that case, we do not want to add the peer
|
||||
// as a learner because then we'd have to track a peer as a voter and learner
|
||||
// simultaneously. Instead, we add the learner to LearnersNext, so that it will
|
||||
// be added to Learners the moment the outgoing config is removed by
|
||||
// LeaveJoint().
|
||||
func (c Changer) makeLearner(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
|
||||
pr := prs[id]
|
||||
if pr == nil {
|
||||
c.initProgress(cfg, prs, id, true /* isLearner */)
|
||||
return
|
||||
}
|
||||
if pr.IsLearner {
|
||||
return
|
||||
}
|
||||
// Remove any existing voter in the incoming config...
|
||||
c.remove(cfg, prs, id)
|
||||
// ... but save the Progress.
|
||||
prs[id] = pr
|
||||
// Use LearnersNext if we can't add the learner to Learners directly, i.e.
|
||||
// if the peer is still tracked as a voter in the outgoing config. It will
|
||||
// be turned into a learner in LeaveJoint().
|
||||
//
|
||||
// Otherwise, add a regular learner right away.
|
||||
if _, onRight := outgoing(cfg.Voters)[id]; onRight {
|
||||
nilAwareAdd(&cfg.LearnersNext, id)
|
||||
} else {
|
||||
pr.IsLearner = true
|
||||
nilAwareAdd(&cfg.Learners, id)
|
||||
}
|
||||
}
|
||||
|
||||
// remove this peer as a voter or learner from the incoming config.
|
||||
func (c Changer) remove(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
|
||||
if _, ok := prs[id]; !ok {
|
||||
return
|
||||
}
|
||||
|
||||
delete(incoming(cfg.Voters), id)
|
||||
nilAwareDelete(&cfg.Learners, id)
|
||||
nilAwareDelete(&cfg.LearnersNext, id)
|
||||
|
||||
// If the peer is still a voter in the outgoing config, keep the Progress.
|
||||
if _, onRight := outgoing(cfg.Voters)[id]; !onRight {
|
||||
delete(prs, id)
|
||||
}
|
||||
}
|
||||
|
||||
// initProgress initializes a new progress for the given node or learner.
|
||||
func (c Changer) initProgress(cfg *tracker.Config, prs tracker.ProgressMap, id uint64, isLearner bool) {
|
||||
if !isLearner {
|
||||
incoming(cfg.Voters)[id] = struct{}{}
|
||||
} else {
|
||||
nilAwareAdd(&cfg.Learners, id)
|
||||
}
|
||||
prs[id] = &tracker.Progress{
|
||||
// Initializing the Progress with the last index means that the follower
|
||||
// can be probed (with the last index).
|
||||
//
|
||||
// TODO(tbg): seems awfully optimistic. Using the first index would be
|
||||
// better. The general expectation here is that the follower has no log
|
||||
// at all (and will thus likely need a snapshot), though the app may
|
||||
// have applied a snapshot out of band before adding the replica (thus
|
||||
// making the first index the better choice).
|
||||
Next: c.LastIndex,
|
||||
Match: 0,
|
||||
Inflights: tracker.NewInflights(c.Tracker.MaxInflight),
|
||||
IsLearner: isLearner,
|
||||
// When a node is first added, we should mark it as recently active.
|
||||
// Otherwise, CheckQuorum may cause us to step down if it is invoked
|
||||
// before the added node has had a chance to communicate with us.
|
||||
RecentActive: true,
|
||||
}
|
||||
}
|
||||
|
||||
// checkInvariants makes sure that the config and progress are compatible with
|
||||
// each other. This is used to check both what the Changer is initialized with,
|
||||
// as well as what it returns.
|
||||
func checkInvariants(cfg tracker.Config, prs tracker.ProgressMap) error {
|
||||
// NB: intentionally allow the empty config. In production we'll never see a
|
||||
// non-empty config (we prevent it from being created) but we will need to
|
||||
// be able to *create* an initial config, for example during bootstrap (or
|
||||
// during tests). Instead of having to hand-code this, we allow
|
||||
// transitioning from an empty config into any other legal and non-empty
|
||||
// config.
|
||||
for _, ids := range []map[uint64]struct{}{
|
||||
cfg.Voters.IDs(),
|
||||
cfg.Learners,
|
||||
cfg.LearnersNext,
|
||||
} {
|
||||
for id := range ids {
|
||||
if _, ok := prs[id]; !ok {
|
||||
return fmt.Errorf("no progress for %d", id)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Any staged learner was staged because it could not be directly added due
|
||||
// to a conflicting voter in the outgoing config.
|
||||
for id := range cfg.LearnersNext {
|
||||
if _, ok := outgoing(cfg.Voters)[id]; !ok {
|
||||
return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id)
|
||||
}
|
||||
if prs[id].IsLearner {
|
||||
return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id)
|
||||
}
|
||||
}
|
||||
// Conversely Learners and Voters doesn't intersect at all.
|
||||
for id := range cfg.Learners {
|
||||
if _, ok := outgoing(cfg.Voters)[id]; ok {
|
||||
return fmt.Errorf("%d is in Learners and Voters[1]", id)
|
||||
}
|
||||
if _, ok := incoming(cfg.Voters)[id]; ok {
|
||||
return fmt.Errorf("%d is in Learners and Voters[0]", id)
|
||||
}
|
||||
if !prs[id].IsLearner {
|
||||
return fmt.Errorf("%d is in Learners, but is not marked as learner", id)
|
||||
}
|
||||
}
|
||||
|
||||
if !joint(cfg) {
|
||||
// We enforce that empty maps are nil instead of zero.
|
||||
if outgoing(cfg.Voters) != nil {
|
||||
return fmt.Errorf("Voters[1] must be nil when not joint")
|
||||
}
|
||||
if cfg.LearnersNext != nil {
|
||||
return fmt.Errorf("LearnersNext must be nil when not joint")
|
||||
}
|
||||
if cfg.AutoLeave {
|
||||
return fmt.Errorf("AutoLeave must be false when not joint")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// checkAndCopy copies the tracker's config and progress map (deeply enough for
|
||||
// the purposes of the Changer) and returns those copies. It returns an error
|
||||
// if checkInvariants does.
|
||||
func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) {
|
||||
cfg := c.Tracker.Config.Clone()
|
||||
prs := tracker.ProgressMap{}
|
||||
|
||||
for id, pr := range c.Tracker.Progress {
|
||||
// A shallow copy is enough because we only mutate the Learner field.
|
||||
ppr := *pr
|
||||
prs[id] = &ppr
|
||||
}
|
||||
return checkAndReturn(cfg, prs)
|
||||
}
|
||||
|
||||
// checkAndReturn calls checkInvariants on the input and returns either the
|
||||
// resulting error or the input.
|
||||
func checkAndReturn(cfg tracker.Config, prs tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) {
|
||||
if err := checkInvariants(cfg, prs); err != nil {
|
||||
return tracker.Config{}, tracker.ProgressMap{}, err
|
||||
}
|
||||
return cfg, prs, nil
|
||||
}
|
||||
|
||||
// err returns zero values and an error.
|
||||
func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) {
|
||||
return tracker.Config{}, nil, err
|
||||
}
|
||||
|
||||
// nilAwareAdd inserts id into the set that m points to, allocating the map
// first when it is still nil.
func nilAwareAdd(m *map[uint64]struct{}, id uint64) {
	set := *m
	if set == nil {
		set = make(map[uint64]struct{})
		*m = set
	}
	set[id] = struct{}{}
}
|
||||
|
||||
// nilAwareDelete removes id from the set that m points to. A nil map is left
// alone; a map that is empty after the removal is replaced by nil.
func nilAwareDelete(m *map[uint64]struct{}, id uint64) {
	set := *m
	if set == nil {
		return
	}
	if delete(set, id); len(set) == 0 {
		*m = nil
	}
}
|
||||
|
||||
// symdiff returns the size of the symmetric difference between the two sets
// of uint64s, i.e. len( (l - r) \union (r - l)).
func symdiff(l, r map[uint64]struct{}) int {
	// onlyIn counts the elements of a that are absent from b.
	onlyIn := func(a, b map[uint64]struct{}) int {
		count := 0
		for id := range a {
			if _, shared := b[id]; !shared {
				count++
			}
		}
		return count
	}
	return onlyIn(l, r) + onlyIn(r, l)
}
|
||||
|
||||
func joint(cfg tracker.Config) bool {
|
||||
return len(outgoing(cfg.Voters)) > 0
|
||||
}
|
||||
|
||||
func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] }
|
||||
func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] }
|
||||
func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] }
|
||||
|
||||
// Describe prints the type and NodeID of the configuration changes as a
|
||||
// space-delimited string.
|
||||
func Describe(ccs ...pb.ConfChangeSingle) string {
|
||||
var buf strings.Builder
|
||||
for _, cc := range ccs {
|
||||
if buf.Len() > 0 {
|
||||
buf.WriteByte(' ')
|
||||
}
|
||||
fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID)
|
||||
}
|
||||
return buf.String()
|
||||
}
|
155
vendor/go.etcd.io/etcd/raft/confchange/restore.go
generated
vendored
Normal file
155
vendor/go.etcd.io/etcd/raft/confchange/restore.go
generated
vendored
Normal file
@ -0,0 +1,155 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package confchange
|
||||
|
||||
import (
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
"go.etcd.io/etcd/raft/tracker"
|
||||
)
|
||||
|
||||
// toConfChangeSingle translates a conf state into 1) a slice of operations creating
|
||||
// first the config that will become the outgoing one, and then the incoming one, and
|
||||
// b) another slice that, when applied to the config resulted from 1), represents the
|
||||
// ConfState.
|
||||
func toConfChangeSingle(cs pb.ConfState) (out []pb.ConfChangeSingle, in []pb.ConfChangeSingle) {
|
||||
// Example to follow along this code:
|
||||
// voters=(1 2 3) learners=(5) outgoing=(1 2 4 6) learners_next=(4)
|
||||
//
|
||||
// This means that before entering the joint config, the configuration
|
||||
// had voters (1 2 4) and perhaps some learners that are already gone.
|
||||
// The new set of voters is (1 2 3), i.e. (1 2) were kept around, and (4 6)
|
||||
// are no longer voters; however 4 is poised to become a learner upon leaving
|
||||
// the joint state.
|
||||
// We can't tell whether 5 was a learner before entering the joint config,
|
||||
// but it doesn't matter (we'll pretend that it wasn't).
|
||||
//
|
||||
// The code below will construct
|
||||
// outgoing = add 1; add 2; add 4; add 6
|
||||
// incoming = remove 1; remove 2; remove 4; remove 6
|
||||
// add 1; add 2; add 3;
|
||||
// add-learner 5;
|
||||
// add-learner 4;
|
||||
//
|
||||
// So, when starting with an empty config, after applying 'outgoing' we have
|
||||
//
|
||||
// quorum=(1 2 4 6)
|
||||
//
|
||||
// From which we enter a joint state via 'incoming'
|
||||
//
|
||||
// quorum=(1 2 3)&&(1 2 4 6) learners=(5) learners_next=(4)
|
||||
//
|
||||
// as desired.
|
||||
|
||||
for _, id := range cs.VotersOutgoing {
|
||||
// If there are outgoing voters, first add them one by one so that the
|
||||
// (non-joint) config has them all.
|
||||
out = append(out, pb.ConfChangeSingle{
|
||||
Type: pb.ConfChangeAddNode,
|
||||
NodeID: id,
|
||||
})
|
||||
|
||||
}
|
||||
|
||||
// We're done constructing the outgoing slice, now on to the incoming one
|
||||
// (which will apply on top of the config created by the outgoing slice).
|
||||
|
||||
// First, we'll remove all of the outgoing voters.
|
||||
for _, id := range cs.VotersOutgoing {
|
||||
in = append(in, pb.ConfChangeSingle{
|
||||
Type: pb.ConfChangeRemoveNode,
|
||||
NodeID: id,
|
||||
})
|
||||
}
|
||||
// Then we'll add the incoming voters and learners.
|
||||
for _, id := range cs.Voters {
|
||||
in = append(in, pb.ConfChangeSingle{
|
||||
Type: pb.ConfChangeAddNode,
|
||||
NodeID: id,
|
||||
})
|
||||
}
|
||||
for _, id := range cs.Learners {
|
||||
in = append(in, pb.ConfChangeSingle{
|
||||
Type: pb.ConfChangeAddLearnerNode,
|
||||
NodeID: id,
|
||||
})
|
||||
}
|
||||
// Same for LearnersNext; these are nodes we want to be learners but which
|
||||
// are currently voters in the outgoing config.
|
||||
for _, id := range cs.LearnersNext {
|
||||
in = append(in, pb.ConfChangeSingle{
|
||||
Type: pb.ConfChangeAddLearnerNode,
|
||||
NodeID: id,
|
||||
})
|
||||
}
|
||||
return out, in
|
||||
}
|
||||
|
||||
func chain(chg Changer, ops ...func(Changer) (tracker.Config, tracker.ProgressMap, error)) (tracker.Config, tracker.ProgressMap, error) {
|
||||
for _, op := range ops {
|
||||
cfg, prs, err := op(chg)
|
||||
if err != nil {
|
||||
return tracker.Config{}, nil, err
|
||||
}
|
||||
chg.Tracker.Config = cfg
|
||||
chg.Tracker.Progress = prs
|
||||
}
|
||||
return chg.Tracker.Config, chg.Tracker.Progress, nil
|
||||
}
|
||||
|
||||
// Restore takes a Changer (which must represent an empty configuration), and
|
||||
// runs a sequence of changes enacting the configuration described in the
|
||||
// ConfState.
|
||||
//
|
||||
// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure
|
||||
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
|
||||
// this can just take LastIndex and MaxInflight directly instead and cook up
|
||||
// the results from that alone.
|
||||
func Restore(chg Changer, cs pb.ConfState) (tracker.Config, tracker.ProgressMap, error) {
|
||||
outgoing, incoming := toConfChangeSingle(cs)
|
||||
|
||||
var ops []func(Changer) (tracker.Config, tracker.ProgressMap, error)
|
||||
|
||||
if len(outgoing) == 0 {
|
||||
// No outgoing config, so just apply the incoming changes one by one.
|
||||
for _, cc := range incoming {
|
||||
cc := cc // loop-local copy
|
||||
ops = append(ops, func(chg Changer) (tracker.Config, tracker.ProgressMap, error) {
|
||||
return chg.Simple(cc)
|
||||
})
|
||||
}
|
||||
} else {
|
||||
// The ConfState describes a joint configuration.
|
||||
//
|
||||
// First, apply all of the changes of the outgoing config one by one, so
|
||||
// that it temporarily becomes the incoming active config. For example,
|
||||
// if the config is (1 2 3)&(2 3 4), this will establish (2 3 4)&().
|
||||
for _, cc := range outgoing {
|
||||
cc := cc // loop-local copy
|
||||
ops = append(ops, func(chg Changer) (tracker.Config, tracker.ProgressMap, error) {
|
||||
return chg.Simple(cc)
|
||||
})
|
||||
}
|
||||
// Now enter the joint state, which rotates the above additions into the
|
||||
// outgoing config, and adds the incoming config in. Continuing the
|
||||
// example above, we'd get (1 2 3)&(2 3 4), i.e. the incoming operations
|
||||
// would be removing 2,3,4 and then adding in 1,2,3 while transitioning
|
||||
// into a joint state.
|
||||
ops = append(ops, func(chg Changer) (tracker.Config, tracker.ProgressMap, error) {
|
||||
return chg.EnterJoint(cs.AutoLeave, incoming...)
|
||||
})
|
||||
}
|
||||
|
||||
return chain(chg, ops...)
|
||||
}
|
300
vendor/go.etcd.io/etcd/raft/doc.go
generated
vendored
Normal file
300
vendor/go.etcd.io/etcd/raft/doc.go
generated
vendored
Normal file
@ -0,0 +1,300 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
/*
|
||||
Package raft sends and receives messages in the Protocol Buffer format
|
||||
defined in the raftpb package.
|
||||
|
||||
Raft is a protocol with which a cluster of nodes can maintain a replicated state machine.
|
||||
The state machine is kept in sync through the use of a replicated log.
|
||||
For more details on Raft, see "In Search of an Understandable Consensus Algorithm"
|
||||
(https://raft.github.io/raft.pdf) by Diego Ongaro and John Ousterhout.
|
||||
|
||||
A simple example application, _raftexample_, is also available to help illustrate
|
||||
how to use this package in practice:
|
||||
https://github.com/etcd-io/etcd/tree/master/contrib/raftexample
|
||||
|
||||
Usage
|
||||
|
||||
The primary object in raft is a Node. You either start a Node from scratch
|
||||
using raft.StartNode or start a Node from some initial state using raft.RestartNode.
|
||||
|
||||
To start a node from scratch:
|
||||
|
||||
storage := raft.NewMemoryStorage()
|
||||
c := &Config{
|
||||
ID: 0x01,
|
||||
ElectionTick: 10,
|
||||
HeartbeatTick: 1,
|
||||
Storage: storage,
|
||||
MaxSizePerMsg: 4096,
|
||||
MaxInflightMsgs: 256,
|
||||
}
|
||||
n := raft.StartNode(c, []raft.Peer{{ID: 0x02}, {ID: 0x03}})
|
||||
|
||||
To restart a node from previous state:
|
||||
|
||||
storage := raft.NewMemoryStorage()
|
||||
|
||||
// recover the in-memory storage from persistent
|
||||
// snapshot, state and entries.
|
||||
storage.ApplySnapshot(snapshot)
|
||||
storage.SetHardState(state)
|
||||
storage.Append(entries)
|
||||
|
||||
c := &Config{
|
||||
ID: 0x01,
|
||||
ElectionTick: 10,
|
||||
HeartbeatTick: 1,
|
||||
Storage: storage,
|
||||
MaxSizePerMsg: 4096,
|
||||
MaxInflightMsgs: 256,
|
||||
}
|
||||
|
||||
// restart raft without peer information.
|
||||
// peer information is already included in the storage.
|
||||
n := raft.RestartNode(c)
|
||||
|
||||
Now that you are holding onto a Node you have a few responsibilities:
|
||||
|
||||
First, you must read from the Node.Ready() channel and process the updates
|
||||
it contains. These steps may be performed in parallel, except as noted in step
|
||||
2.
|
||||
|
||||
1. Write HardState, Entries, and Snapshot to persistent storage if they are
|
||||
not empty. Note that when writing an Entry with Index i, any
|
||||
previously-persisted entries with Index >= i must be discarded.
|
||||
|
||||
2. Send all Messages to the nodes named in the To field. It is important that
|
||||
no messages be sent until the latest HardState has been persisted to disk,
|
||||
and all Entries written by any previous Ready batch (Messages may be sent while
|
||||
entries from the same batch are being persisted). To reduce the I/O latency, an
|
||||
optimization can be applied to make leader write to disk in parallel with its
|
||||
followers (as explained at section 10.2.1 in Raft thesis). If any Message has type
|
||||
MsgSnap, call Node.ReportSnapshot() after it has been sent (these messages may be
|
||||
large).
|
||||
|
||||
Note: Marshalling messages is not thread-safe; it is important that you
|
||||
make sure that no new entries are persisted while marshalling.
|
||||
The easiest way to achieve this is to serialize the messages directly inside
|
||||
your main raft loop.
|
||||
|
||||
3. Apply Snapshot (if any) and CommittedEntries to the state machine.
|
||||
If any committed Entry has Type EntryConfChange, call Node.ApplyConfChange()
|
||||
to apply it to the node. The configuration change may be cancelled at this point
|
||||
by setting the NodeID field to zero before calling ApplyConfChange
|
||||
(but ApplyConfChange must be called one way or the other, and the decision to cancel
|
||||
must be based solely on the state machine and not external information such as
|
||||
the observed health of the node).
|
||||
|
||||
4. Call Node.Advance() to signal readiness for the next batch of updates.
|
||||
This may be done at any time after step 1, although all updates must be processed
|
||||
in the order they were returned by Ready.
|
||||
|
||||
Second, all persisted log entries must be made available via an
|
||||
implementation of the Storage interface. The provided MemoryStorage
|
||||
type can be used for this (if you repopulate its state upon a
|
||||
restart), or you can supply your own disk-backed implementation.
|
||||
|
||||
Third, when you receive a message from another node, pass it to Node.Step:
|
||||
|
||||
func recvRaftRPC(ctx context.Context, m raftpb.Message) {
|
||||
n.Step(ctx, m)
|
||||
}
|
||||
|
||||
Finally, you need to call Node.Tick() at regular intervals (probably
|
||||
via a time.Ticker). Raft has two important timeouts: heartbeat and the
|
||||
election timeout. However, internally to the raft package time is
|
||||
represented by an abstract "tick".
|
||||
|
||||
The total state machine handling loop will look something like this:
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-s.Ticker:
|
||||
n.Tick()
|
||||
case rd := <-s.Node.Ready():
|
||||
saveToStorage(rd.State, rd.Entries, rd.Snapshot)
|
||||
send(rd.Messages)
|
||||
if !raft.IsEmptySnap(rd.Snapshot) {
|
||||
processSnapshot(rd.Snapshot)
|
||||
}
|
||||
for _, entry := range rd.CommittedEntries {
|
||||
process(entry)
|
||||
if entry.Type == raftpb.EntryConfChange {
|
||||
var cc raftpb.ConfChange
|
||||
cc.Unmarshal(entry.Data)
|
||||
s.Node.ApplyConfChange(cc)
|
||||
}
|
||||
}
|
||||
s.Node.Advance()
|
||||
case <-s.done:
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
To propose changes to the state machine from your node take your application
|
||||
data, serialize it into a byte slice and call:
|
||||
|
||||
n.Propose(ctx, data)
|
||||
|
||||
If the proposal is committed, data will appear in committed entries with type
|
||||
raftpb.EntryNormal. There is no guarantee that a proposed command will be
|
||||
committed; you may have to re-propose after a timeout.
|
||||
|
||||
To add or remove a node in a cluster, build ConfChange struct 'cc' and call:
|
||||
|
||||
n.ProposeConfChange(ctx, cc)
|
||||
|
||||
After config change is committed, some committed entry with type
|
||||
raftpb.EntryConfChange will be returned. You must apply it to node through:
|
||||
|
||||
var cc raftpb.ConfChange
|
||||
cc.Unmarshal(data)
|
||||
n.ApplyConfChange(cc)
|
||||
|
||||
Note: An ID represents a unique node in a cluster for all time. A
|
||||
given ID MUST be used only once even if the old node has been removed.
|
||||
This means that for example IP addresses make poor node IDs since they
|
||||
may be reused. Node IDs must be non-zero.
|
||||
|
||||
Implementation notes
|
||||
|
||||
This implementation is up to date with the final Raft thesis
|
||||
(https://github.com/ongardie/dissertation/blob/master/stanford.pdf), although our
|
||||
implementation of the membership change protocol differs somewhat from
|
||||
that described in chapter 4. The key invariant that membership changes
|
||||
happen one node at a time is preserved, but in our implementation the
|
||||
membership change takes effect when its entry is applied, not when it
|
||||
is added to the log (so the entry is committed under the old
|
||||
membership instead of the new). This is equivalent in terms of safety,
|
||||
since the old and new configurations are guaranteed to overlap.
|
||||
|
||||
To ensure that we do not attempt to commit two membership changes at
|
||||
once by matching log positions (which would be unsafe since they
|
||||
should have different quorum requirements), we simply disallow any
|
||||
proposed membership change while any uncommitted change appears in
|
||||
the leader's log.
|
||||
|
||||
This approach introduces a problem when you try to remove a member
|
||||
from a two-member cluster: If one of the members dies before the
|
||||
other one receives the commit of the confchange entry, then the member
|
||||
cannot be removed any more since the cluster cannot make progress.
|
||||
For this reason it is highly recommended to use three or more nodes in
|
||||
every cluster.
|
||||
|
||||
MessageType
|
||||
|
||||
Package raft sends and receives messages in Protocol Buffer format (defined
|
||||
in raftpb package). Each state (follower, candidate, leader) implements its
|
||||
own 'step' method ('stepFollower', 'stepCandidate', 'stepLeader') when
|
||||
advancing with the given raftpb.Message. Each step is determined by its
|
||||
raftpb.MessageType. Note that every step is checked by one common method
|
||||
'Step' that safety-checks the terms of node and incoming message to prevent
|
||||
stale log entries:
|
||||
|
||||
'MsgHup' is used for election. If a node is a follower or candidate, the
|
||||
'tick' function in 'raft' struct is set as 'tickElection'. If a follower or
|
||||
candidate has not received any heartbeat before the election timeout, it
|
||||
passes 'MsgHup' to its Step method and becomes (or remains) a candidate to
|
||||
start a new election.
|
||||
|
||||
'MsgBeat' is an internal type that signals the leader to send a heartbeat of
|
||||
the 'MsgHeartbeat' type. If a node is a leader, the 'tick' function in
|
||||
the 'raft' struct is set as 'tickHeartbeat', and triggers the leader to
|
||||
send periodic 'MsgHeartbeat' messages to its followers.
|
||||
|
||||
'MsgProp' proposes to append data to its log entries. This is a special
|
||||
type to redirect proposals to leader. Therefore, send method overwrites
|
||||
raftpb.Message's term with its HardState's term to avoid attaching its
|
||||
local term to 'MsgProp'. When 'MsgProp' is passed to the leader's 'Step'
|
||||
method, the leader first calls the 'appendEntry' method to append entries
|
||||
to its log, and then calls 'bcastAppend' method to send those entries to
|
||||
its peers. When passed to candidate, 'MsgProp' is dropped. When passed to
|
||||
follower, 'MsgProp' is stored in follower's mailbox(msgs) by the send
|
||||
method. It is stored with sender's ID and later forwarded to leader by
|
||||
rafthttp package.
|
||||
|
||||
'MsgApp' contains log entries to replicate. A leader calls bcastAppend,
|
||||
which calls sendAppend, which sends soon-to-be-replicated logs in 'MsgApp'
|
||||
type. When 'MsgApp' is passed to candidate's Step method, candidate reverts
|
||||
back to follower, because it indicates that there is a valid leader sending
|
||||
'MsgApp' messages. Candidate and follower respond to this message in
|
||||
'MsgAppResp' type.
|
||||
|
||||
'MsgAppResp' is response to log replication request('MsgApp'). When
|
||||
'MsgApp' is passed to candidate or follower's Step method, it responds by
|
||||
calling 'handleAppendEntries' method, which sends 'MsgAppResp' to raft
|
||||
mailbox.
|
||||
|
||||
'MsgVote' requests votes for election. When a node is a follower or
|
||||
candidate and 'MsgHup' is passed to its Step method, then the node calls
|
||||
'campaign' method to campaign itself to become a leader. Once 'campaign'
|
||||
method is called, the node becomes candidate and sends 'MsgVote' to peers
|
||||
in cluster to request votes. When passed to leader or candidate's Step
|
||||
method and the message's Term is lower than leader's or candidate's,
|
||||
'MsgVote' will be rejected ('MsgVoteResp' is returned with Reject true).
|
||||
If leader or candidate receives 'MsgVote' with higher term, it will revert
|
||||
back to follower. When 'MsgVote' is passed to follower, it votes for the
|
||||
sender only when sender's last log term is greater than follower's last log
|
||||
term, or the two terms are equal and sender's last log index is greater than
|
||||
or equal to follower's last log index.
|
||||
|
||||
'MsgVoteResp' contains responses from voting request. When 'MsgVoteResp' is
|
||||
passed to candidate, the candidate calculates how many votes it has won. If
|
||||
it's more than majority (quorum), it becomes leader and calls 'bcastAppend'.
|
||||
If the candidate receives denials from a majority of nodes, it reverts back to
|
||||
follower.
|
||||
|
||||
'MsgPreVote' and 'MsgPreVoteResp' are used in an optional two-phase election
|
||||
protocol. When Config.PreVote is true, a pre-election is carried out first
|
||||
(using the same rules as a regular election), and no node increases its term
|
||||
number unless the pre-election indicates that the campaigning node would win.
|
||||
This minimizes disruption when a partitioned node rejoins the cluster.
|
||||
|
||||
'MsgSnap' requests to install a snapshot message. When a node has just
|
||||
become a leader or the leader receives 'MsgProp' message, it calls
|
||||
'bcastAppend' method, which then calls 'sendAppend' method to each
|
||||
follower. In 'sendAppend', if a leader fails to get term or entries,
|
||||
the leader requests snapshot by sending 'MsgSnap' type message.
|
||||
|
||||
'MsgSnapStatus' tells the result of snapshot install message. When a
|
||||
follower rejected 'MsgSnap', it indicates the snapshot request with
|
||||
'MsgSnap' had failed from network issues which causes the network layer
|
||||
to fail to send out snapshots to its followers. Then leader considers
|
||||
follower's progress as probe. When 'MsgSnap' was not rejected, it
|
||||
indicates that the snapshot succeeded and the leader sets follower's
|
||||
progress to probe and resumes its log replication.
|
||||
|
||||
'MsgHeartbeat' sends heartbeat from leader. When 'MsgHeartbeat' is passed
|
||||
to candidate and message's term is higher than candidate's, the candidate
|
||||
reverts back to follower and updates its committed index from the one in
|
||||
this heartbeat. And it sends the message to its mailbox. When
|
||||
'MsgHeartbeat' is passed to follower's Step method and message's term is
|
||||
higher than follower's, the follower updates its leaderID with the ID
|
||||
from the message.
|
||||
|
||||
'MsgHeartbeatResp' is a response to 'MsgHeartbeat'. When 'MsgHeartbeatResp'
|
||||
is passed to leader's Step method, the leader knows which follower
|
||||
responded. And only when the leader's last committed index is greater than
|
||||
follower's Match index, the leader runs 'sendAppend' method.
|
||||
|
||||
'MsgUnreachable' tells that request(message) wasn't delivered. When
|
||||
'MsgUnreachable' is passed to leader's Step method, the leader discovers
|
||||
that the follower that sent this 'MsgUnreachable' is not reachable, often
|
||||
indicating 'MsgApp' is lost. When follower's progress state is replicate,
|
||||
the leader sets it back to probe.
|
||||
|
||||
*/
|
||||
package raft
|
372
vendor/go.etcd.io/etcd/raft/log.go
generated
vendored
Normal file
372
vendor/go.etcd.io/etcd/raft/log.go
generated
vendored
Normal file
@ -0,0 +1,372 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
)
|
||||
|
||||
type raftLog struct {
|
||||
// storage contains all stable entries since the last snapshot.
|
||||
storage Storage
|
||||
|
||||
// unstable contains all unstable entries and snapshot.
|
||||
// they will be saved into storage.
|
||||
unstable unstable
|
||||
|
||||
// committed is the highest log position that is known to be in
|
||||
// stable storage on a quorum of nodes.
|
||||
committed uint64
|
||||
// applied is the highest log position that the application has
|
||||
// been instructed to apply to its state machine.
|
||||
// Invariant: applied <= committed
|
||||
applied uint64
|
||||
|
||||
logger Logger
|
||||
|
||||
// maxNextEntsSize is the maximum number aggregate byte size of the messages
|
||||
// returned from calls to nextEnts.
|
||||
maxNextEntsSize uint64
|
||||
}
|
||||
|
||||
// newLog returns log using the given storage and default options. It
|
||||
// recovers the log to the state that it just commits and applies the
|
||||
// latest snapshot.
|
||||
func newLog(storage Storage, logger Logger) *raftLog {
|
||||
return newLogWithSize(storage, logger, noLimit)
|
||||
}
|
||||
|
||||
// newLogWithSize returns a log using the given storage and max
|
||||
// message size.
|
||||
func newLogWithSize(storage Storage, logger Logger, maxNextEntsSize uint64) *raftLog {
|
||||
if storage == nil {
|
||||
log.Panic("storage must not be nil")
|
||||
}
|
||||
log := &raftLog{
|
||||
storage: storage,
|
||||
logger: logger,
|
||||
maxNextEntsSize: maxNextEntsSize,
|
||||
}
|
||||
firstIndex, err := storage.FirstIndex()
|
||||
if err != nil {
|
||||
panic(err) // TODO(bdarnell)
|
||||
}
|
||||
lastIndex, err := storage.LastIndex()
|
||||
if err != nil {
|
||||
panic(err) // TODO(bdarnell)
|
||||
}
|
||||
log.unstable.offset = lastIndex + 1
|
||||
log.unstable.logger = logger
|
||||
// Initialize our committed and applied pointers to the time of the last compaction.
|
||||
log.committed = firstIndex - 1
|
||||
log.applied = firstIndex - 1
|
||||
|
||||
return log
|
||||
}
|
||||
|
||||
func (l *raftLog) String() string {
|
||||
return fmt.Sprintf("committed=%d, applied=%d, unstable.offset=%d, len(unstable.Entries)=%d", l.committed, l.applied, l.unstable.offset, len(l.unstable.entries))
|
||||
}
|
||||
|
||||
// maybeAppend returns (0, false) if the entries cannot be appended. Otherwise,
|
||||
// it returns (last index of new entries, true).
|
||||
func (l *raftLog) maybeAppend(index, logTerm, committed uint64, ents ...pb.Entry) (lastnewi uint64, ok bool) {
|
||||
if l.matchTerm(index, logTerm) {
|
||||
lastnewi = index + uint64(len(ents))
|
||||
ci := l.findConflict(ents)
|
||||
switch {
|
||||
case ci == 0:
|
||||
case ci <= l.committed:
|
||||
l.logger.Panicf("entry %d conflict with committed entry [committed(%d)]", ci, l.committed)
|
||||
default:
|
||||
offset := index + 1
|
||||
l.append(ents[ci-offset:]...)
|
||||
}
|
||||
l.commitTo(min(committed, lastnewi))
|
||||
return lastnewi, true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func (l *raftLog) append(ents ...pb.Entry) uint64 {
|
||||
if len(ents) == 0 {
|
||||
return l.lastIndex()
|
||||
}
|
||||
if after := ents[0].Index - 1; after < l.committed {
|
||||
l.logger.Panicf("after(%d) is out of range [committed(%d)]", after, l.committed)
|
||||
}
|
||||
l.unstable.truncateAndAppend(ents)
|
||||
return l.lastIndex()
|
||||
}
|
||||
|
||||
// findConflict finds the index of the conflict.
|
||||
// It returns the first pair of conflicting entries between the existing
|
||||
// entries and the given entries, if there are any.
|
||||
// If there is no conflicting entries, and the existing entries contains
|
||||
// all the given entries, zero will be returned.
|
||||
// If there is no conflicting entries, but the given entries contains new
|
||||
// entries, the index of the first new entry will be returned.
|
||||
// An entry is considered to be conflicting if it has the same index but
|
||||
// a different term.
|
||||
// The first entry MUST have an index equal to the argument 'from'.
|
||||
// The index of the given entries MUST be continuously increasing.
|
||||
func (l *raftLog) findConflict(ents []pb.Entry) uint64 {
|
||||
for _, ne := range ents {
|
||||
if !l.matchTerm(ne.Index, ne.Term) {
|
||||
if ne.Index <= l.lastIndex() {
|
||||
l.logger.Infof("found conflict at index %d [existing term: %d, conflicting term: %d]",
|
||||
ne.Index, l.zeroTermOnErrCompacted(l.term(ne.Index)), ne.Term)
|
||||
}
|
||||
return ne.Index
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func (l *raftLog) unstableEntries() []pb.Entry {
|
||||
if len(l.unstable.entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
return l.unstable.entries
|
||||
}
|
||||
|
||||
// nextEnts returns all the available entries for execution.
|
||||
// If applied is smaller than the index of snapshot, it returns all committed
|
||||
// entries after the index of snapshot.
|
||||
func (l *raftLog) nextEnts() (ents []pb.Entry) {
|
||||
off := max(l.applied+1, l.firstIndex())
|
||||
if l.committed+1 > off {
|
||||
ents, err := l.slice(off, l.committed+1, l.maxNextEntsSize)
|
||||
if err != nil {
|
||||
l.logger.Panicf("unexpected error when getting unapplied entries (%v)", err)
|
||||
}
|
||||
return ents
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// hasNextEnts returns if there is any available entries for execution. This
|
||||
// is a fast check without heavy raftLog.slice() in raftLog.nextEnts().
|
||||
func (l *raftLog) hasNextEnts() bool {
|
||||
off := max(l.applied+1, l.firstIndex())
|
||||
return l.committed+1 > off
|
||||
}
|
||||
|
||||
func (l *raftLog) snapshot() (pb.Snapshot, error) {
|
||||
if l.unstable.snapshot != nil {
|
||||
return *l.unstable.snapshot, nil
|
||||
}
|
||||
return l.storage.Snapshot()
|
||||
}
|
||||
|
||||
func (l *raftLog) firstIndex() uint64 {
|
||||
if i, ok := l.unstable.maybeFirstIndex(); ok {
|
||||
return i
|
||||
}
|
||||
index, err := l.storage.FirstIndex()
|
||||
if err != nil {
|
||||
panic(err) // TODO(bdarnell)
|
||||
}
|
||||
return index
|
||||
}
|
||||
|
||||
func (l *raftLog) lastIndex() uint64 {
|
||||
if i, ok := l.unstable.maybeLastIndex(); ok {
|
||||
return i
|
||||
}
|
||||
i, err := l.storage.LastIndex()
|
||||
if err != nil {
|
||||
panic(err) // TODO(bdarnell)
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
func (l *raftLog) commitTo(tocommit uint64) {
|
||||
// never decrease commit
|
||||
if l.committed < tocommit {
|
||||
if l.lastIndex() < tocommit {
|
||||
l.logger.Panicf("tocommit(%d) is out of range [lastIndex(%d)]. Was the raft log corrupted, truncated, or lost?", tocommit, l.lastIndex())
|
||||
}
|
||||
l.committed = tocommit
|
||||
}
|
||||
}
|
||||
|
||||
func (l *raftLog) appliedTo(i uint64) {
|
||||
if i == 0 {
|
||||
return
|
||||
}
|
||||
if l.committed < i || i < l.applied {
|
||||
l.logger.Panicf("applied(%d) is out of range [prevApplied(%d), committed(%d)]", i, l.applied, l.committed)
|
||||
}
|
||||
l.applied = i
|
||||
}
|
||||
|
||||
// stableTo forwards to the unstable log to mark entries up to index i with
// term t as persisted.
func (l *raftLog) stableTo(i, t uint64) {
	l.unstable.stableTo(i, t)
}
|
||||
|
||||
// stableSnapTo forwards to the unstable log to mark the snapshot at index i
// as persisted.
func (l *raftLog) stableSnapTo(i uint64) {
	l.unstable.stableSnapTo(i)
}
|
||||
|
||||
func (l *raftLog) lastTerm() uint64 {
|
||||
t, err := l.term(l.lastIndex())
|
||||
if err != nil {
|
||||
l.logger.Panicf("unexpected error when getting the last term (%v)", err)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
func (l *raftLog) term(i uint64) (uint64, error) {
|
||||
// the valid term range is [index of dummy entry, last index]
|
||||
dummyIndex := l.firstIndex() - 1
|
||||
if i < dummyIndex || i > l.lastIndex() {
|
||||
// TODO: return an error instead?
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
if t, ok := l.unstable.maybeTerm(i); ok {
|
||||
return t, nil
|
||||
}
|
||||
|
||||
t, err := l.storage.Term(i)
|
||||
if err == nil {
|
||||
return t, nil
|
||||
}
|
||||
if err == ErrCompacted || err == ErrUnavailable {
|
||||
return 0, err
|
||||
}
|
||||
panic(err) // TODO(bdarnell)
|
||||
}
|
||||
|
||||
func (l *raftLog) entries(i, maxsize uint64) ([]pb.Entry, error) {
|
||||
if i > l.lastIndex() {
|
||||
return nil, nil
|
||||
}
|
||||
return l.slice(i, l.lastIndex()+1, maxsize)
|
||||
}
|
||||
|
||||
// allEntries returns all entries in the log.
|
||||
func (l *raftLog) allEntries() []pb.Entry {
|
||||
ents, err := l.entries(l.firstIndex(), noLimit)
|
||||
if err == nil {
|
||||
return ents
|
||||
}
|
||||
if err == ErrCompacted { // try again if there was a racing compaction
|
||||
return l.allEntries()
|
||||
}
|
||||
// TODO (xiangli): handle error?
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// isUpToDate determines if the given (lastIndex,term) log is more up-to-date
|
||||
// by comparing the index and term of the last entries in the existing logs.
|
||||
// If the logs have last entries with different terms, then the log with the
|
||||
// later term is more up-to-date. If the logs end with the same term, then
|
||||
// whichever log has the larger lastIndex is more up-to-date. If the logs are
|
||||
// the same, the given log is up-to-date.
|
||||
func (l *raftLog) isUpToDate(lasti, term uint64) bool {
|
||||
return term > l.lastTerm() || (term == l.lastTerm() && lasti >= l.lastIndex())
|
||||
}
|
||||
|
||||
func (l *raftLog) matchTerm(i, term uint64) bool {
|
||||
t, err := l.term(i)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return t == term
|
||||
}
|
||||
|
||||
func (l *raftLog) maybeCommit(maxIndex, term uint64) bool {
|
||||
if maxIndex > l.committed && l.zeroTermOnErrCompacted(l.term(maxIndex)) == term {
|
||||
l.commitTo(maxIndex)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (l *raftLog) restore(s pb.Snapshot) {
|
||||
l.logger.Infof("log [%s] starts to restore snapshot [index: %d, term: %d]", l, s.Metadata.Index, s.Metadata.Term)
|
||||
l.committed = s.Metadata.Index
|
||||
l.unstable.restore(s)
|
||||
}
|
||||
|
||||
// slice returns a slice of log entries from lo through hi-1, inclusive.
|
||||
func (l *raftLog) slice(lo, hi, maxSize uint64) ([]pb.Entry, error) {
|
||||
err := l.mustCheckOutOfBounds(lo, hi)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if lo == hi {
|
||||
return nil, nil
|
||||
}
|
||||
var ents []pb.Entry
|
||||
if lo < l.unstable.offset {
|
||||
storedEnts, err := l.storage.Entries(lo, min(hi, l.unstable.offset), maxSize)
|
||||
if err == ErrCompacted {
|
||||
return nil, err
|
||||
} else if err == ErrUnavailable {
|
||||
l.logger.Panicf("entries[%d:%d) is unavailable from storage", lo, min(hi, l.unstable.offset))
|
||||
} else if err != nil {
|
||||
panic(err) // TODO(bdarnell)
|
||||
}
|
||||
|
||||
// check if ents has reached the size limitation
|
||||
if uint64(len(storedEnts)) < min(hi, l.unstable.offset)-lo {
|
||||
return storedEnts, nil
|
||||
}
|
||||
|
||||
ents = storedEnts
|
||||
}
|
||||
if hi > l.unstable.offset {
|
||||
unstable := l.unstable.slice(max(lo, l.unstable.offset), hi)
|
||||
if len(ents) > 0 {
|
||||
combined := make([]pb.Entry, len(ents)+len(unstable))
|
||||
n := copy(combined, ents)
|
||||
copy(combined[n:], unstable)
|
||||
ents = combined
|
||||
} else {
|
||||
ents = unstable
|
||||
}
|
||||
}
|
||||
return limitSize(ents, maxSize), nil
|
||||
}
|
||||
|
||||
// l.firstIndex <= lo <= hi <= l.firstIndex + len(l.entries)
|
||||
func (l *raftLog) mustCheckOutOfBounds(lo, hi uint64) error {
|
||||
if lo > hi {
|
||||
l.logger.Panicf("invalid slice %d > %d", lo, hi)
|
||||
}
|
||||
fi := l.firstIndex()
|
||||
if lo < fi {
|
||||
return ErrCompacted
|
||||
}
|
||||
|
||||
length := l.lastIndex() + 1 - fi
|
||||
if lo < fi || hi > fi+length {
|
||||
l.logger.Panicf("slice[%d,%d) out of bound [%d,%d]", lo, hi, fi, l.lastIndex())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *raftLog) zeroTermOnErrCompacted(t uint64, err error) uint64 {
|
||||
if err == nil {
|
||||
return t
|
||||
}
|
||||
if err == ErrCompacted {
|
||||
return 0
|
||||
}
|
||||
l.logger.Panicf("unexpected error (%v)", err)
|
||||
return 0
|
||||
}
|
157
vendor/go.etcd.io/etcd/raft/log_unstable.go
generated
vendored
Normal file
157
vendor/go.etcd.io/etcd/raft/log_unstable.go
generated
vendored
Normal file
@ -0,0 +1,157 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import pb "go.etcd.io/etcd/raft/raftpb"
|
||||
|
||||
// unstable.entries[i] has raft log position i+unstable.offset.
|
||||
// Note that unstable.offset may be less than the highest log
|
||||
// position in storage; this means that the next write to storage
|
||||
// might need to truncate the log before persisting unstable.entries.
|
||||
type unstable struct {
|
||||
// the incoming unstable snapshot, if any.
|
||||
snapshot *pb.Snapshot
|
||||
// all entries that have not yet been written to storage.
|
||||
entries []pb.Entry
|
||||
offset uint64
|
||||
|
||||
logger Logger
|
||||
}
|
||||
|
||||
// maybeFirstIndex returns the index of the first possible entry in entries
|
||||
// if it has a snapshot.
|
||||
func (u *unstable) maybeFirstIndex() (uint64, bool) {
|
||||
if u.snapshot != nil {
|
||||
return u.snapshot.Metadata.Index + 1, true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// maybeLastIndex returns the last index if it has at least one
|
||||
// unstable entry or snapshot.
|
||||
func (u *unstable) maybeLastIndex() (uint64, bool) {
|
||||
if l := len(u.entries); l != 0 {
|
||||
return u.offset + uint64(l) - 1, true
|
||||
}
|
||||
if u.snapshot != nil {
|
||||
return u.snapshot.Metadata.Index, true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// maybeTerm returns the term of the entry at index i, if there
|
||||
// is any.
|
||||
func (u *unstable) maybeTerm(i uint64) (uint64, bool) {
|
||||
if i < u.offset {
|
||||
if u.snapshot != nil && u.snapshot.Metadata.Index == i {
|
||||
return u.snapshot.Metadata.Term, true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
last, ok := u.maybeLastIndex()
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
if i > last {
|
||||
return 0, false
|
||||
}
|
||||
|
||||
return u.entries[i-u.offset].Term, true
|
||||
}
|
||||
|
||||
func (u *unstable) stableTo(i, t uint64) {
|
||||
gt, ok := u.maybeTerm(i)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
// if i < offset, term is matched with the snapshot
|
||||
// only update the unstable entries if term is matched with
|
||||
// an unstable entry.
|
||||
if gt == t && i >= u.offset {
|
||||
u.entries = u.entries[i+1-u.offset:]
|
||||
u.offset = i + 1
|
||||
u.shrinkEntriesArray()
|
||||
}
|
||||
}
|
||||
|
||||
// shrinkEntriesArray discards the underlying array used by the entries slice
|
||||
// if most of it isn't being used. This avoids holding references to a bunch of
|
||||
// potentially large entries that aren't needed anymore. Simply clearing the
|
||||
// entries wouldn't be safe because clients might still be using them.
|
||||
func (u *unstable) shrinkEntriesArray() {
|
||||
// We replace the array if we're using less than half of the space in
|
||||
// it. This number is fairly arbitrary, chosen as an attempt to balance
|
||||
// memory usage vs number of allocations. It could probably be improved
|
||||
// with some focused tuning.
|
||||
const lenMultiple = 2
|
||||
if len(u.entries) == 0 {
|
||||
u.entries = nil
|
||||
} else if len(u.entries)*lenMultiple < cap(u.entries) {
|
||||
newEntries := make([]pb.Entry, len(u.entries))
|
||||
copy(newEntries, u.entries)
|
||||
u.entries = newEntries
|
||||
}
|
||||
}
|
||||
|
||||
func (u *unstable) stableSnapTo(i uint64) {
|
||||
if u.snapshot != nil && u.snapshot.Metadata.Index == i {
|
||||
u.snapshot = nil
|
||||
}
|
||||
}
|
||||
|
||||
func (u *unstable) restore(s pb.Snapshot) {
|
||||
u.offset = s.Metadata.Index + 1
|
||||
u.entries = nil
|
||||
u.snapshot = &s
|
||||
}
|
||||
|
||||
func (u *unstable) truncateAndAppend(ents []pb.Entry) {
|
||||
after := ents[0].Index
|
||||
switch {
|
||||
case after == u.offset+uint64(len(u.entries)):
|
||||
// after is the next index in the u.entries
|
||||
// directly append
|
||||
u.entries = append(u.entries, ents...)
|
||||
case after <= u.offset:
|
||||
u.logger.Infof("replace the unstable entries from index %d", after)
|
||||
// The log is being truncated to before our current offset
|
||||
// portion, so set the offset and replace the entries
|
||||
u.offset = after
|
||||
u.entries = ents
|
||||
default:
|
||||
// truncate to after and copy to u.entries
|
||||
// then append
|
||||
u.logger.Infof("truncate the unstable entries before index %d", after)
|
||||
u.entries = append([]pb.Entry{}, u.slice(u.offset, after)...)
|
||||
u.entries = append(u.entries, ents...)
|
||||
}
|
||||
}
|
||||
|
||||
func (u *unstable) slice(lo uint64, hi uint64) []pb.Entry {
|
||||
u.mustCheckOutOfBounds(lo, hi)
|
||||
return u.entries[lo-u.offset : hi-u.offset]
|
||||
}
|
||||
|
||||
// u.offset <= lo <= hi <= u.offset+len(u.entries)
|
||||
func (u *unstable) mustCheckOutOfBounds(lo, hi uint64) {
|
||||
if lo > hi {
|
||||
u.logger.Panicf("invalid unstable.slice %d > %d", lo, hi)
|
||||
}
|
||||
upper := u.offset + uint64(len(u.entries))
|
||||
if lo < u.offset || hi > upper {
|
||||
u.logger.Panicf("unstable.slice[%d,%d) out of bound [%d,%d]", lo, hi, u.offset, upper)
|
||||
}
|
||||
}
|
132
vendor/go.etcd.io/etcd/raft/logger.go
generated
vendored
Normal file
132
vendor/go.etcd.io/etcd/raft/logger.go
generated
vendored
Normal file
@ -0,0 +1,132 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Logger is the logging interface used by the raft package. Each level
// comes as a Print-style method and a Printf-style method.
type Logger interface {
	// Debug level.
	Debug(v ...interface{})
	Debugf(format string, v ...interface{})

	// Error level.
	Error(v ...interface{})
	Errorf(format string, v ...interface{})

	// Info level.
	Info(v ...interface{})
	Infof(format string, v ...interface{})

	// Warning level.
	Warning(v ...interface{})
	Warningf(format string, v ...interface{})

	// Fatal level.
	Fatal(v ...interface{})
	Fatalf(format string, v ...interface{})

	// Panic level.
	Panic(v ...interface{})
	Panicf(format string, v ...interface{})
}
|
||||
|
||||
func SetLogger(l Logger) {
|
||||
raftLoggerMu.Lock()
|
||||
raftLogger = l
|
||||
raftLoggerMu.Unlock()
|
||||
}
|
||||
|
||||
var (
|
||||
defaultLogger = &DefaultLogger{Logger: log.New(os.Stderr, "raft", log.LstdFlags)}
|
||||
discardLogger = &DefaultLogger{Logger: log.New(ioutil.Discard, "", 0)}
|
||||
raftLoggerMu sync.Mutex
|
||||
raftLogger = Logger(defaultLogger)
|
||||
)
|
||||
|
||||
// calldepth is the number of stack frames log.Logger.Output skips so that
// file/line information points at the caller of the DefaultLogger method.
const calldepth = 2

// DefaultLogger is a default implementation of the Logger interface.
type DefaultLogger struct {
	*log.Logger
	debug bool
}

// EnableTimestamps adds the date and time flags to the underlying logger.
func (l *DefaultLogger) EnableTimestamps() {
	withStamps := l.Flags() | log.Ldate | log.Ltime
	l.SetFlags(withStamps)
}

// EnableDebug turns on output for Debug and Debugf.
func (l *DefaultLogger) EnableDebug() {
	l.debug = true
}

// emit writes msg with the given level header. It adds one frame to
// calldepth to account for itself, so reported positions still refer to the
// caller of the exported method.
func (l *DefaultLogger) emit(lvl, msg string) {
	l.Output(calldepth+1, header(lvl, msg))
}

// Debug logs v at DEBUG level; it is a no-op unless debug is enabled.
func (l *DefaultLogger) Debug(v ...interface{}) {
	if !l.debug {
		return
	}
	l.emit("DEBUG", fmt.Sprint(v...))
}

// Debugf logs a formatted message at DEBUG level; it is a no-op unless debug
// is enabled.
func (l *DefaultLogger) Debugf(format string, v ...interface{}) {
	if !l.debug {
		return
	}
	l.emit("DEBUG", fmt.Sprintf(format, v...))
}

// Info logs v at INFO level.
func (l *DefaultLogger) Info(v ...interface{}) {
	l.emit("INFO", fmt.Sprint(v...))
}

// Infof logs a formatted message at INFO level.
func (l *DefaultLogger) Infof(format string, v ...interface{}) {
	l.emit("INFO", fmt.Sprintf(format, v...))
}

// Error logs v at ERROR level.
func (l *DefaultLogger) Error(v ...interface{}) {
	l.emit("ERROR", fmt.Sprint(v...))
}

// Errorf logs a formatted message at ERROR level.
func (l *DefaultLogger) Errorf(format string, v ...interface{}) {
	l.emit("ERROR", fmt.Sprintf(format, v...))
}

// Warning logs v at WARN level.
func (l *DefaultLogger) Warning(v ...interface{}) {
	l.emit("WARN", fmt.Sprint(v...))
}

// Warningf logs a formatted message at WARN level.
func (l *DefaultLogger) Warningf(format string, v ...interface{}) {
	l.emit("WARN", fmt.Sprintf(format, v...))
}

// Fatal logs v at FATAL level and then exits the process with status 1.
func (l *DefaultLogger) Fatal(v ...interface{}) {
	l.emit("FATAL", fmt.Sprint(v...))
	os.Exit(1)
}

// Fatalf logs a formatted message at FATAL level and then exits the process
// with status 1.
func (l *DefaultLogger) Fatalf(format string, v ...interface{}) {
	l.emit("FATAL", fmt.Sprintf(format, v...))
	os.Exit(1)
}

// Panic delegates to the embedded log.Logger's Panic.
func (l *DefaultLogger) Panic(v ...interface{}) {
	l.Logger.Panic(v...)
}

// Panicf delegates to the embedded log.Logger's Panicf.
func (l *DefaultLogger) Panicf(format string, v ...interface{}) {
	l.Logger.Panicf(format, v...)
}

// header prefixes msg with its level, separated by ": ".
func header(lvl, msg string) string {
	return lvl + ": " + msg
}
|
584
vendor/go.etcd.io/etcd/raft/node.go
generated
vendored
Normal file
584
vendor/go.etcd.io/etcd/raft/node.go
generated
vendored
Normal file
@ -0,0 +1,584 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
)
|
||||
|
||||
// SnapshotStatus is the outcome of sending a snapshot, as passed to
// Node.ReportSnapshot.
type SnapshotStatus int

const (
	// SnapshotFinish reports that the snapshot was handled successfully.
	SnapshotFinish SnapshotStatus = 1
	// SnapshotFailure reports that sending or applying the snapshot failed.
	SnapshotFailure SnapshotStatus = 2
)
|
||||
|
||||
var (
	// emptyState is the zero HardState; IsEmptyHardState compares against it.
	emptyState = pb.HardState{}

	// ErrStopped is returned by methods on Nodes that have been stopped.
	ErrStopped = errors.New("raft: stopped")
)
|
||||
|
||||
// SoftState provides state that is useful for logging and debugging.
|
||||
// The state is volatile and does not need to be persisted to the WAL.
|
||||
type SoftState struct {
|
||||
Lead uint64 // must use atomic operations to access; keep 64-bit aligned.
|
||||
RaftState StateType
|
||||
}
|
||||
|
||||
func (a *SoftState) equal(b *SoftState) bool {
|
||||
return a.Lead == b.Lead && a.RaftState == b.RaftState
|
||||
}
|
||||
|
||||
// Ready encapsulates the entries and messages that are ready to read,
// be saved to stable storage, committed or sent to other peers.
// All fields in Ready are read-only.
type Ready struct {
	// The current volatile state of a Node.
	// SoftState will be nil if there is no update.
	// It is not required to consume or store SoftState.
	*SoftState

	// The current state of a Node to be saved to stable storage BEFORE
	// Messages are sent.
	// HardState will be equal to empty state if there is no update.
	pb.HardState

	// ReadStates can be used by the node to serve linearizable read requests
	// locally once its applied index is greater than the index in the
	// ReadState.
	// Note that a ReadState is returned when raft receives msgReadIndex.
	// The returned ReadState is only valid for the request that asked for it.
	ReadStates []ReadState

	// Entries specifies entries to be saved to stable storage BEFORE
	// Messages are sent.
	Entries []pb.Entry

	// Snapshot specifies the snapshot to be saved to stable storage.
	Snapshot pb.Snapshot

	// CommittedEntries specifies entries to be committed to a
	// store/state-machine. These have previously been committed to stable
	// store.
	CommittedEntries []pb.Entry

	// Messages specifies outbound messages to be sent AFTER Entries are
	// committed to stable storage.
	// If it contains a MsgSnap message, the application MUST report back to raft
	// when the snapshot has been received or has failed by calling ReportSnapshot.
	Messages []pb.Message

	// MustSync indicates whether the HardState and Entries must be synchronously
	// written to disk or if an asynchronous write is permissible.
	MustSync bool
}
|
||||
|
||||
func isHardStateEqual(a, b pb.HardState) bool {
|
||||
return a.Term == b.Term && a.Vote == b.Vote && a.Commit == b.Commit
|
||||
}
|
||||
|
||||
// IsEmptyHardState returns true if the given HardState is empty.
|
||||
func IsEmptyHardState(st pb.HardState) bool {
|
||||
return isHardStateEqual(st, emptyState)
|
||||
}
|
||||
|
||||
// IsEmptySnap returns true if the given Snapshot is empty.
|
||||
func IsEmptySnap(sp pb.Snapshot) bool {
|
||||
return sp.Metadata.Index == 0
|
||||
}
|
||||
|
||||
func (rd Ready) containsUpdates() bool {
|
||||
return rd.SoftState != nil || !IsEmptyHardState(rd.HardState) ||
|
||||
!IsEmptySnap(rd.Snapshot) || len(rd.Entries) > 0 ||
|
||||
len(rd.CommittedEntries) > 0 || len(rd.Messages) > 0 || len(rd.ReadStates) != 0
|
||||
}
|
||||
|
||||
// appliedCursor extracts from the Ready the highest index the client has
|
||||
// applied (once the Ready is confirmed via Advance). If no information is
|
||||
// contained in the Ready, returns zero.
|
||||
func (rd Ready) appliedCursor() uint64 {
|
||||
if n := len(rd.CommittedEntries); n > 0 {
|
||||
return rd.CommittedEntries[n-1].Index
|
||||
}
|
||||
if index := rd.Snapshot.Metadata.Index; index > 0 {
|
||||
return index
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// Node represents a node in a raft cluster.
type Node interface {
	// Tick increments the internal logical clock for the Node by a single tick. Election
	// timeouts and heartbeat timeouts are in units of ticks.
	Tick()
	// Campaign causes the Node to transition to candidate state and start campaigning to become leader.
	Campaign(ctx context.Context) error
	// Propose proposes that data be appended to the log. Note that proposals can be lost without
	// notice, therefore it is the user's job to ensure proposal retries.
	Propose(ctx context.Context, data []byte) error
	// ProposeConfChange proposes a configuration change. Like any proposal, the
	// configuration change may be dropped with or without an error being
	// returned. In particular, configuration changes are dropped unless the
	// leader has certainty that there is no prior unapplied configuration
	// change in its log.
	//
	// The method accepts either a pb.ConfChange (deprecated) or pb.ConfChangeV2
	// message. The latter allows arbitrary configuration changes via joint
	// consensus, notably including replacing a voter. Passing a ConfChangeV2
	// message is only allowed if all Nodes participating in the cluster run a
	// version of this library aware of the V2 API. See pb.ConfChangeV2 for
	// usage details and semantics.
	ProposeConfChange(ctx context.Context, cc pb.ConfChangeI) error

	// Step advances the state machine using the given message. ctx.Err() will be returned, if any.
	Step(ctx context.Context, msg pb.Message) error

	// Ready returns a channel that returns the current point-in-time state.
	// Users of the Node must call Advance after retrieving the state returned by Ready.
	//
	// NOTE: No committed entries from the next Ready may be applied until all committed entries
	// and snapshots from the previous one have finished.
	Ready() <-chan Ready

	// Advance notifies the Node that the application has saved progress up to the last Ready.
	// It prepares the node to return the next available Ready.
	//
	// The application should generally call Advance after it applies the entries in the last Ready.
	//
	// However, as an optimization, the application may call Advance while it is applying the
	// commands. For example, when the last Ready contains a snapshot, the application might take
	// a long time to apply the snapshot data. To continue receiving Ready without blocking raft
	// progress, it can call Advance before finishing applying the last Ready.
	Advance()
	// ApplyConfChange applies a config change (previously passed to
	// ProposeConfChange) to the node. This must be called whenever a config
	// change is observed in Ready.CommittedEntries.
	//
	// Returns an opaque non-nil ConfState protobuf which must be recorded in
	// snapshots.
	ApplyConfChange(cc pb.ConfChangeI) *pb.ConfState

	// TransferLeadership attempts to transfer leadership to the given transferee.
	TransferLeadership(ctx context.Context, lead, transferee uint64)

	// ReadIndex requests a read state. The read state will be set in the Ready.
	// A read state has a read index. Once the application advances further than the read
	// index, any linearizable read requests issued before the read request can be
	// processed safely. The read state will have the same rctx attached.
	ReadIndex(ctx context.Context, rctx []byte) error

	// Status returns the current status of the raft state machine.
	Status() Status
	// ReportUnreachable reports the given node is not reachable for the last send.
	ReportUnreachable(id uint64)
	// ReportSnapshot reports the status of the sent snapshot. The id is the raft ID of the follower
	// who is meant to receive the snapshot, and the status is SnapshotFinish or SnapshotFailure.
	// Calling ReportSnapshot with SnapshotFinish is a no-op. But any failure in applying a
	// snapshot (e.g., while streaming it from leader to follower) should be reported to the
	// leader with SnapshotFailure. When the leader sends a snapshot to a follower, it pauses any raft
	// log probes until the follower can apply the snapshot and advance its state. If the follower
	// can't do that, e.g., due to a crash, it could end up in limbo, never getting any
	// updates from the leader. Therefore, it is crucial that the application ensures that any
	// failure in snapshot sending is caught and reported back to the leader, so it can resume raft
	// log probing in the follower.
	ReportSnapshot(id uint64, status SnapshotStatus)
	// Stop performs any necessary termination of the Node.
	Stop()
}
|
||||
|
||||
// Peer describes a cluster member handed to StartNode.
// NOTE(review): Context presumably carries opaque application data attached
// to the peer's initial configuration change; confirm against Bootstrap.
type Peer struct {
	ID      uint64
	Context []byte
}
|
||||
|
||||
// StartNode returns a new Node given configuration and a list of raft peers.
|
||||
// It appends a ConfChangeAddNode entry for each given peer to the initial log.
|
||||
//
|
||||
// Peers must not be zero length; call RestartNode in that case.
|
||||
func StartNode(c *Config, peers []Peer) Node {
|
||||
if len(peers) == 0 {
|
||||
panic("no peers given; use RestartNode instead")
|
||||
}
|
||||
rn, err := NewRawNode(c)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
rn.Bootstrap(peers)
|
||||
|
||||
n := newNode(rn)
|
||||
|
||||
go n.run()
|
||||
return &n
|
||||
}
|
||||
|
||||
// RestartNode is similar to StartNode but does not take a list of peers.
|
||||
// The current membership of the cluster will be restored from the Storage.
|
||||
// If the caller has an existing state machine, pass in the last log index that
|
||||
// has been applied to it; otherwise use zero.
|
||||
func RestartNode(c *Config) Node {
|
||||
rn, err := NewRawNode(c)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
n := newNode(rn)
|
||||
go n.run()
|
||||
return &n
|
||||
}
|
||||
|
||||
// msgWithResult pairs a proposal with an optional channel on which run()
// reports the outcome of stepping it. A nil result channel means the sender
// does not wait for the outcome.
type msgWithResult struct {
	m      pb.Message
	result chan error
}
|
||||
|
||||
// node is the canonical implementation of the Node interface. All channels
// are serviced by the run() goroutine, which owns the underlying RawNode.
type node struct {
	propc      chan msgWithResult   // proposals (MsgProp)
	recvc      chan pb.Message      // all other messages
	confc      chan pb.ConfChangeV2 // config changes to apply
	confstatec chan pb.ConfState    // result of an applied config change
	readyc     chan Ready           // Ready values handed to the application
	advancec   chan struct{}        // Advance notifications
	tickc      chan struct{}        // clock ticks (buffered, see newNode)
	done       chan struct{}        // closed by run() when the node has stopped
	stop       chan struct{}        // stop requests from Stop
	status     chan chan Status     // status requests; the reply goes to the sent channel

	rn *RawNode
}
|
||||
|
||||
func newNode(rn *RawNode) node {
|
||||
return node{
|
||||
propc: make(chan msgWithResult),
|
||||
recvc: make(chan pb.Message),
|
||||
confc: make(chan pb.ConfChangeV2),
|
||||
confstatec: make(chan pb.ConfState),
|
||||
readyc: make(chan Ready),
|
||||
advancec: make(chan struct{}),
|
||||
// make tickc a buffered chan, so raft node can buffer some ticks when the node
|
||||
// is busy processing raft messages. Raft node will resume process buffered
|
||||
// ticks when it becomes idle.
|
||||
tickc: make(chan struct{}, 128),
|
||||
done: make(chan struct{}),
|
||||
stop: make(chan struct{}),
|
||||
status: make(chan chan Status),
|
||||
rn: rn,
|
||||
}
|
||||
}
|
||||
|
||||
func (n *node) Stop() {
|
||||
select {
|
||||
case n.stop <- struct{}{}:
|
||||
// Not already stopped, so trigger it
|
||||
case <-n.done:
|
||||
// Node has already been stopped - no need to do anything
|
||||
return
|
||||
}
|
||||
// Block until the stop has been acknowledged by run()
|
||||
<-n.done
|
||||
}
|
||||
|
||||
// run is the node's event loop. It is the only goroutine that touches the
// underlying raft state machine: it serves proposals, incoming messages,
// config changes, ticks, Ready/Advance handshakes and status requests until a
// stop request arrives, at which point it closes n.done and returns.
//
// The local propc, readyc and advancec variables are nil whenever the
// corresponding operation must not happen; a nil channel never becomes ready
// in a select, which disables that case.
func (n *node) run() {
	var propc chan msgWithResult
	var readyc chan Ready
	var advancec chan struct{}
	var rd Ready

	r := n.rn.raft

	lead := None

	for {
		if advancec != nil {
			// A Ready has been handed out and not yet been advanced; do not
			// offer another one.
			readyc = nil
		} else if n.rn.HasReady() {
			// Populate a Ready. Note that this Ready is not guaranteed to
			// actually be handled. We will arm readyc, but there's no guarantee
			// that we will actually send on it. It's possible that we will
			// service another channel instead, loop around, and then populate
			// the Ready again. We could instead force the previous Ready to be
			// handled first, but it's generally good to emit larger Readys plus
			// it simplifies testing (by emitting less frequently and more
			// predictably).
			rd = n.rn.readyWithoutAccept()
			readyc = n.readyc
		}

		// Track leader changes: proposals are accepted only while a leader is
		// known.
		if lead != r.lead {
			if r.hasLeader() {
				if lead == None {
					r.logger.Infof("raft.node: %x elected leader %x at term %d", r.id, r.lead, r.Term)
				} else {
					r.logger.Infof("raft.node: %x changed leader from %x to %x at term %d", r.id, lead, r.lead, r.Term)
				}
				propc = n.propc
			} else {
				r.logger.Infof("raft.node: %x lost leader %x at term %d", r.id, lead, r.Term)
				propc = nil
			}
			lead = r.lead
		}

		select {
		// TODO: maybe buffer the config propose if there exists one (the way
		// described in raft dissertation)
		// Currently it is dropped in Step silently.
		case pm := <-propc:
			m := pm.m
			m.From = r.id
			err := r.Step(m)
			// Report the outcome only to senders that asked for it.
			if pm.result != nil {
				pm.result <- err
				close(pm.result)
			}
		case m := <-n.recvc:
			// filter out response message from unknown From.
			if pr := r.prs.Progress[m.From]; pr != nil || !IsResponseMsg(m.Type) {
				r.Step(m)
			}
		case cc := <-n.confc:
			_, okBefore := r.prs.Progress[r.id]
			cs := r.applyConfChange(cc)
			// If the node was removed, block incoming proposals. Note that we
			// only do this if the node was in the config before. Nodes may be
			// a member of the group without knowing this (when they're catching
			// up on the log and don't have the latest config) and we don't want
			// to block the proposal channel in that case.
			//
			// NB: propc is reset when the leader changes, which, if we learn
			// about it, sort of implies that we got readded, maybe? This isn't
			// very sound and likely has bugs.
			if _, okAfter := r.prs.Progress[r.id]; okBefore && !okAfter {
				var found bool
				for _, sl := range [][]uint64{cs.Voters, cs.VotersOutgoing} {
					for _, id := range sl {
						if id == r.id {
							found = true
						}
					}
				}
				if !found {
					propc = nil
				}
			}
			// Hand the resulting ConfState back to ApplyConfChange unless the
			// node has been stopped in the meantime.
			select {
			case n.confstatec <- cs:
			case <-n.done:
			}
		case <-n.tickc:
			n.rn.Tick()
		case readyc <- rd:
			n.rn.acceptReady(rd)
			// Wait for the application's Advance before offering a new Ready.
			advancec = n.advancec
		case <-advancec:
			n.rn.Advance(rd)
			rd = Ready{}
			advancec = nil
		case c := <-n.status:
			c <- getStatus(r)
		case <-n.stop:
			close(n.done)
			return
		}
	}
}
|
||||
|
||||
// Tick increments the internal logical clock for this Node. Election timeouts
|
||||
// and heartbeat timeouts are in units of ticks.
|
||||
func (n *node) Tick() {
|
||||
select {
|
||||
case n.tickc <- struct{}{}:
|
||||
case <-n.done:
|
||||
default:
|
||||
n.rn.raft.logger.Warningf("%x (leader %v) A tick missed to fire. Node blocks too long!", n.rn.raft.id, n.rn.raft.id == n.rn.raft.lead)
|
||||
}
|
||||
}
|
||||
|
||||
// Campaign submits a MsgHup to the node so that it starts campaigning for
// leadership.
func (n *node) Campaign(ctx context.Context) error {
	hup := pb.Message{Type: pb.MsgHup}
	return n.step(ctx, hup)
}
|
||||
|
||||
func (n *node) Propose(ctx context.Context, data []byte) error {
|
||||
return n.stepWait(ctx, pb.Message{Type: pb.MsgProp, Entries: []pb.Entry{{Data: data}}})
|
||||
}
|
||||
|
||||
func (n *node) Step(ctx context.Context, m pb.Message) error {
|
||||
// ignore unexpected local messages receiving over network
|
||||
if IsLocalMsg(m.Type) {
|
||||
// TODO: return an error?
|
||||
return nil
|
||||
}
|
||||
return n.step(ctx, m)
|
||||
}
|
||||
|
||||
func confChangeToMsg(c pb.ConfChangeI) (pb.Message, error) {
|
||||
typ, data, err := pb.MarshalConfChange(c)
|
||||
if err != nil {
|
||||
return pb.Message{}, err
|
||||
}
|
||||
return pb.Message{Type: pb.MsgProp, Entries: []pb.Entry{{Type: typ, Data: data}}}, nil
|
||||
}
|
||||
|
||||
func (n *node) ProposeConfChange(ctx context.Context, cc pb.ConfChangeI) error {
|
||||
msg, err := confChangeToMsg(cc)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return n.Step(ctx, msg)
|
||||
}
|
||||
|
||||
func (n *node) step(ctx context.Context, m pb.Message) error {
|
||||
return n.stepWithWaitOption(ctx, m, false)
|
||||
}
|
||||
|
||||
func (n *node) stepWait(ctx context.Context, m pb.Message) error {
|
||||
return n.stepWithWaitOption(ctx, m, true)
|
||||
}
|
||||
|
||||
// Step advances the state machine using msgs. The ctx.Err() will be returned,
|
||||
// if any.
|
||||
func (n *node) stepWithWaitOption(ctx context.Context, m pb.Message, wait bool) error {
|
||||
if m.Type != pb.MsgProp {
|
||||
select {
|
||||
case n.recvc <- m:
|
||||
return nil
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-n.done:
|
||||
return ErrStopped
|
||||
}
|
||||
}
|
||||
ch := n.propc
|
||||
pm := msgWithResult{m: m}
|
||||
if wait {
|
||||
pm.result = make(chan error, 1)
|
||||
}
|
||||
select {
|
||||
case ch <- pm:
|
||||
if !wait {
|
||||
return nil
|
||||
}
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-n.done:
|
||||
return ErrStopped
|
||||
}
|
||||
select {
|
||||
case err := <-pm.result:
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-n.done:
|
||||
return ErrStopped
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Ready exposes the receive side of the channel on which the run loop
// delivers Ready values.
func (n *node) Ready() <-chan Ready {
	return n.readyc
}
|
||||
|
||||
func (n *node) Advance() {
|
||||
select {
|
||||
case n.advancec <- struct{}{}:
|
||||
case <-n.done:
|
||||
}
|
||||
}
|
||||
|
||||
func (n *node) ApplyConfChange(cc pb.ConfChangeI) *pb.ConfState {
|
||||
var cs pb.ConfState
|
||||
select {
|
||||
case n.confc <- cc.AsV2():
|
||||
case <-n.done:
|
||||
}
|
||||
select {
|
||||
case cs = <-n.confstatec:
|
||||
case <-n.done:
|
||||
}
|
||||
return &cs
|
||||
}
|
||||
|
||||
func (n *node) Status() Status {
|
||||
c := make(chan Status)
|
||||
select {
|
||||
case n.status <- c:
|
||||
return <-c
|
||||
case <-n.done:
|
||||
return Status{}
|
||||
}
|
||||
}
|
||||
|
||||
func (n *node) ReportUnreachable(id uint64) {
|
||||
select {
|
||||
case n.recvc <- pb.Message{Type: pb.MsgUnreachable, From: id}:
|
||||
case <-n.done:
|
||||
}
|
||||
}
|
||||
|
||||
func (n *node) ReportSnapshot(id uint64, status SnapshotStatus) {
|
||||
rej := status == SnapshotFailure
|
||||
|
||||
select {
|
||||
case n.recvc <- pb.Message{Type: pb.MsgSnapStatus, From: id, Reject: rej}:
|
||||
case <-n.done:
|
||||
}
|
||||
}
|
||||
|
||||
func (n *node) TransferLeadership(ctx context.Context, lead, transferee uint64) {
|
||||
select {
|
||||
// manually set 'from' and 'to', so that leader can voluntarily transfers its leadership
|
||||
case n.recvc <- pb.Message{Type: pb.MsgTransferLeader, From: transferee, To: lead}:
|
||||
case <-n.done:
|
||||
case <-ctx.Done():
|
||||
}
|
||||
}
|
||||
|
||||
func (n *node) ReadIndex(ctx context.Context, rctx []byte) error {
|
||||
return n.step(ctx, pb.Message{Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: rctx}}})
|
||||
}
|
||||
|
||||
func newReady(r *raft, prevSoftSt *SoftState, prevHardSt pb.HardState) Ready {
|
||||
rd := Ready{
|
||||
Entries: r.raftLog.unstableEntries(),
|
||||
CommittedEntries: r.raftLog.nextEnts(),
|
||||
Messages: r.msgs,
|
||||
}
|
||||
if softSt := r.softState(); !softSt.equal(prevSoftSt) {
|
||||
rd.SoftState = softSt
|
||||
}
|
||||
if hardSt := r.hardState(); !isHardStateEqual(hardSt, prevHardSt) {
|
||||
rd.HardState = hardSt
|
||||
}
|
||||
if r.raftLog.unstable.snapshot != nil {
|
||||
rd.Snapshot = *r.raftLog.unstable.snapshot
|
||||
}
|
||||
if len(r.readStates) != 0 {
|
||||
rd.ReadStates = r.readStates
|
||||
}
|
||||
rd.MustSync = MustSync(r.hardState(), prevHardSt, len(rd.Entries))
|
||||
return rd
|
||||
}
|
||||
|
||||
// MustSync returns true if the hard state and count of Raft entries indicate
|
||||
// that a synchronous write to persistent storage is required.
|
||||
func MustSync(st, prevst pb.HardState, entsnum int) bool {
|
||||
// Persistent state on all servers:
|
||||
// (Updated on stable storage before responding to RPCs)
|
||||
// currentTerm
|
||||
// votedFor
|
||||
// log entries[]
|
||||
return entsnum != 0 || st.Vote != prevst.Vote || st.Term != prevst.Term
|
||||
}
|
75
vendor/go.etcd.io/etcd/raft/quorum/joint.go
generated
vendored
Normal file
75
vendor/go.etcd.io/etcd/raft/quorum/joint.go
generated
vendored
Normal file
@ -0,0 +1,75 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package quorum
|
||||
|
||||
// JointConfig is a configuration of two groups of (possibly overlapping)
|
||||
// majority configurations. Decisions require the support of both majorities.
|
||||
type JointConfig [2]MajorityConfig
|
||||
|
||||
func (c JointConfig) String() string {
|
||||
if len(c[1]) > 0 {
|
||||
return c[0].String() + "&&" + c[1].String()
|
||||
}
|
||||
return c[0].String()
|
||||
}
|
||||
|
||||
// IDs returns a newly initialized map representing the set of voters present
|
||||
// in the joint configuration.
|
||||
func (c JointConfig) IDs() map[uint64]struct{} {
|
||||
m := map[uint64]struct{}{}
|
||||
for _, cc := range c {
|
||||
for id := range cc {
|
||||
m[id] = struct{}{}
|
||||
}
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// Describe returns a (multi-line) representation of the commit indexes for the
|
||||
// given lookuper.
|
||||
func (c JointConfig) Describe(l AckedIndexer) string {
|
||||
return MajorityConfig(c.IDs()).Describe(l)
|
||||
}
|
||||
|
||||
// CommittedIndex returns the largest committed index for the given joint
|
||||
// quorum. An index is jointly committed if it is committed in both constituent
|
||||
// majorities.
|
||||
func (c JointConfig) CommittedIndex(l AckedIndexer) Index {
|
||||
idx0 := c[0].CommittedIndex(l)
|
||||
idx1 := c[1].CommittedIndex(l)
|
||||
if idx0 < idx1 {
|
||||
return idx0
|
||||
}
|
||||
return idx1
|
||||
}
|
||||
|
||||
// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||
// a result indicating whether the vote is pending, lost, or won. A joint quorum
|
||||
// requires both majority quorums to vote in favor.
|
||||
func (c JointConfig) VoteResult(votes map[uint64]bool) VoteResult {
|
||||
r1 := c[0].VoteResult(votes)
|
||||
r2 := c[1].VoteResult(votes)
|
||||
|
||||
if r1 == r2 {
|
||||
// If they agree, return the agreed state.
|
||||
return r1
|
||||
}
|
||||
if r1 == VoteLost || r2 == VoteLost {
|
||||
// If either config has lost, loss is the only possible outcome.
|
||||
return VoteLost
|
||||
}
|
||||
// One side won, the other one is pending, so the whole outcome is.
|
||||
return VotePending
|
||||
}
|
210
vendor/go.etcd.io/etcd/raft/quorum/majority.go
generated
vendored
Normal file
210
vendor/go.etcd.io/etcd/raft/quorum/majority.go
generated
vendored
Normal file
@ -0,0 +1,210 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package quorum
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// MajorityConfig is a set of IDs that uses majority quorums to make decisions.
type MajorityConfig map[uint64]struct{}

// String renders the IDs in ascending order, separated by single spaces and
// enclosed in parentheses, e.g. "(1 2 3)". An empty config yields "()".
func (c MajorityConfig) String() string {
	ids := make([]uint64, 0, len(c))
	for id := range c {
		ids = append(ids, id)
	}
	sort.Slice(ids, func(a, b int) bool { return ids[a] < ids[b] })

	parts := make([]string, len(ids))
	for i, id := range ids {
		parts[i] = fmt.Sprint(id)
	}
	return "(" + strings.Join(parts, " ") + ")"
}
|
||||
|
||||
// Describe returns a (multi-line) representation of the commit indexes for the
|
||||
// given lookuper.
|
||||
func (c MajorityConfig) Describe(l AckedIndexer) string {
|
||||
if len(c) == 0 {
|
||||
return "<empty majority quorum>"
|
||||
}
|
||||
type tup struct {
|
||||
id uint64
|
||||
idx Index
|
||||
ok bool // idx found?
|
||||
bar int // length of bar displayed for this tup
|
||||
}
|
||||
|
||||
// Below, populate .bar so that the i-th largest commit index has bar i (we
|
||||
// plot this as sort of a progress bar). The actual code is a bit more
|
||||
// complicated and also makes sure that equal index => equal bar.
|
||||
|
||||
n := len(c)
|
||||
info := make([]tup, 0, n)
|
||||
for id := range c {
|
||||
idx, ok := l.AckedIndex(id)
|
||||
info = append(info, tup{id: id, idx: idx, ok: ok})
|
||||
}
|
||||
|
||||
// Sort by index
|
||||
sort.Slice(info, func(i, j int) bool {
|
||||
if info[i].idx == info[j].idx {
|
||||
return info[i].id < info[j].id
|
||||
}
|
||||
return info[i].idx < info[j].idx
|
||||
})
|
||||
|
||||
// Populate .bar.
|
||||
for i := range info {
|
||||
if i > 0 && info[i-1].idx < info[i].idx {
|
||||
info[i].bar = i
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by ID.
|
||||
sort.Slice(info, func(i, j int) bool {
|
||||
return info[i].id < info[j].id
|
||||
})
|
||||
|
||||
var buf strings.Builder
|
||||
|
||||
// Print.
|
||||
fmt.Fprint(&buf, strings.Repeat(" ", n)+" idx\n")
|
||||
for i := range info {
|
||||
bar := info[i].bar
|
||||
if !info[i].ok {
|
||||
fmt.Fprint(&buf, "?"+strings.Repeat(" ", n))
|
||||
} else {
|
||||
fmt.Fprint(&buf, strings.Repeat("x", bar)+">"+strings.Repeat(" ", n-bar))
|
||||
}
|
||||
fmt.Fprintf(&buf, " %5d (id=%d)\n", info[i].idx, info[i].id)
|
||||
}
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// Slice returns the MajorityConfig as a sorted slice.
|
||||
func (c MajorityConfig) Slice() []uint64 {
|
||||
var sl []uint64
|
||||
for id := range c {
|
||||
sl = append(sl, id)
|
||||
}
|
||||
sort.Slice(sl, func(i, j int) bool { return sl[i] < sl[j] })
|
||||
return sl
|
||||
}
|
||||
|
||||
// insertionSort sorts sl in place in ascending order. Each element is taken
// out, larger elements to its left are shifted one slot to the right, and the
// element is dropped into the gap.
func insertionSort(sl []uint64) {
	for i := 1; i < len(sl); i++ {
		v := sl[i]
		j := i
		for ; j > 0 && v < sl[j-1]; j-- {
			sl[j] = sl[j-1]
		}
		sl[j] = v
	}
}
|
||||
|
||||
// CommittedIndex computes the committed index from those supplied via the
// provided AckedIndexer (for the active config): the largest index that has
// been acked by a majority of the voters in c. Voters for which l reports no
// index count as having acked index zero. For an empty config the maximum
// uint64 value is returned.
func (c MajorityConfig) CommittedIndex(l AckedIndexer) Index {
	n := len(c)
	if n == 0 {
		// This plays well with joint quorums which, when one half is the zero
		// MajorityConfig, should behave like the other half.
		return math.MaxUint64
	}

	// Use an on-stack slice to collect the committed indexes when n <= 7
	// (otherwise we alloc). The alternative is to stash a slice on
	// MajorityConfig, but this impairs usability (as is, MajorityConfig is just
	// a map, and that's nice). The assumption is that running with a
	// replication factor of >7 is rare, and in cases in which it happens
	// performance is a lesser concern (additionally the performance
	// implications of an allocation here are far from drastic).
	var stk [7]uint64
	var srt []uint64
	if len(stk) >= n {
		srt = stk[:n]
	} else {
		srt = make([]uint64, n)
	}

	{
		// Fill the slice with the indexes observed. Any unused slots will be
		// left as zero; these correspond to voters that may report in, but
		// haven't yet. We fill from the right (since the zeroes will end up on
		// the left after sorting below anyway).
		i := n - 1
		for id := range c {
			if idx, ok := l.AckedIndex(id); ok {
				srt[i] = uint64(idx)
				i--
			}
		}
	}

	// Sort by index. Use a bespoke algorithm (copied from the stdlib's sort
	// package) to keep srt on the stack.
	insertionSort(srt)

	// The smallest index into the array for which the value is acked by a
	// quorum. In other words, from the end of the slice, move n/2+1 to the
	// left (accounting for zero-indexing).
	pos := n - (n/2 + 1)
	return Index(srt[pos])
}
|
||||
|
||||
// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns
|
||||
// a result indicating whether the vote is pending (i.e. neither a quorum of
|
||||
// yes/no has been reached), won (a quorum of yes has been reached), or lost (a
|
||||
// quorum of no has been reached).
|
||||
func (c MajorityConfig) VoteResult(votes map[uint64]bool) VoteResult {
|
||||
if len(c) == 0 {
|
||||
// By convention, the elections on an empty config win. This comes in
|
||||
// handy with joint quorums because it'll make a half-populated joint
|
||||
// quorum behave like a majority quorum.
|
||||
return VoteWon
|
||||
}
|
||||
|
||||
ny := [2]int{} // vote counts for no and yes, respectively
|
||||
|
||||
var missing int
|
||||
for id := range c {
|
||||
v, ok := votes[id]
|
||||
if !ok {
|
||||
missing++
|
||||
continue
|
||||
}
|
||||
if v {
|
||||
ny[1]++
|
||||
} else {
|
||||
ny[0]++
|
||||
}
|
||||
}
|
||||
|
||||
q := len(c)/2 + 1
|
||||
if ny[1] >= q {
|
||||
return VoteWon
|
||||
}
|
||||
if ny[1]+missing >= q {
|
||||
return VotePending
|
||||
}
|
||||
return VoteLost
|
||||
}
|
58
vendor/go.etcd.io/etcd/raft/quorum/quorum.go
generated
vendored
Normal file
58
vendor/go.etcd.io/etcd/raft/quorum/quorum.go
generated
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package quorum
|
||||
|
||||
import (
|
||||
"math"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// Index is a Raft log position.
type Index uint64

// String renders the index in base 10, except for math.MaxUint64, which is
// printed as "∞".
func (i Index) String() string {
	if i != math.MaxUint64 {
		return strconv.FormatUint(uint64(i), 10)
	}
	return "∞"
}
|
||||
|
||||
// AckedIndexer allows looking up a commit index for a given ID of a voter
|
||||
// from a corresponding MajorityConfig.
|
||||
type AckedIndexer interface {
|
||||
AckedIndex(voterID uint64) (idx Index, found bool)
|
||||
}
|
||||
|
||||
type mapAckIndexer map[uint64]Index
|
||||
|
||||
func (m mapAckIndexer) AckedIndex(id uint64) (Index, bool) {
|
||||
idx, ok := m[id]
|
||||
return idx, ok
|
||||
}
|
||||
|
||||
// VoteResult indicates the outcome of a vote.
//
//go:generate stringer -type=VoteResult
type VoteResult uint8

// The values start at 1, so the zero VoteResult is not a valid outcome.
const (
	// VotePending indicates that the decision of the vote depends on future
	// votes, i.e. neither "yes" nor "no" has reached quorum yet.
	VotePending VoteResult = iota + 1
	// VoteLost indicates that the quorum has voted "no".
	VoteLost
	// VoteWon indicates that the quorum has voted "yes".
	VoteWon
)
|
26
vendor/go.etcd.io/etcd/raft/quorum/voteresult_string.go
generated
vendored
Normal file
26
vendor/go.etcd.io/etcd/raft/quorum/voteresult_string.go
generated
vendored
Normal file
@ -0,0 +1,26 @@
|
||||
// Code generated by "stringer -type=VoteResult"; DO NOT EDIT.
|
||||
|
||||
package quorum
|
||||
|
||||
import "strconv"
|
||||
|
||||
func _() {
|
||||
// An "invalid array index" compiler error signifies that the constant values have changed.
|
||||
// Re-run the stringer command to generate them again.
|
||||
var x [1]struct{}
|
||||
_ = x[VotePending-1]
|
||||
_ = x[VoteLost-2]
|
||||
_ = x[VoteWon-3]
|
||||
}
|
||||
|
||||
const _VoteResult_name = "VotePendingVoteLostVoteWon"
|
||||
|
||||
var _VoteResult_index = [...]uint8{0, 11, 19, 26}
|
||||
|
||||
func (i VoteResult) String() string {
|
||||
i -= 1
|
||||
if i >= VoteResult(len(_VoteResult_index)-1) {
|
||||
return "VoteResult(" + strconv.FormatInt(int64(i+1), 10) + ")"
|
||||
}
|
||||
return _VoteResult_name[_VoteResult_index[i]:_VoteResult_index[i+1]]
|
||||
}
|
1656
vendor/go.etcd.io/etcd/raft/raft.go
generated
vendored
Normal file
1656
vendor/go.etcd.io/etcd/raft/raft.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
170
vendor/go.etcd.io/etcd/raft/raftpb/confchange.go
generated
vendored
Normal file
170
vendor/go.etcd.io/etcd/raft/raftpb/confchange.go
generated
vendored
Normal file
@ -0,0 +1,170 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raftpb
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/gogo/protobuf/proto"
|
||||
)
|
||||
|
||||
// ConfChangeI abstracts over ConfChangeV2 and (legacy) ConfChange to allow
|
||||
// treating them in a unified manner.
|
||||
type ConfChangeI interface {
|
||||
AsV2() ConfChangeV2
|
||||
AsV1() (ConfChange, bool)
|
||||
}
|
||||
|
||||
// MarshalConfChange calls Marshal on the underlying ConfChange or ConfChangeV2
|
||||
// and returns the result along with the corresponding EntryType.
|
||||
func MarshalConfChange(c ConfChangeI) (EntryType, []byte, error) {
|
||||
var typ EntryType
|
||||
var ccdata []byte
|
||||
var err error
|
||||
if ccv1, ok := c.AsV1(); ok {
|
||||
typ = EntryConfChange
|
||||
ccdata, err = ccv1.Marshal()
|
||||
} else {
|
||||
ccv2 := c.AsV2()
|
||||
typ = EntryConfChangeV2
|
||||
ccdata, err = ccv2.Marshal()
|
||||
}
|
||||
return typ, ccdata, err
|
||||
}
|
||||
|
||||
// AsV2 returns a V2 configuration change carrying out the same operation.
|
||||
func (c ConfChange) AsV2() ConfChangeV2 {
|
||||
return ConfChangeV2{
|
||||
Changes: []ConfChangeSingle{{
|
||||
Type: c.Type,
|
||||
NodeID: c.NodeID,
|
||||
}},
|
||||
Context: c.Context,
|
||||
}
|
||||
}
|
||||
|
||||
// AsV1 returns the ConfChange and true.
|
||||
func (c ConfChange) AsV1() (ConfChange, bool) {
|
||||
return c, true
|
||||
}
|
||||
|
||||
// AsV2 is the identity.
|
||||
func (c ConfChangeV2) AsV2() ConfChangeV2 { return c }
|
||||
|
||||
// AsV1 returns ConfChange{} and false.
|
||||
func (c ConfChangeV2) AsV1() (ConfChange, bool) { return ConfChange{}, false }
|
||||
|
||||
// EnterJoint returns two bools. The second bool is true if and only if this
|
||||
// config change will use Joint Consensus, which is the case if it contains more
|
||||
// than one change or if the use of Joint Consensus was requested explicitly.
|
||||
// The first bool can only be true if second one is, and indicates whether the
|
||||
// Joint State will be left automatically.
|
||||
func (c *ConfChangeV2) EnterJoint() (autoLeave bool, ok bool) {
|
||||
// NB: in theory, more config changes could qualify for the "simple"
|
||||
// protocol but it depends on the config on top of which the changes apply.
|
||||
// For example, adding two learners is not OK if both nodes are part of the
|
||||
// base config (i.e. two voters are turned into learners in the process of
|
||||
// applying the conf change). In practice, these distinctions should not
|
||||
// matter, so we keep it simple and use Joint Consensus liberally.
|
||||
if c.Transition != ConfChangeTransitionAuto || len(c.Changes) > 1 {
|
||||
// Use Joint Consensus.
|
||||
var autoLeave bool
|
||||
switch c.Transition {
|
||||
case ConfChangeTransitionAuto:
|
||||
autoLeave = true
|
||||
case ConfChangeTransitionJointImplicit:
|
||||
autoLeave = true
|
||||
case ConfChangeTransitionJointExplicit:
|
||||
default:
|
||||
panic(fmt.Sprintf("unknown transition: %+v", c))
|
||||
}
|
||||
return autoLeave, true
|
||||
}
|
||||
return false, false
|
||||
}
|
||||
|
||||
// LeaveJoint is true if the configuration change leaves a joint configuration.
|
||||
// This is the case if the ConfChangeV2 is zero, with the possible exception of
|
||||
// the Context field.
|
||||
func (c *ConfChangeV2) LeaveJoint() bool {
|
||||
cpy := *c
|
||||
cpy.Context = nil
|
||||
return proto.Equal(&cpy, &ConfChangeV2{})
|
||||
}
|
||||
|
||||
// ConfChangesFromString parses a Space-delimited sequence of operations into a
|
||||
// slice of ConfChangeSingle. The supported operations are:
|
||||
// - vn: make n a voter,
|
||||
// - ln: make n a learner,
|
||||
// - rn: remove n, and
|
||||
// - un: update n.
|
||||
func ConfChangesFromString(s string) ([]ConfChangeSingle, error) {
|
||||
var ccs []ConfChangeSingle
|
||||
toks := strings.Split(strings.TrimSpace(s), " ")
|
||||
if toks[0] == "" {
|
||||
toks = nil
|
||||
}
|
||||
for _, tok := range toks {
|
||||
if len(tok) < 2 {
|
||||
return nil, fmt.Errorf("unknown token %s", tok)
|
||||
}
|
||||
var cc ConfChangeSingle
|
||||
switch tok[0] {
|
||||
case 'v':
|
||||
cc.Type = ConfChangeAddNode
|
||||
case 'l':
|
||||
cc.Type = ConfChangeAddLearnerNode
|
||||
case 'r':
|
||||
cc.Type = ConfChangeRemoveNode
|
||||
case 'u':
|
||||
cc.Type = ConfChangeUpdateNode
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown input: %s", tok)
|
||||
}
|
||||
id, err := strconv.ParseUint(tok[1:], 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cc.NodeID = id
|
||||
ccs = append(ccs, cc)
|
||||
}
|
||||
return ccs, nil
|
||||
}
|
||||
|
||||
// ConfChangesToString is the inverse to ConfChangesFromString.
|
||||
func ConfChangesToString(ccs []ConfChangeSingle) string {
|
||||
var buf strings.Builder
|
||||
for i, cc := range ccs {
|
||||
if i > 0 {
|
||||
buf.WriteByte(' ')
|
||||
}
|
||||
switch cc.Type {
|
||||
case ConfChangeAddNode:
|
||||
buf.WriteByte('v')
|
||||
case ConfChangeAddLearnerNode:
|
||||
buf.WriteByte('l')
|
||||
case ConfChangeRemoveNode:
|
||||
buf.WriteByte('r')
|
||||
case ConfChangeUpdateNode:
|
||||
buf.WriteByte('u')
|
||||
default:
|
||||
buf.WriteString("unknown")
|
||||
}
|
||||
fmt.Fprintf(&buf, "%d", cc.NodeID)
|
||||
}
|
||||
return buf.String()
|
||||
}
|
45
vendor/go.etcd.io/etcd/raft/raftpb/confstate.go
generated
vendored
Normal file
45
vendor/go.etcd.io/etcd/raft/raftpb/confstate.go
generated
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raftpb
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// Equivalent returns a nil error if the inputs describe the same configuration.
|
||||
// On mismatch, returns a descriptive error showing the differences.
|
||||
func (cs ConfState) Equivalent(cs2 ConfState) error {
|
||||
cs1 := cs
|
||||
orig1, orig2 := cs1, cs2
|
||||
s := func(sl *[]uint64) {
|
||||
*sl = append([]uint64(nil), *sl...)
|
||||
sort.Slice(*sl, func(i, j int) bool { return (*sl)[i] < (*sl)[j] })
|
||||
}
|
||||
|
||||
for _, cs := range []*ConfState{&cs1, &cs2} {
|
||||
s(&cs.Voters)
|
||||
s(&cs.Learners)
|
||||
s(&cs.VotersOutgoing)
|
||||
s(&cs.LearnersNext)
|
||||
cs.XXX_unrecognized = nil
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(cs1, cs2) {
|
||||
return fmt.Errorf("ConfStates not equivalent after sorting:\n%+#v\n%+#v\nInputs were:\n%+#v\n%+#v", cs1, cs2, orig1, orig2)
|
||||
}
|
||||
return nil
|
||||
}
|
2646
vendor/go.etcd.io/etcd/raft/raftpb/raft.pb.go
generated
vendored
Normal file
2646
vendor/go.etcd.io/etcd/raft/raftpb/raft.pb.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
239
vendor/go.etcd.io/etcd/raft/rawnode.go
generated
vendored
Normal file
239
vendor/go.etcd.io/etcd/raft/rawnode.go
generated
vendored
Normal file
@ -0,0 +1,239 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
"go.etcd.io/etcd/raft/tracker"
|
||||
)
|
||||
|
||||
var (
	// ErrStepLocalMsg is returned when trying to step a local raft message.
	ErrStepLocalMsg = errors.New("raft: cannot step raft local message")

	// ErrStepPeerNotFound is returned when trying to step a response message
	// for which no peer is found in raft.prs.
	ErrStepPeerNotFound = errors.New("raft: cannot step as peer not found")
)
|
||||
|
||||
// RawNode is a thread-unsafe Node.
|
||||
// The methods of this struct correspond to the methods of Node and are described
|
||||
// more fully there.
|
||||
type RawNode struct {
|
||||
raft *raft
|
||||
prevSoftSt *SoftState
|
||||
prevHardSt pb.HardState
|
||||
}
|
||||
|
||||
// NewRawNode instantiates a RawNode from the given configuration.
|
||||
//
|
||||
// See Bootstrap() for bootstrapping an initial state; this replaces the former
|
||||
// 'peers' argument to this method (with identical behavior). However, It is
|
||||
// recommended that instead of calling Bootstrap, applications bootstrap their
|
||||
// state manually by setting up a Storage that has a first index > 1 and which
|
||||
// stores the desired ConfState as its InitialState.
|
||||
func NewRawNode(config *Config) (*RawNode, error) {
|
||||
r := newRaft(config)
|
||||
rn := &RawNode{
|
||||
raft: r,
|
||||
}
|
||||
rn.prevSoftSt = r.softState()
|
||||
rn.prevHardSt = r.hardState()
|
||||
return rn, nil
|
||||
}
|
||||
|
||||
// Tick advances the internal logical clock by a single tick.
|
||||
func (rn *RawNode) Tick() {
|
||||
rn.raft.tick()
|
||||
}
|
||||
|
||||
// TickQuiesced advances the internal logical clock by a single tick without
|
||||
// performing any other state machine processing. It allows the caller to avoid
|
||||
// periodic heartbeats and elections when all of the peers in a Raft group are
|
||||
// known to be at the same state. Expected usage is to periodically invoke Tick
|
||||
// or TickQuiesced depending on whether the group is "active" or "quiesced".
|
||||
//
|
||||
// WARNING: Be very careful about using this method as it subverts the Raft
|
||||
// state machine. You should probably be using Tick instead.
|
||||
func (rn *RawNode) TickQuiesced() {
|
||||
rn.raft.electionElapsed++
|
||||
}
|
||||
|
||||
// Campaign causes this RawNode to transition to candidate state.
|
||||
func (rn *RawNode) Campaign() error {
|
||||
return rn.raft.Step(pb.Message{
|
||||
Type: pb.MsgHup,
|
||||
})
|
||||
}
|
||||
|
||||
// Propose proposes data be appended to the raft log.
|
||||
func (rn *RawNode) Propose(data []byte) error {
|
||||
return rn.raft.Step(pb.Message{
|
||||
Type: pb.MsgProp,
|
||||
From: rn.raft.id,
|
||||
Entries: []pb.Entry{
|
||||
{Data: data},
|
||||
}})
|
||||
}
|
||||
|
||||
// ProposeConfChange proposes a config change. See (Node).ProposeConfChange for
|
||||
// details.
|
||||
func (rn *RawNode) ProposeConfChange(cc pb.ConfChangeI) error {
|
||||
m, err := confChangeToMsg(cc)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return rn.raft.Step(m)
|
||||
}
|
||||
|
||||
// ApplyConfChange applies a config change to the local node.
|
||||
func (rn *RawNode) ApplyConfChange(cc pb.ConfChangeI) *pb.ConfState {
|
||||
cs := rn.raft.applyConfChange(cc.AsV2())
|
||||
return &cs
|
||||
}
|
||||
|
||||
// Step advances the state machine using the given message.
|
||||
func (rn *RawNode) Step(m pb.Message) error {
|
||||
// ignore unexpected local messages receiving over network
|
||||
if IsLocalMsg(m.Type) {
|
||||
return ErrStepLocalMsg
|
||||
}
|
||||
if pr := rn.raft.prs.Progress[m.From]; pr != nil || !IsResponseMsg(m.Type) {
|
||||
return rn.raft.Step(m)
|
||||
}
|
||||
return ErrStepPeerNotFound
|
||||
}
|
||||
|
||||
// Ready returns the outstanding work that the application needs to handle. This
|
||||
// includes appending and applying entries or a snapshot, updating the HardState,
|
||||
// and sending messages. The returned Ready() *must* be handled and subsequently
|
||||
// passed back via Advance().
|
||||
func (rn *RawNode) Ready() Ready {
|
||||
rd := rn.readyWithoutAccept()
|
||||
rn.acceptReady(rd)
|
||||
return rd
|
||||
}
|
||||
|
||||
// readyWithoutAccept returns a Ready. This is a read-only operation, i.e. there
|
||||
// is no obligation that the Ready must be handled.
|
||||
func (rn *RawNode) readyWithoutAccept() Ready {
|
||||
return newReady(rn.raft, rn.prevSoftSt, rn.prevHardSt)
|
||||
}
|
||||
|
||||
// acceptReady is called when the consumer of the RawNode has decided to go
|
||||
// ahead and handle a Ready. Nothing must alter the state of the RawNode between
|
||||
// this call and the prior call to Ready().
|
||||
func (rn *RawNode) acceptReady(rd Ready) {
|
||||
if rd.SoftState != nil {
|
||||
rn.prevSoftSt = rd.SoftState
|
||||
}
|
||||
if len(rd.ReadStates) != 0 {
|
||||
rn.raft.readStates = nil
|
||||
}
|
||||
rn.raft.msgs = nil
|
||||
}
|
||||
|
||||
// HasReady called when RawNode user need to check if any Ready pending.
|
||||
// Checking logic in this method should be consistent with Ready.containsUpdates().
|
||||
func (rn *RawNode) HasReady() bool {
|
||||
r := rn.raft
|
||||
if !r.softState().equal(rn.prevSoftSt) {
|
||||
return true
|
||||
}
|
||||
if hardSt := r.hardState(); !IsEmptyHardState(hardSt) && !isHardStateEqual(hardSt, rn.prevHardSt) {
|
||||
return true
|
||||
}
|
||||
if r.raftLog.unstable.snapshot != nil && !IsEmptySnap(*r.raftLog.unstable.snapshot) {
|
||||
return true
|
||||
}
|
||||
if len(r.msgs) > 0 || len(r.raftLog.unstableEntries()) > 0 || r.raftLog.hasNextEnts() {
|
||||
return true
|
||||
}
|
||||
if len(r.readStates) != 0 {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Advance notifies the RawNode that the application has applied and saved progress in the
|
||||
// last Ready results.
|
||||
func (rn *RawNode) Advance(rd Ready) {
|
||||
if !IsEmptyHardState(rd.HardState) {
|
||||
rn.prevHardSt = rd.HardState
|
||||
}
|
||||
rn.raft.advance(rd)
|
||||
}
|
||||
|
||||
// Status returns the current status of the given group. This allocates, see
|
||||
// BasicStatus and WithProgress for allocation-friendlier choices.
|
||||
func (rn *RawNode) Status() Status {
|
||||
status := getStatus(rn.raft)
|
||||
return status
|
||||
}
|
||||
|
||||
// BasicStatus returns a BasicStatus. Notably this does not contain the
|
||||
// Progress map; see WithProgress for an allocation-free way to inspect it.
|
||||
func (rn *RawNode) BasicStatus() BasicStatus {
|
||||
return getBasicStatus(rn.raft)
|
||||
}
|
||||
|
||||
// ProgressType indicates the type of replica a Progress corresponds to.
type ProgressType byte

// The zero value of ProgressType is ProgressTypePeer.
const (
	// ProgressTypePeer accompanies a Progress for a regular peer replica.
	ProgressTypePeer ProgressType = iota
	// ProgressTypeLearner accompanies a Progress for a learner replica.
	ProgressTypeLearner
)
|
||||
|
||||
// WithProgress is a helper to introspect the Progress for this node and its
|
||||
// peers.
|
||||
func (rn *RawNode) WithProgress(visitor func(id uint64, typ ProgressType, pr tracker.Progress)) {
|
||||
rn.raft.prs.Visit(func(id uint64, pr *tracker.Progress) {
|
||||
typ := ProgressTypePeer
|
||||
if pr.IsLearner {
|
||||
typ = ProgressTypeLearner
|
||||
}
|
||||
p := *pr
|
||||
p.Inflights = nil
|
||||
visitor(id, typ, p)
|
||||
})
|
||||
}
|
||||
|
||||
// ReportUnreachable reports the given node is not reachable for the last send.
|
||||
func (rn *RawNode) ReportUnreachable(id uint64) {
|
||||
_ = rn.raft.Step(pb.Message{Type: pb.MsgUnreachable, From: id})
|
||||
}
|
||||
|
||||
// ReportSnapshot reports the status of the sent snapshot.
|
||||
func (rn *RawNode) ReportSnapshot(id uint64, status SnapshotStatus) {
|
||||
rej := status == SnapshotFailure
|
||||
|
||||
_ = rn.raft.Step(pb.Message{Type: pb.MsgSnapStatus, From: id, Reject: rej})
|
||||
}
|
||||
|
||||
// TransferLeader tries to transfer leadership to the given transferee.
|
||||
func (rn *RawNode) TransferLeader(transferee uint64) {
|
||||
_ = rn.raft.Step(pb.Message{Type: pb.MsgTransferLeader, From: transferee})
|
||||
}
|
||||
|
||||
// ReadIndex requests a read state. The read state will be set in ready.
|
||||
// Read State has a read index. Once the application advances further than the read
|
||||
// index, any linearizable read requests issued before the read request can be
|
||||
// processed safely. The read state will have the same rctx attached.
|
||||
func (rn *RawNode) ReadIndex(rctx []byte) {
|
||||
_ = rn.raft.Step(pb.Message{Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: rctx}}})
|
||||
}
|
121
vendor/go.etcd.io/etcd/raft/read_only.go
generated
vendored
Normal file
121
vendor/go.etcd.io/etcd/raft/read_only.go
generated
vendored
Normal file
@ -0,0 +1,121 @@
|
||||
// Copyright 2016 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import pb "go.etcd.io/etcd/raft/raftpb"
|
||||
|
||||
// ReadState provides state for a read-only query.
// It is the caller's responsibility to call ReadIndex first before getting
// this state from ready. It is also the caller's duty to tell, by means of
// RequestCtx, whether this state belongs to the request it issued, e.g. by
// using a unique ID as RequestCtx.
type ReadState struct {
	Index      uint64
	RequestCtx []byte
}
|
||||
|
||||
type readIndexStatus struct {
|
||||
req pb.Message
|
||||
index uint64
|
||||
// NB: this never records 'false', but it's more convenient to use this
|
||||
// instead of a map[uint64]struct{} due to the API of quorum.VoteResult. If
|
||||
// this becomes performance sensitive enough (doubtful), quorum.VoteResult
|
||||
// can change to an API that is closer to that of CommittedIndex.
|
||||
acks map[uint64]bool
|
||||
}
|
||||
|
||||
type readOnly struct {
|
||||
option ReadOnlyOption
|
||||
pendingReadIndex map[string]*readIndexStatus
|
||||
readIndexQueue []string
|
||||
}
|
||||
|
||||
func newReadOnly(option ReadOnlyOption) *readOnly {
|
||||
return &readOnly{
|
||||
option: option,
|
||||
pendingReadIndex: make(map[string]*readIndexStatus),
|
||||
}
|
||||
}
|
||||
|
||||
// addRequest adds a read only reuqest into readonly struct.
|
||||
// `index` is the commit index of the raft state machine when it received
|
||||
// the read only request.
|
||||
// `m` is the original read only request message from the local or remote node.
|
||||
func (ro *readOnly) addRequest(index uint64, m pb.Message) {
|
||||
s := string(m.Entries[0].Data)
|
||||
if _, ok := ro.pendingReadIndex[s]; ok {
|
||||
return
|
||||
}
|
||||
ro.pendingReadIndex[s] = &readIndexStatus{index: index, req: m, acks: make(map[uint64]bool)}
|
||||
ro.readIndexQueue = append(ro.readIndexQueue, s)
|
||||
}
|
||||
|
||||
// recvAck notifies the readonly struct that the raft state machine received
|
||||
// an acknowledgment of the heartbeat that attached with the read only request
|
||||
// context.
|
||||
func (ro *readOnly) recvAck(id uint64, context []byte) map[uint64]bool {
|
||||
rs, ok := ro.pendingReadIndex[string(context)]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
rs.acks[id] = true
|
||||
return rs.acks
|
||||
}
|
||||
|
||||
// advance advances the read only request queue kept by the readonly struct.
|
||||
// It dequeues the requests until it finds the read only request that has
|
||||
// the same context as the given `m`.
|
||||
func (ro *readOnly) advance(m pb.Message) []*readIndexStatus {
|
||||
var (
|
||||
i int
|
||||
found bool
|
||||
)
|
||||
|
||||
ctx := string(m.Context)
|
||||
rss := []*readIndexStatus{}
|
||||
|
||||
for _, okctx := range ro.readIndexQueue {
|
||||
i++
|
||||
rs, ok := ro.pendingReadIndex[okctx]
|
||||
if !ok {
|
||||
panic("cannot find corresponding read state from pending map")
|
||||
}
|
||||
rss = append(rss, rs)
|
||||
if okctx == ctx {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
ro.readIndexQueue = ro.readIndexQueue[i:]
|
||||
for _, rs := range rss {
|
||||
delete(ro.pendingReadIndex, string(rs.req.Entries[0].Data))
|
||||
}
|
||||
return rss
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// lastPendingRequestCtx returns the context of the last pending read only
|
||||
// request in readonly struct.
|
||||
func (ro *readOnly) lastPendingRequestCtx() string {
|
||||
if len(ro.readIndexQueue) == 0 {
|
||||
return ""
|
||||
}
|
||||
return ro.readIndexQueue[len(ro.readIndexQueue)-1]
|
||||
}
|
106
vendor/go.etcd.io/etcd/raft/status.go
generated
vendored
Normal file
106
vendor/go.etcd.io/etcd/raft/status.go
generated
vendored
Normal file
@ -0,0 +1,106 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
"go.etcd.io/etcd/raft/tracker"
|
||||
)
|
||||
|
||||
// Status contains information about this Raft peer and its view of the system.
|
||||
// The Progress is only populated on the leader.
|
||||
type Status struct {
|
||||
BasicStatus
|
||||
Config tracker.Config
|
||||
Progress map[uint64]tracker.Progress
|
||||
}
|
||||
|
||||
// BasicStatus contains basic information about the Raft peer. It does not allocate.
|
||||
type BasicStatus struct {
|
||||
ID uint64
|
||||
|
||||
pb.HardState
|
||||
SoftState
|
||||
|
||||
Applied uint64
|
||||
|
||||
LeadTransferee uint64
|
||||
}
|
||||
|
||||
func getProgressCopy(r *raft) map[uint64]tracker.Progress {
|
||||
m := make(map[uint64]tracker.Progress)
|
||||
r.prs.Visit(func(id uint64, pr *tracker.Progress) {
|
||||
var p tracker.Progress
|
||||
p = *pr
|
||||
p.Inflights = pr.Inflights.Clone()
|
||||
pr = nil
|
||||
|
||||
m[id] = p
|
||||
})
|
||||
return m
|
||||
}
|
||||
|
||||
func getBasicStatus(r *raft) BasicStatus {
|
||||
s := BasicStatus{
|
||||
ID: r.id,
|
||||
LeadTransferee: r.leadTransferee,
|
||||
}
|
||||
s.HardState = r.hardState()
|
||||
s.SoftState = *r.softState()
|
||||
s.Applied = r.raftLog.applied
|
||||
return s
|
||||
}
|
||||
|
||||
// getStatus gets a copy of the current raft status.
|
||||
func getStatus(r *raft) Status {
|
||||
var s Status
|
||||
s.BasicStatus = getBasicStatus(r)
|
||||
if s.RaftState == StateLeader {
|
||||
s.Progress = getProgressCopy(r)
|
||||
}
|
||||
s.Config = r.prs.Config.Clone()
|
||||
return s
|
||||
}
|
||||
|
||||
// MarshalJSON translates the raft status into JSON.
|
||||
// TODO: try to simplify this by introducing ID type into raft
|
||||
func (s Status) MarshalJSON() ([]byte, error) {
|
||||
j := fmt.Sprintf(`{"id":"%x","term":%d,"vote":"%x","commit":%d,"lead":"%x","raftState":%q,"applied":%d,"progress":{`,
|
||||
s.ID, s.Term, s.Vote, s.Commit, s.Lead, s.RaftState, s.Applied)
|
||||
|
||||
if len(s.Progress) == 0 {
|
||||
j += "},"
|
||||
} else {
|
||||
for k, v := range s.Progress {
|
||||
subj := fmt.Sprintf(`"%x":{"match":%d,"next":%d,"state":%q},`, k, v.Match, v.Next, v.State)
|
||||
j += subj
|
||||
}
|
||||
// remove the trailing ","
|
||||
j = j[:len(j)-1] + "},"
|
||||
}
|
||||
|
||||
j += fmt.Sprintf(`"leadtransferee":"%x"}`, s.LeadTransferee)
|
||||
return []byte(j), nil
|
||||
}
|
||||
|
||||
func (s Status) String() string {
|
||||
b, err := s.MarshalJSON()
|
||||
if err != nil {
|
||||
raftLogger.Panicf("unexpected error: %v", err)
|
||||
}
|
||||
return string(b)
|
||||
}
|
273
vendor/go.etcd.io/etcd/raft/storage.go
generated
vendored
Normal file
273
vendor/go.etcd.io/etcd/raft/storage.go
generated
vendored
Normal file
@ -0,0 +1,273 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sync"
|
||||
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
)
|
||||
|
||||
var (
	// ErrCompacted is returned by Storage.Entries/Compact when a requested
	// index is unavailable because it predates the last snapshot.
	ErrCompacted = errors.New("requested index is unavailable due to compaction")

	// ErrSnapOutOfDate is returned by Storage.CreateSnapshot when a requested
	// index is older than the existing snapshot.
	ErrSnapOutOfDate = errors.New("requested index is older than the existing snapshot")

	// ErrUnavailable is returned by the Storage interface when the requested
	// log entries are unavailable.
	ErrUnavailable = errors.New("requested entry at index is unavailable")

	// ErrSnapshotTemporarilyUnavailable is returned by the Storage interface
	// when the required snapshot is temporarily unavailable.
	ErrSnapshotTemporarilyUnavailable = errors.New("snapshot is temporarily unavailable")
)
|
||||
|
||||
// Storage is an interface that may be implemented by the application
// to retrieve log entries from storage.
//
// If any Storage method returns an error, the raft instance will
// become inoperable and refuse to participate in elections; the
// application is responsible for cleanup and recovery in this case.
type Storage interface {
	// TODO(tbg): split this into two interfaces, LogStorage and StateStorage.

	// InitialState returns the saved HardState and ConfState information.
	InitialState() (pb.HardState, pb.ConfState, error)
	// Entries returns a slice of log entries in the range [lo,hi).
	// maxSize limits the total size of the log entries returned, but
	// Entries returns at least one entry if any.
	Entries(lo, hi, maxSize uint64) ([]pb.Entry, error)
	// Term returns the term of entry i, which must be in the range
	// [FirstIndex()-1, LastIndex()]. The term of the entry before
	// FirstIndex is retained for matching purposes even though the
	// rest of that entry may not be available.
	Term(i uint64) (uint64, error)
	// LastIndex returns the index of the last entry in the log.
	LastIndex() (uint64, error)
	// FirstIndex returns the index of the first log entry that is
	// possibly available via Entries (older entries have been incorporated
	// into the latest Snapshot; if storage only contains the dummy entry the
	// first log entry is not available).
	FirstIndex() (uint64, error)
	// Snapshot returns the most recent snapshot.
	// If the snapshot is temporarily unavailable, it should return
	// ErrSnapshotTemporarilyUnavailable, so that the raft state machine knows
	// that Storage needs some time to prepare the snapshot and calls Snapshot
	// again later.
	Snapshot() (pb.Snapshot, error)
}
|
||||
|
||||
// MemoryStorage implements the Storage interface backed by an
// in-memory array.
type MemoryStorage struct {
	// Protects access to all fields. Most methods of MemoryStorage are
	// run on the raft goroutine, but Append() is run on an application
	// goroutine.
	sync.Mutex

	// hardState is the value most recently stored via SetHardState.
	hardState pb.HardState
	// snapshot is the most recent snapshot installed by ApplySnapshot or
	// built by CreateSnapshot.
	snapshot pb.Snapshot
	// ents[i] has raft log position i+ents[0].Index. ents[0] is a dummy entry
	// that only carries the index and term of the entry preceding the first
	// available one; it is rewritten by ApplySnapshot and Compact.
	ents []pb.Entry
}
|
||||
|
||||
// NewMemoryStorage creates an empty MemoryStorage.
|
||||
func NewMemoryStorage() *MemoryStorage {
|
||||
return &MemoryStorage{
|
||||
// When starting from scratch populate the list with a dummy entry at term zero.
|
||||
ents: make([]pb.Entry, 1),
|
||||
}
|
||||
}
|
||||
|
||||
// InitialState implements the Storage interface.
|
||||
func (ms *MemoryStorage) InitialState() (pb.HardState, pb.ConfState, error) {
|
||||
return ms.hardState, ms.snapshot.Metadata.ConfState, nil
|
||||
}
|
||||
|
||||
// SetHardState saves the current HardState.
|
||||
func (ms *MemoryStorage) SetHardState(st pb.HardState) error {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
ms.hardState = st
|
||||
return nil
|
||||
}
|
||||
|
||||
// Entries implements the Storage interface.
|
||||
func (ms *MemoryStorage) Entries(lo, hi, maxSize uint64) ([]pb.Entry, error) {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
offset := ms.ents[0].Index
|
||||
if lo <= offset {
|
||||
return nil, ErrCompacted
|
||||
}
|
||||
if hi > ms.lastIndex()+1 {
|
||||
raftLogger.Panicf("entries' hi(%d) is out of bound lastindex(%d)", hi, ms.lastIndex())
|
||||
}
|
||||
// only contains dummy entries.
|
||||
if len(ms.ents) == 1 {
|
||||
return nil, ErrUnavailable
|
||||
}
|
||||
|
||||
ents := ms.ents[lo-offset : hi-offset]
|
||||
return limitSize(ents, maxSize), nil
|
||||
}
|
||||
|
||||
// Term implements the Storage interface.
|
||||
func (ms *MemoryStorage) Term(i uint64) (uint64, error) {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
offset := ms.ents[0].Index
|
||||
if i < offset {
|
||||
return 0, ErrCompacted
|
||||
}
|
||||
if int(i-offset) >= len(ms.ents) {
|
||||
return 0, ErrUnavailable
|
||||
}
|
||||
return ms.ents[i-offset].Term, nil
|
||||
}
|
||||
|
||||
// LastIndex implements the Storage interface.
|
||||
func (ms *MemoryStorage) LastIndex() (uint64, error) {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
return ms.lastIndex(), nil
|
||||
}
|
||||
|
||||
// lastIndex returns the log index of the final element of ms.ents. It does not
// take the mutex itself; the callers in this file invoke it with the lock held.
func (ms *MemoryStorage) lastIndex() uint64 {
	return ms.ents[0].Index + uint64(len(ms.ents)) - 1
}
|
||||
|
||||
// FirstIndex implements the Storage interface.
|
||||
func (ms *MemoryStorage) FirstIndex() (uint64, error) {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
return ms.firstIndex(), nil
|
||||
}
|
||||
|
||||
// firstIndex returns the log index directly after the dummy entry ms.ents[0].
// It does not take the mutex itself; the callers in this file invoke it with
// the lock held.
func (ms *MemoryStorage) firstIndex() uint64 {
	return ms.ents[0].Index + 1
}
|
||||
|
||||
// Snapshot implements the Storage interface.
|
||||
func (ms *MemoryStorage) Snapshot() (pb.Snapshot, error) {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
return ms.snapshot, nil
|
||||
}
|
||||
|
||||
// ApplySnapshot overwrites the contents of this Storage object with
|
||||
// those of the given snapshot.
|
||||
func (ms *MemoryStorage) ApplySnapshot(snap pb.Snapshot) error {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
|
||||
//handle check for old snapshot being applied
|
||||
msIndex := ms.snapshot.Metadata.Index
|
||||
snapIndex := snap.Metadata.Index
|
||||
if msIndex >= snapIndex {
|
||||
return ErrSnapOutOfDate
|
||||
}
|
||||
|
||||
ms.snapshot = snap
|
||||
ms.ents = []pb.Entry{{Term: snap.Metadata.Term, Index: snap.Metadata.Index}}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CreateSnapshot makes a snapshot which can be retrieved with Snapshot() and
|
||||
// can be used to reconstruct the state at that point.
|
||||
// If any configuration changes have been made since the last compaction,
|
||||
// the result of the last ApplyConfChange must be passed in.
|
||||
func (ms *MemoryStorage) CreateSnapshot(i uint64, cs *pb.ConfState, data []byte) (pb.Snapshot, error) {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
if i <= ms.snapshot.Metadata.Index {
|
||||
return pb.Snapshot{}, ErrSnapOutOfDate
|
||||
}
|
||||
|
||||
offset := ms.ents[0].Index
|
||||
if i > ms.lastIndex() {
|
||||
raftLogger.Panicf("snapshot %d is out of bound lastindex(%d)", i, ms.lastIndex())
|
||||
}
|
||||
|
||||
ms.snapshot.Metadata.Index = i
|
||||
ms.snapshot.Metadata.Term = ms.ents[i-offset].Term
|
||||
if cs != nil {
|
||||
ms.snapshot.Metadata.ConfState = *cs
|
||||
}
|
||||
ms.snapshot.Data = data
|
||||
return ms.snapshot, nil
|
||||
}
|
||||
|
||||
// Compact discards all log entries prior to compactIndex.
|
||||
// It is the application's responsibility to not attempt to compact an index
|
||||
// greater than raftLog.applied.
|
||||
func (ms *MemoryStorage) Compact(compactIndex uint64) error {
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
offset := ms.ents[0].Index
|
||||
if compactIndex <= offset {
|
||||
return ErrCompacted
|
||||
}
|
||||
if compactIndex > ms.lastIndex() {
|
||||
raftLogger.Panicf("compact %d is out of bound lastindex(%d)", compactIndex, ms.lastIndex())
|
||||
}
|
||||
|
||||
i := compactIndex - offset
|
||||
ents := make([]pb.Entry, 1, 1+uint64(len(ms.ents))-i)
|
||||
ents[0].Index = ms.ents[i].Index
|
||||
ents[0].Term = ms.ents[i].Term
|
||||
ents = append(ents, ms.ents[i+1:]...)
|
||||
ms.ents = ents
|
||||
return nil
|
||||
}
|
||||
|
||||
// Append the new entries to storage.
|
||||
// TODO (xiangli): ensure the entries are continuous and
|
||||
// entries[0].Index > ms.entries[0].Index
|
||||
func (ms *MemoryStorage) Append(entries []pb.Entry) error {
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
ms.Lock()
|
||||
defer ms.Unlock()
|
||||
|
||||
first := ms.firstIndex()
|
||||
last := entries[0].Index + uint64(len(entries)) - 1
|
||||
|
||||
// shortcut if there is no new entry.
|
||||
if last < first {
|
||||
return nil
|
||||
}
|
||||
// truncate compacted entries
|
||||
if first > entries[0].Index {
|
||||
entries = entries[first-entries[0].Index:]
|
||||
}
|
||||
|
||||
offset := entries[0].Index - ms.ents[0].Index
|
||||
switch {
|
||||
case uint64(len(ms.ents)) > offset:
|
||||
ms.ents = append([]pb.Entry{}, ms.ents[:offset]...)
|
||||
ms.ents = append(ms.ents, entries...)
|
||||
case uint64(len(ms.ents)) == offset:
|
||||
ms.ents = append(ms.ents, entries...)
|
||||
default:
|
||||
raftLogger.Panicf("missing log entry [last: %d, append at: %d]",
|
||||
ms.lastIndex(), entries[0].Index)
|
||||
}
|
||||
return nil
|
||||
}
|
132
vendor/go.etcd.io/etcd/raft/tracker/inflights.go
generated
vendored
Normal file
132
vendor/go.etcd.io/etcd/raft/tracker/inflights.go
generated
vendored
Normal file
@ -0,0 +1,132 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package tracker
|
||||
|
||||
// Inflights limits the number of MsgApp (represented by the largest index
// contained within) sent to followers but not yet acknowledged by them. Callers
// use Full() to check whether more messages can be sent, call Add() whenever
// they are sending a new append, and release "quota" via FreeLE() whenever an
// ack is received.
//
// The inflight indexes are kept in a ring buffer that is grown on demand up
// to size elements.
type Inflights struct {
	// start is the position in buffer of the oldest inflight.
	start int
	// count is the number of inflights currently in the buffer.
	count int

	// size is the maximum number of inflights, i.e. the largest length the
	// buffer may grow to.
	size int

	// buffer contains the index of the last entry
	// inside one message.
	buffer []uint64
}

// NewInflights sets up an Inflights that allows up to 'size' inflight messages.
// The buffer is not allocated until the first Add.
func NewInflights(size int) *Inflights {
	return &Inflights{
		size: size,
	}
}

// Clone returns an *Inflights that is identical to but shares no memory with
// the receiver.
func (in *Inflights) Clone() *Inflights {
	ins := *in
	ins.buffer = append([]uint64(nil), in.buffer...)
	return &ins
}

// Add notifies the Inflights that a new message with the given index is being
// dispatched. Full() must be called prior to Add() to verify that there is room
// for one more message, and consecutive calls to Add() must provide a
// monotonic sequence of indexes. Add panics when the Inflights is full.
func (in *Inflights) Add(inflight uint64) {
	if in.Full() {
		panic("cannot add into a Full inflights")
	}
	// Position just past the newest inflight, wrapping around the ring.
	next := in.start + in.count
	size := in.size
	if next >= size {
		next -= size
	}
	if next >= len(in.buffer) {
		in.grow()
	}
	in.buffer[next] = inflight
	in.count++
}

// grow the inflight buffer by doubling up to inflights.size. We grow on demand
// instead of preallocating to inflights.size to handle systems which have
// thousands of Raft groups per process.
func (in *Inflights) grow() {
	newSize := len(in.buffer) * 2
	if newSize == 0 {
		newSize = 1
	} else if newSize > in.size {
		newSize = in.size
	}
	newBuffer := make([]uint64, newSize)
	copy(newBuffer, in.buffer)
	in.buffer = newBuffer
}

// FreeLE frees the inflights smaller or equal to the given `to` flight.
func (in *Inflights) FreeLE(to uint64) {
	if in.count == 0 || to < in.buffer[in.start] {
		// out of the left side of the window
		return
	}

	idx := in.start
	var i int
	for i = 0; i < in.count; i++ {
		if to < in.buffer[idx] { // found the first large inflight
			break
		}

		// increase index and maybe rotate
		size := in.size
		if idx++; idx >= size {
			idx -= size
		}
	}
	// free i inflights and set new start index
	in.count -= i
	in.start = idx
	if in.count == 0 {
		// inflights is empty, reset the start index so that we don't grow the
		// buffer unnecessarily.
		in.start = 0
	}
}

// FreeFirstOne releases the first inflight. This is a no-op if nothing is
// inflight.
func (in *Inflights) FreeFirstOne() {
	// Check before touching the buffer: it is still nil on an Inflights that
	// has never had anything added, and indexing it would panic.
	if in.count == 0 {
		return
	}
	in.FreeLE(in.buffer[in.start])
}

// Full returns true if no more messages can be sent at the moment.
func (in *Inflights) Full() bool {
	return in.count == in.size
}

// Count returns the number of inflight messages.
func (in *Inflights) Count() int { return in.count }

// reset frees all inflights. The buffer is kept so it can be reused.
func (in *Inflights) reset() {
	in.count = 0
	in.start = 0
}
|
259
vendor/go.etcd.io/etcd/raft/tracker/progress.go
generated
vendored
Normal file
259
vendor/go.etcd.io/etcd/raft/tracker/progress.go
generated
vendored
Normal file
@ -0,0 +1,259 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package tracker
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Progress represents a follower's progress in the view of the leader. Leader
// maintains progresses of all followers, and sends entries to the follower
// based on its progress.
//
// NB(tbg): Progress is basically a state machine whose transitions are mostly
// strewn around `*raft.raft`. Additionally, some fields are only used when in a
// certain State. All of this isn't ideal.
type Progress struct {
	// Match is the highest index acknowledged via MaybeUpdate; Next is the
	// index of the next entry to send.
	Match, Next uint64
	// State defines how the leader should interact with the follower.
	//
	// When in StateProbe, leader sends at most one replication message
	// per heartbeat interval. It also probes actual progress of the follower.
	//
	// When in StateReplicate, leader optimistically increases next
	// to the latest entry sent after sending replication message. This is
	// an optimized state for fast replicating log entries to the follower.
	//
	// When in StateSnapshot, leader should have sent out snapshot
	// before and stops sending any replication message.
	State StateType

	// PendingSnapshot is used in StateSnapshot.
	// If there is a pending snapshot, PendingSnapshot will be set to the
	// index of the snapshot. If PendingSnapshot is set, the replication process of
	// this Progress will be paused. raft will not resend snapshot until the pending one
	// is reported to be failed.
	PendingSnapshot uint64

	// RecentActive is true if the progress is recently active. Receiving any messages
	// from the corresponding follower indicates the progress is active.
	// RecentActive can be reset to false after an election timeout.
	//
	// TODO(tbg): the leader should always have this set to true.
	RecentActive bool

	// ProbeSent is used while this follower is in StateProbe. When ProbeSent is
	// true, raft should pause sending replication message to this peer until
	// ProbeSent is reset. See ProbeAcked() and IsPaused().
	ProbeSent bool

	// Inflights is a sliding window for the inflight messages.
	// Each inflight message contains one or more log entries.
	// The max number of entries per message is defined in raft config as MaxSizePerMsg.
	// Thus inflight effectively limits both the number of inflight messages
	// and the bandwidth each Progress can use.
	// When inflights is Full, no more message should be sent.
	// When a leader sends out a message, the index of the last
	// entry should be added to inflights. The index MUST be added
	// into inflights in order.
	// When a leader receives a reply, the previous inflights should
	// be freed by calling inflights.FreeLE with the index of the last
	// received entry.
	Inflights *Inflights

	// IsLearner is true if this progress is tracked for a learner.
	IsLearner bool
}
|
||||
|
||||
// ResetState moves the Progress into the specified State, resetting ProbeSent,
|
||||
// PendingSnapshot, and Inflights.
|
||||
func (pr *Progress) ResetState(state StateType) {
|
||||
pr.ProbeSent = false
|
||||
pr.PendingSnapshot = 0
|
||||
pr.State = state
|
||||
pr.Inflights.reset()
|
||||
}
|
||||
|
||||
// max returns the larger of a and b.
func max(a, b uint64) uint64 {
	if b >= a {
		return b
	}
	return a
}
|
||||
|
||||
// min returns the smaller of a and b.
func min(a, b uint64) uint64 {
	if b < a {
		return b
	}
	return a
}
|
||||
|
||||
// ProbeAcked is called when this peer has accepted an append. It resets
// ProbeSent to signal that additional append messages should be sent without
// further delay. No other field is touched.
func (pr *Progress) ProbeAcked() {
	pr.ProbeSent = false
}
|
||||
|
||||
// BecomeProbe transitions into StateProbe. Next is reset to Match+1 or,
|
||||
// optionally and if larger, the index of the pending snapshot.
|
||||
func (pr *Progress) BecomeProbe() {
|
||||
// If the original state is StateSnapshot, progress knows that
|
||||
// the pending snapshot has been sent to this peer successfully, then
|
||||
// probes from pendingSnapshot + 1.
|
||||
if pr.State == StateSnapshot {
|
||||
pendingSnapshot := pr.PendingSnapshot
|
||||
pr.ResetState(StateProbe)
|
||||
pr.Next = max(pr.Match+1, pendingSnapshot+1)
|
||||
} else {
|
||||
pr.ResetState(StateProbe)
|
||||
pr.Next = pr.Match + 1
|
||||
}
|
||||
}
|
||||
|
||||
// BecomeReplicate transitions into StateReplicate, resetting Next to Match+1.
// ProbeSent, PendingSnapshot and Inflights are cleared by ResetState.
func (pr *Progress) BecomeReplicate() {
	pr.ResetState(StateReplicate)
	pr.Next = pr.Match + 1
}
|
||||
|
||||
// BecomeSnapshot moves the Progress to StateSnapshot with the specified pending
// snapshot index.
func (pr *Progress) BecomeSnapshot(snapshoti uint64) {
	pr.ResetState(StateSnapshot)
	// Must be set after ResetState, which zeroes PendingSnapshot.
	pr.PendingSnapshot = snapshoti
}
|
||||
|
||||
// MaybeUpdate is called when an MsgAppResp arrives from the follower, with the
|
||||
// index acked by it. The method returns false if the given n index comes from
|
||||
// an outdated message. Otherwise it updates the progress and returns true.
|
||||
func (pr *Progress) MaybeUpdate(n uint64) bool {
|
||||
var updated bool
|
||||
if pr.Match < n {
|
||||
pr.Match = n
|
||||
updated = true
|
||||
pr.ProbeAcked()
|
||||
}
|
||||
if pr.Next < n+1 {
|
||||
pr.Next = n + 1
|
||||
}
|
||||
return updated
|
||||
}
|
||||
|
||||
// OptimisticUpdate signals that appends all the way up to and including index n
// are in-flight. As a result, Next is set to n+1 unconditionally; Match is
// not changed.
func (pr *Progress) OptimisticUpdate(n uint64) { pr.Next = n + 1 }
|
||||
|
||||
// MaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The
|
||||
// arguments are the index the follower rejected to append to its log, and its
|
||||
// last index.
|
||||
//
|
||||
// Rejections can happen spuriously as messages are sent out of order or
|
||||
// duplicated. In such cases, the rejection pertains to an index that the
|
||||
// Progress already knows were previously acknowledged, and false is returned
|
||||
// without changing the Progress.
|
||||
//
|
||||
// If the rejection is genuine, Next is lowered sensibly, and the Progress is
|
||||
// cleared for sending log entries.
|
||||
func (pr *Progress) MaybeDecrTo(rejected, last uint64) bool {
|
||||
if pr.State == StateReplicate {
|
||||
// The rejection must be stale if the progress has matched and "rejected"
|
||||
// is smaller than "match".
|
||||
if rejected <= pr.Match {
|
||||
return false
|
||||
}
|
||||
// Directly decrease next to match + 1.
|
||||
//
|
||||
// TODO(tbg): why not use last if it's larger?
|
||||
pr.Next = pr.Match + 1
|
||||
return true
|
||||
}
|
||||
|
||||
// The rejection must be stale if "rejected" does not match next - 1. This
|
||||
// is because non-replicating followers are probed one entry at a time.
|
||||
if pr.Next-1 != rejected {
|
||||
return false
|
||||
}
|
||||
|
||||
if pr.Next = min(rejected, last+1); pr.Next < 1 {
|
||||
pr.Next = 1
|
||||
}
|
||||
pr.ProbeSent = false
|
||||
return true
|
||||
}
|
||||
|
||||
// IsPaused returns whether sending log entries to this node has been throttled.
|
||||
// This is done when a node has rejected recent MsgApps, is currently waiting
|
||||
// for a snapshot, or has reached the MaxInflightMsgs limit. In normal
|
||||
// operation, this is false. A throttled node will be contacted less frequently
|
||||
// until it has reached a state in which it's able to accept a steady stream of
|
||||
// log entries again.
|
||||
func (pr *Progress) IsPaused() bool {
|
||||
switch pr.State {
|
||||
case StateProbe:
|
||||
return pr.ProbeSent
|
||||
case StateReplicate:
|
||||
return pr.Inflights.Full()
|
||||
case StateSnapshot:
|
||||
return true
|
||||
default:
|
||||
panic("unexpected state")
|
||||
}
|
||||
}
|
||||
|
||||
func (pr *Progress) String() string {
|
||||
var buf strings.Builder
|
||||
fmt.Fprintf(&buf, "%s match=%d next=%d", pr.State, pr.Match, pr.Next)
|
||||
if pr.IsLearner {
|
||||
fmt.Fprint(&buf, " learner")
|
||||
}
|
||||
if pr.IsPaused() {
|
||||
fmt.Fprint(&buf, " paused")
|
||||
}
|
||||
if pr.PendingSnapshot > 0 {
|
||||
fmt.Fprintf(&buf, " pendingSnap=%d", pr.PendingSnapshot)
|
||||
}
|
||||
if !pr.RecentActive {
|
||||
fmt.Fprintf(&buf, " inactive")
|
||||
}
|
||||
if n := pr.Inflights.Count(); n > 0 {
|
||||
fmt.Fprintf(&buf, " inflight=%d", n)
|
||||
if pr.Inflights.Full() {
|
||||
fmt.Fprint(&buf, "[full]")
|
||||
}
|
||||
}
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// ProgressMap is a map of *Progress.
|
||||
type ProgressMap map[uint64]*Progress
|
||||
|
||||
// String prints the ProgressMap in sorted key order, one Progress per line.
|
||||
func (m ProgressMap) String() string {
|
||||
ids := make([]uint64, 0, len(m))
|
||||
for k := range m {
|
||||
ids = append(ids, k)
|
||||
}
|
||||
sort.Slice(ids, func(i, j int) bool {
|
||||
return ids[i] < ids[j]
|
||||
})
|
||||
var buf strings.Builder
|
||||
for _, id := range ids {
|
||||
fmt.Fprintf(&buf, "%d: %s\n", id, m[id])
|
||||
}
|
||||
return buf.String()
|
||||
}
|
42
vendor/go.etcd.io/etcd/raft/tracker/state.go
generated
vendored
Normal file
42
vendor/go.etcd.io/etcd/raft/tracker/state.go
generated
vendored
Normal file
@ -0,0 +1,42 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package tracker
|
||||
|
||||
// StateType is the state of a tracked follower.
type StateType uint64

const (
	// StateProbe indicates a follower whose last index isn't known. Such a
	// follower is "probed" (i.e. an append sent periodically) to narrow down
	// its last index. In the ideal (and common) case, only one round of probing
	// is necessary as the follower will react with a hint. Followers that are
	// probed over extended periods of time are often offline.
	StateProbe StateType = iota
	// StateReplicate is the steady state in which a follower eagerly receives
	// log entries to append to its log.
	StateReplicate
	// StateSnapshot indicates a follower that needs log entries not available
	// from the leader's Raft log. Such a follower needs a full snapshot to
	// return to StateReplicate.
	StateSnapshot
)

// prstmap holds the printable name of every StateType, indexed by its value.
var prstmap = [...]string{
	StateProbe:     "StateProbe",
	StateReplicate: "StateReplicate",
	StateSnapshot:  "StateSnapshot",
}

// String returns the name of the state. A value outside the declared
// constants indexes past the end of prstmap and panics.
func (st StateType) String() string { return prstmap[st] }
|
288
vendor/go.etcd.io/etcd/raft/tracker/tracker.go
generated
vendored
Normal file
288
vendor/go.etcd.io/etcd/raft/tracker/tracker.go
generated
vendored
Normal file
@ -0,0 +1,288 @@
|
||||
// Copyright 2019 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package tracker
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"go.etcd.io/etcd/raft/quorum"
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
)
|
||||
|
||||
// Config reflects the configuration tracked in a ProgressTracker.
type Config struct {
	// Voters holds the voting members. Voters[0] is reported as the current
	// (incoming) voter set and Voters[1] as the outgoing voter set by
	// ProgressTracker.ConfState; Voters[1] is empty unless the configuration
	// is joint.
	Voters quorum.JointConfig
	// AutoLeave is true if the configuration is joint and a transition to the
	// incoming configuration should be carried out automatically by Raft when
	// this is possible. If false, the configuration will be joint until the
	// application initiates the transition manually.
	AutoLeave bool
	// Learners is a set of IDs corresponding to the learners active in the
	// current configuration.
	//
	// Invariant: Learners and Voters do not intersect, i.e. if a peer is in
	// either half of the joint config, it can't be a learner; if it is a
	// learner it can't be in either half of the joint config. This invariant
	// simplifies the implementation since it allows peers to have clarity about
	// their current role without taking into account joint consensus.
	Learners map[uint64]struct{}
	// When we turn a voter into a learner during a joint consensus transition,
	// we cannot add the learner directly when entering the joint state. This is
	// because this would violate the invariant that the intersection of
	// voters and learners is empty. For example, assume a Voter is removed and
	// immediately re-added as a learner (or in other words, it is demoted):
	//
	// Initially, the configuration will be
	//
	//   voters:   {1 2 3}
	//   learners: {}
	//
	// and we want to demote 3. Entering the joint configuration, we naively get
	//
	//   voters:   {1 2} & {1 2 3}
	//   learners: {3}
	//
	// but this violates the invariant (3 is both voter and learner). Instead,
	// we get
	//
	//   voters:   {1 2} & {1 2 3}
	//   learners: {}
	//   next_learners: {3}
	//
	// Where 3 is now still purely a voter, but we are remembering the intention
	// to make it a learner upon transitioning into the final configuration:
	//
	//   voters:   {1 2}
	//   learners: {3}
	//   next_learners: {}
	//
	// Note that next_learners is not used while adding a learner that is not
	// also a voter in the joint config. In this case, the learner is added
	// right away when entering the joint configuration, so that it is caught up
	// as soon as possible.
	LearnersNext map[uint64]struct{}
}
|
||||
|
||||
func (c Config) String() string {
|
||||
var buf strings.Builder
|
||||
fmt.Fprintf(&buf, "voters=%s", c.Voters)
|
||||
if c.Learners != nil {
|
||||
fmt.Fprintf(&buf, " learners=%s", quorum.MajorityConfig(c.Learners).String())
|
||||
}
|
||||
if c.LearnersNext != nil {
|
||||
fmt.Fprintf(&buf, " learners_next=%s", quorum.MajorityConfig(c.LearnersNext).String())
|
||||
}
|
||||
if c.AutoLeave {
|
||||
fmt.Fprintf(&buf, " autoleave")
|
||||
}
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// Clone returns a copy of the Config that shares no memory with the original.
|
||||
func (c *Config) Clone() Config {
|
||||
clone := func(m map[uint64]struct{}) map[uint64]struct{} {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
mm := make(map[uint64]struct{}, len(m))
|
||||
for k := range m {
|
||||
mm[k] = struct{}{}
|
||||
}
|
||||
return mm
|
||||
}
|
||||
return Config{
|
||||
Voters: quorum.JointConfig{clone(c.Voters[0]), clone(c.Voters[1])},
|
||||
Learners: clone(c.Learners),
|
||||
LearnersNext: clone(c.LearnersNext),
|
||||
}
|
||||
}
|
||||
|
||||
// ProgressTracker tracks the currently active configuration and the information
// known about the nodes and learners in it. In particular, it tracks the match
// index for each peer which in turn allows reasoning about the committed index.
type ProgressTracker struct {
	// Config is the active configuration (voters, learners, AutoLeave).
	Config

	// Progress maps a peer ID to the Progress tracked for it.
	Progress ProgressMap

	// Votes is keyed by peer ID.
	// NOTE(review): presumably records granted/rejected votes during an
	// election — confirm against the callers.
	Votes map[uint64]bool

	// MaxInflight is the value passed to MakeProgressTracker.
	// NOTE(review): presumably the Inflights capacity given to each Progress —
	// confirm against the callers.
	MaxInflight int
}
|
||||
|
||||
// MakeProgressTracker initializes a ProgressTracker.
|
||||
func MakeProgressTracker(maxInflight int) ProgressTracker {
|
||||
p := ProgressTracker{
|
||||
MaxInflight: maxInflight,
|
||||
Config: Config{
|
||||
Voters: quorum.JointConfig{
|
||||
quorum.MajorityConfig{},
|
||||
nil, // only populated when used
|
||||
},
|
||||
Learners: nil, // only populated when used
|
||||
LearnersNext: nil, // only populated when used
|
||||
},
|
||||
Votes: map[uint64]bool{},
|
||||
Progress: map[uint64]*Progress{},
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
// ConfState returns a ConfState representing the active configuration.
|
||||
func (p *ProgressTracker) ConfState() pb.ConfState {
|
||||
return pb.ConfState{
|
||||
Voters: p.Voters[0].Slice(),
|
||||
VotersOutgoing: p.Voters[1].Slice(),
|
||||
Learners: quorum.MajorityConfig(p.Learners).Slice(),
|
||||
LearnersNext: quorum.MajorityConfig(p.LearnersNext).Slice(),
|
||||
AutoLeave: p.AutoLeave,
|
||||
}
|
||||
}
|
||||
|
||||
// IsSingleton returns true if (and only if) there is only one voting member
|
||||
// (i.e. the leader) in the current configuration.
|
||||
func (p *ProgressTracker) IsSingleton() bool {
|
||||
return len(p.Voters[0]) == 1 && len(p.Voters[1]) == 0
|
||||
}
|
||||
|
||||
type matchAckIndexer map[uint64]*Progress
|
||||
|
||||
var _ quorum.AckedIndexer = matchAckIndexer(nil)
|
||||
|
||||
// AckedIndex implements IndexLookuper.
|
||||
func (l matchAckIndexer) AckedIndex(id uint64) (quorum.Index, bool) {
|
||||
pr, ok := l[id]
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
return quorum.Index(pr.Match), true
|
||||
}
|
||||
|
||||
// Committed returns the largest log index known to be committed based on what
|
||||
// the voting members of the group have acknowledged.
|
||||
func (p *ProgressTracker) Committed() uint64 {
|
||||
return uint64(p.Voters.CommittedIndex(matchAckIndexer(p.Progress)))
|
||||
}
|
||||
|
||||
// insertionSort sorts sl in place in ascending order using insertion sort.
// Nil and single-element slices are left untouched.
func insertionSort(sl []uint64) {
	for i := 1; i < len(sl); i++ {
		// Lift the current element out and shift every larger predecessor one
		// slot to the right, then drop the element into the gap.
		v := sl[i]
		j := i
		for ; j > 0 && v < sl[j-1]; j-- {
			sl[j] = sl[j-1]
		}
		sl[j] = v
	}
}
|
||||
|
||||
// Visit invokes the supplied closure for all tracked progresses in stable order.
|
||||
func (p *ProgressTracker) Visit(f func(id uint64, pr *Progress)) {
|
||||
n := len(p.Progress)
|
||||
// We need to sort the IDs and don't want to allocate since this is hot code.
|
||||
// The optimization here mirrors that in `(MajorityConfig).CommittedIndex`,
|
||||
// see there for details.
|
||||
var sl [7]uint64
|
||||
ids := sl[:]
|
||||
if len(sl) >= n {
|
||||
ids = sl[:n]
|
||||
} else {
|
||||
ids = make([]uint64, n)
|
||||
}
|
||||
for id := range p.Progress {
|
||||
n--
|
||||
ids[n] = id
|
||||
}
|
||||
insertionSort(ids)
|
||||
for _, id := range ids {
|
||||
f(id, p.Progress[id])
|
||||
}
|
||||
}
|
||||
|
||||
// QuorumActive returns true if the quorum is active from the view of the local
|
||||
// raft state machine. Otherwise, it returns false.
|
||||
func (p *ProgressTracker) QuorumActive() bool {
|
||||
votes := map[uint64]bool{}
|
||||
p.Visit(func(id uint64, pr *Progress) {
|
||||
if pr.IsLearner {
|
||||
return
|
||||
}
|
||||
votes[id] = pr.RecentActive
|
||||
})
|
||||
|
||||
return p.Voters.VoteResult(votes) == quorum.VoteWon
|
||||
}
|
||||
|
||||
// VoterNodes returns a sorted slice of voters.
|
||||
func (p *ProgressTracker) VoterNodes() []uint64 {
|
||||
m := p.Voters.IDs()
|
||||
nodes := make([]uint64, 0, len(m))
|
||||
for id := range m {
|
||||
nodes = append(nodes, id)
|
||||
}
|
||||
sort.Slice(nodes, func(i, j int) bool { return nodes[i] < nodes[j] })
|
||||
return nodes
|
||||
}
|
||||
|
||||
// LearnerNodes returns a sorted slice of learners.
|
||||
func (p *ProgressTracker) LearnerNodes() []uint64 {
|
||||
if len(p.Learners) == 0 {
|
||||
return nil
|
||||
}
|
||||
nodes := make([]uint64, 0, len(p.Learners))
|
||||
for id := range p.Learners {
|
||||
nodes = append(nodes, id)
|
||||
}
|
||||
sort.Slice(nodes, func(i, j int) bool { return nodes[i] < nodes[j] })
|
||||
return nodes
|
||||
}
|
||||
|
||||
// ResetVotes prepares for a new round of vote counting via recordVote.
|
||||
func (p *ProgressTracker) ResetVotes() {
|
||||
p.Votes = map[uint64]bool{}
|
||||
}
|
||||
|
||||
// RecordVote records that the node with the given id voted for this Raft
|
||||
// instance if v == true (and declined it otherwise).
|
||||
func (p *ProgressTracker) RecordVote(id uint64, v bool) {
|
||||
_, ok := p.Votes[id]
|
||||
if !ok {
|
||||
p.Votes[id] = v
|
||||
}
|
||||
}
|
||||
|
||||
// TallyVotes returns the number of granted and rejected Votes, and whether the
|
||||
// election outcome is known.
|
||||
func (p *ProgressTracker) TallyVotes() (granted int, rejected int, _ quorum.VoteResult) {
|
||||
// Make sure to populate granted/rejected correctly even if the Votes slice
|
||||
// contains members no longer part of the configuration. This doesn't really
|
||||
// matter in the way the numbers are used (they're informational), but might
|
||||
// as well get it right.
|
||||
for id, pr := range p.Progress {
|
||||
if pr.IsLearner {
|
||||
continue
|
||||
}
|
||||
v, voted := p.Votes[id]
|
||||
if !voted {
|
||||
continue
|
||||
}
|
||||
if v {
|
||||
granted++
|
||||
} else {
|
||||
rejected++
|
||||
}
|
||||
}
|
||||
result := p.Voters.VoteResult(p.Votes)
|
||||
return granted, rejected, result
|
||||
}
|
233
vendor/go.etcd.io/etcd/raft/util.go
generated
vendored
Normal file
233
vendor/go.etcd.io/etcd/raft/util.go
generated
vendored
Normal file
@ -0,0 +1,233 @@
|
||||
// Copyright 2015 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package raft
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
pb "go.etcd.io/etcd/raft/raftpb"
|
||||
)
|
||||
|
||||
func (st StateType) MarshalJSON() ([]byte, error) {
|
||||
return []byte(fmt.Sprintf("%q", st.String())), nil
|
||||
}
|
||||
|
||||
// min returns the smaller of a and b.
func min(a, b uint64) uint64 {
	if b < a {
		return b
	}
	return a
}
|
||||
|
||||
// max returns the larger of a and b.
func max(a, b uint64) uint64 {
	if a <= b {
		return b
	}
	return a
}
|
||||
|
||||
func IsLocalMsg(msgt pb.MessageType) bool {
|
||||
return msgt == pb.MsgHup || msgt == pb.MsgBeat || msgt == pb.MsgUnreachable ||
|
||||
msgt == pb.MsgSnapStatus || msgt == pb.MsgCheckQuorum
|
||||
}
|
||||
|
||||
func IsResponseMsg(msgt pb.MessageType) bool {
|
||||
return msgt == pb.MsgAppResp || msgt == pb.MsgVoteResp || msgt == pb.MsgHeartbeatResp || msgt == pb.MsgUnreachable || msgt == pb.MsgPreVoteResp
|
||||
}
|
||||
|
||||
// voteResponseType maps vote and prevote message types to their corresponding responses.
|
||||
func voteRespMsgType(msgt pb.MessageType) pb.MessageType {
|
||||
switch msgt {
|
||||
case pb.MsgVote:
|
||||
return pb.MsgVoteResp
|
||||
case pb.MsgPreVote:
|
||||
return pb.MsgPreVoteResp
|
||||
default:
|
||||
panic(fmt.Sprintf("not a vote message: %s", msgt))
|
||||
}
|
||||
}
|
||||
|
||||
func DescribeHardState(hs pb.HardState) string {
|
||||
var buf strings.Builder
|
||||
fmt.Fprintf(&buf, "Term:%d", hs.Term)
|
||||
if hs.Vote != 0 {
|
||||
fmt.Fprintf(&buf, " Vote:%d", hs.Vote)
|
||||
}
|
||||
fmt.Fprintf(&buf, " Commit:%d", hs.Commit)
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func DescribeSoftState(ss SoftState) string {
|
||||
return fmt.Sprintf("Lead:%d State:%s", ss.Lead, ss.RaftState)
|
||||
}
|
||||
|
||||
func DescribeConfState(state pb.ConfState) string {
|
||||
return fmt.Sprintf(
|
||||
"Voters:%v VotersOutgoing:%v Learners:%v LearnersNext:%v AutoLeave:%v",
|
||||
state.Voters, state.VotersOutgoing, state.Learners, state.LearnersNext, state.AutoLeave,
|
||||
)
|
||||
}
|
||||
|
||||
func DescribeSnapshot(snap pb.Snapshot) string {
|
||||
m := snap.Metadata
|
||||
return fmt.Sprintf("Index:%d Term:%d ConfState:%s", m.Index, m.Term, DescribeConfState(m.ConfState))
|
||||
}
|
||||
|
||||
func DescribeReady(rd Ready, f EntryFormatter) string {
|
||||
var buf strings.Builder
|
||||
if rd.SoftState != nil {
|
||||
fmt.Fprint(&buf, DescribeSoftState(*rd.SoftState))
|
||||
buf.WriteByte('\n')
|
||||
}
|
||||
if !IsEmptyHardState(rd.HardState) {
|
||||
fmt.Fprintf(&buf, "HardState %s", DescribeHardState(rd.HardState))
|
||||
buf.WriteByte('\n')
|
||||
}
|
||||
if len(rd.ReadStates) > 0 {
|
||||
fmt.Fprintf(&buf, "ReadStates %v\n", rd.ReadStates)
|
||||
}
|
||||
if len(rd.Entries) > 0 {
|
||||
buf.WriteString("Entries:\n")
|
||||
fmt.Fprint(&buf, DescribeEntries(rd.Entries, f))
|
||||
}
|
||||
if !IsEmptySnap(rd.Snapshot) {
|
||||
fmt.Fprintf(&buf, "Snapshot %s\n", DescribeSnapshot(rd.Snapshot))
|
||||
}
|
||||
if len(rd.CommittedEntries) > 0 {
|
||||
buf.WriteString("CommittedEntries:\n")
|
||||
fmt.Fprint(&buf, DescribeEntries(rd.CommittedEntries, f))
|
||||
}
|
||||
if len(rd.Messages) > 0 {
|
||||
buf.WriteString("Messages:\n")
|
||||
for _, msg := range rd.Messages {
|
||||
fmt.Fprint(&buf, DescribeMessage(msg, f))
|
||||
buf.WriteByte('\n')
|
||||
}
|
||||
}
|
||||
if buf.Len() > 0 {
|
||||
return fmt.Sprintf("Ready MustSync=%t:\n%s", rd.MustSync, buf.String())
|
||||
}
|
||||
return "<empty Ready>"
|
||||
}
|
||||
|
||||
// EntryFormatter turns the raw data of an entry into a human-readable string.
// Applications can supply their own implementation; a nil EntryFormatter is
// valid and selects a default format.
type EntryFormatter func([]byte) string
|
||||
|
||||
// DescribeMessage returns a concise human-readable description of a
|
||||
// Message for debugging.
|
||||
func DescribeMessage(m pb.Message, f EntryFormatter) string {
|
||||
var buf bytes.Buffer
|
||||
fmt.Fprintf(&buf, "%x->%x %v Term:%d Log:%d/%d", m.From, m.To, m.Type, m.Term, m.LogTerm, m.Index)
|
||||
if m.Reject {
|
||||
fmt.Fprintf(&buf, " Rejected (Hint: %d)", m.RejectHint)
|
||||
}
|
||||
if m.Commit != 0 {
|
||||
fmt.Fprintf(&buf, " Commit:%d", m.Commit)
|
||||
}
|
||||
if len(m.Entries) > 0 {
|
||||
fmt.Fprintf(&buf, " Entries:[")
|
||||
for i, e := range m.Entries {
|
||||
if i != 0 {
|
||||
buf.WriteString(", ")
|
||||
}
|
||||
buf.WriteString(DescribeEntry(e, f))
|
||||
}
|
||||
fmt.Fprintf(&buf, "]")
|
||||
}
|
||||
if !IsEmptySnap(m.Snapshot) {
|
||||
fmt.Fprintf(&buf, " Snapshot: %s", DescribeSnapshot(m.Snapshot))
|
||||
}
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// PayloadSize is the size of the payload of this Entry. Notably, it does not
|
||||
// depend on its Index or Term.
|
||||
func PayloadSize(e pb.Entry) int {
|
||||
return len(e.Data)
|
||||
}
|
||||
|
||||
// DescribeEntry returns a concise human-readable description of an
|
||||
// Entry for debugging.
|
||||
func DescribeEntry(e pb.Entry, f EntryFormatter) string {
|
||||
if f == nil {
|
||||
f = func(data []byte) string { return fmt.Sprintf("%q", data) }
|
||||
}
|
||||
|
||||
formatConfChange := func(cc pb.ConfChangeI) string {
|
||||
// TODO(tbg): give the EntryFormatter a type argument so that it gets
|
||||
// a chance to expose the Context.
|
||||
return pb.ConfChangesToString(cc.AsV2().Changes)
|
||||
}
|
||||
|
||||
var formatted string
|
||||
switch e.Type {
|
||||
case pb.EntryNormal:
|
||||
formatted = f(e.Data)
|
||||
case pb.EntryConfChange:
|
||||
var cc pb.ConfChange
|
||||
if err := cc.Unmarshal(e.Data); err != nil {
|
||||
formatted = err.Error()
|
||||
} else {
|
||||
formatted = formatConfChange(cc)
|
||||
}
|
||||
case pb.EntryConfChangeV2:
|
||||
var cc pb.ConfChangeV2
|
||||
if err := cc.Unmarshal(e.Data); err != nil {
|
||||
formatted = err.Error()
|
||||
} else {
|
||||
formatted = formatConfChange(cc)
|
||||
}
|
||||
}
|
||||
if formatted != "" {
|
||||
formatted = " " + formatted
|
||||
}
|
||||
return fmt.Sprintf("%d/%d %s%s", e.Term, e.Index, e.Type, formatted)
|
||||
}
|
||||
|
||||
// DescribeEntries calls DescribeEntry for each Entry, adding a newline to
|
||||
// each.
|
||||
func DescribeEntries(ents []pb.Entry, f EntryFormatter) string {
|
||||
var buf bytes.Buffer
|
||||
for _, e := range ents {
|
||||
_, _ = buf.WriteString(DescribeEntry(e, f) + "\n")
|
||||
}
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func limitSize(ents []pb.Entry, maxSize uint64) []pb.Entry {
|
||||
if len(ents) == 0 {
|
||||
return ents
|
||||
}
|
||||
size := ents[0].Size()
|
||||
var limit int
|
||||
for limit = 1; limit < len(ents); limit++ {
|
||||
size += ents[limit].Size()
|
||||
if uint64(size) > maxSize {
|
||||
break
|
||||
}
|
||||
}
|
||||
return ents[:limit]
|
||||
}
|
||||
|
||||
func assertConfStatesEquivalent(l Logger, cs1, cs2 pb.ConfState) {
|
||||
err := cs1.Equivalent(cs2)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
l.Panic(err)
|
||||
}
|
Reference in New Issue
Block a user