Update to kube v1.17

Signed-off-by: Humble Chirammal <hchiramm@redhat.com>
This commit is contained in:
Humble Chirammal
2020-01-14 16:08:55 +05:30
committed by mergify[bot]
parent 327fcd1b1b
commit 3af1e26d7c
1710 changed files with 289562 additions and 168638 deletions

80
vendor/go.etcd.io/etcd/raft/bootstrap.go generated vendored Normal file
View File

@ -0,0 +1,80 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
pb "go.etcd.io/etcd/raft/raftpb"
)
// Bootstrap initializes the RawNode for first use by appending configuration
// changes for the supplied peers. This method returns an error if the Storage
// is nonempty.
//
// It is recommended that instead of calling this method, applications bootstrap
// their state manually by setting up a Storage that has a first index > 1 and
// which stores the desired ConfState as its InitialState.
func (rn *RawNode) Bootstrap(peers []Peer) error {
if len(peers) == 0 {
return errors.New("must provide at least one peer to Bootstrap")
}
lastIndex, err := rn.raft.raftLog.storage.LastIndex()
if err != nil {
return err
}
if lastIndex != 0 {
return errors.New("can't bootstrap a nonempty Storage")
}
// We've faked out initial entries above, but nothing has been
// persisted. Start with an empty HardState (thus the first Ready will
// emit a HardState update for the app to persist).
rn.prevHardSt = emptyState
// TODO(tbg): remove StartNode and give the application the right tools to
// bootstrap the initial membership in a cleaner way.
rn.raft.becomeFollower(1, None)
ents := make([]pb.Entry, len(peers))
for i, peer := range peers {
cc := pb.ConfChange{Type: pb.ConfChangeAddNode, NodeID: peer.ID, Context: peer.Context}
data, err := cc.Marshal()
if err != nil {
return err
}
ents[i] = pb.Entry{Type: pb.EntryConfChange, Term: 1, Index: uint64(i + 1), Data: data}
}
rn.raft.raftLog.append(ents...)
// Now apply them, mainly so that the application can call Campaign
// immediately after StartNode in tests. Note that these nodes will
// be added to raft twice: here and when the application's Ready
// loop calls ApplyConfChange. The calls to addNode must come after
// all calls to raftLog.append so progress.next is set after these
// bootstrapping entries (it is an error if we try to append these
// entries since they have already been committed).
// We do not set raftLog.applied so the application will be able
// to observe all conf changes via Ready.CommittedEntries.
//
// TODO(bdarnell): These entries are still unstable; do we need to preserve
// the invariant that committed < unstable?
rn.raft.raftLog.committed = uint64(len(ents))
for _, peer := range peers {
rn.raft.applyConfChange(pb.ConfChange{NodeID: peer.ID, Type: pb.ConfChangeAddNode}.AsV2())
}
return nil
}

425
vendor/go.etcd.io/etcd/raft/confchange/confchange.go generated vendored Normal file
View File

@ -0,0 +1,425 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package confchange
import (
"errors"
"fmt"
"strings"
"go.etcd.io/etcd/raft/quorum"
pb "go.etcd.io/etcd/raft/raftpb"
"go.etcd.io/etcd/raft/tracker"
)
// Changer facilitates configuration changes. It exposes methods to handle
// simple and joint consensus while performing the proper validation that allows
// refusing invalid configuration changes before they affect the active
// configuration.
type Changer struct {
Tracker tracker.ProgressTracker
LastIndex uint64
}
// EnterJoint verifies that the outgoing (=right) majority config of the joint
// config is empty and initializes it with a copy of the incoming (=left)
// majority config. That is, it transitions from
//
// (1 2 3)&&()
// to
// (1 2 3)&&(1 2 3).
//
// The supplied changes are then applied to the incoming majority config,
// resulting in a joint configuration that in terms of the Raft thesis[1]
// (Section 4.3) corresponds to `C_{new,old}`.
//
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
cfg, prs, err := c.checkAndCopy()
if err != nil {
return c.err(err)
}
if joint(cfg) {
err := errors.New("config is already joint")
return c.err(err)
}
if len(incoming(cfg.Voters)) == 0 {
// We allow adding nodes to an empty config for convenience (testing and
// bootstrap), but you can't enter a joint state.
err := errors.New("can't make a zero-voter config joint")
return c.err(err)
}
// Clear the outgoing config.
*outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{}
// Copy incoming to outgoing.
for id := range incoming(cfg.Voters) {
outgoing(cfg.Voters)[id] = struct{}{}
}
if err := c.apply(&cfg, prs, ccs...); err != nil {
return c.err(err)
}
cfg.AutoLeave = autoLeave
return checkAndReturn(cfg, prs)
}
// LeaveJoint transitions out of a joint configuration. It is an error to call
// this method if the configuration is not joint, i.e. if the outgoing majority
// config Voters[1] is empty.
//
// The outgoing majority config of the joint configuration will be removed,
// that is, the incoming config is promoted as the sole decision maker. In the
// notation of the Raft thesis[1] (Section 4.3), this method transitions from
// `C_{new,old}` into `C_new`.
//
// At the same time, any staged learners (LearnersNext) the addition of which
// was held back by an overlapping voter in the former outgoing config will be
// inserted into Learners.
//
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) {
cfg, prs, err := c.checkAndCopy()
if err != nil {
return c.err(err)
}
if !joint(cfg) {
err := errors.New("can't leave a non-joint config")
return c.err(err)
}
if len(outgoing(cfg.Voters)) == 0 {
err := fmt.Errorf("configuration is not joint: %v", cfg)
return c.err(err)
}
for id := range cfg.LearnersNext {
nilAwareAdd(&cfg.Learners, id)
prs[id].IsLearner = true
}
cfg.LearnersNext = nil
for id := range outgoing(cfg.Voters) {
_, isVoter := incoming(cfg.Voters)[id]
_, isLearner := cfg.Learners[id]
if !isVoter && !isLearner {
delete(prs, id)
}
}
*outgoingPtr(&cfg.Voters) = nil
cfg.AutoLeave = false
return checkAndReturn(cfg, prs)
}
// Simple carries out a series of configuration changes that (in aggregate)
// mutates the incoming majority config Voters[0] by at most one. This method
// will return an error if that is not the case, if the resulting quorum is
// zero, or if the configuration is in a joint state (i.e. if there is an
// outgoing configuration).
func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
cfg, prs, err := c.checkAndCopy()
if err != nil {
return c.err(err)
}
if joint(cfg) {
err := errors.New("can't apply simple config change in joint config")
return c.err(err)
}
if err := c.apply(&cfg, prs, ccs...); err != nil {
return c.err(err)
}
if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 {
return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config")
}
if err := checkInvariants(cfg, prs); err != nil {
return tracker.Config{}, tracker.ProgressMap{}, nil
}
return checkAndReturn(cfg, prs)
}
// apply a change to the configuration. By convention, changes to voters are
// always made to the incoming majority config Voters[0]. Voters[1] is either
// empty or preserves the outgoing majority configuration while in a joint state.
func (c Changer) apply(cfg *tracker.Config, prs tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error {
for _, cc := range ccs {
if cc.NodeID == 0 {
// etcd replaces the NodeID with zero if it decides (downstream of
// raft) to not apply a change, so we have to have explicit code
// here to ignore these.
continue
}
switch cc.Type {
case pb.ConfChangeAddNode:
c.makeVoter(cfg, prs, cc.NodeID)
case pb.ConfChangeAddLearnerNode:
c.makeLearner(cfg, prs, cc.NodeID)
case pb.ConfChangeRemoveNode:
c.remove(cfg, prs, cc.NodeID)
case pb.ConfChangeUpdateNode:
default:
return fmt.Errorf("unexpected conf type %d", cc.Type)
}
}
if len(incoming(cfg.Voters)) == 0 {
return errors.New("removed all voters")
}
return nil
}
// makeVoter adds or promotes the given ID to be a voter in the incoming
// majority config.
func (c Changer) makeVoter(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
pr := prs[id]
if pr == nil {
c.initProgress(cfg, prs, id, false /* isLearner */)
return
}
pr.IsLearner = false
nilAwareDelete(&cfg.Learners, id)
nilAwareDelete(&cfg.LearnersNext, id)
incoming(cfg.Voters)[id] = struct{}{}
return
}
// makeLearner makes the given ID a learner or stages it to be a learner once
// an active joint configuration is exited.
//
// The former happens when the peer is not a part of the outgoing config, in
// which case we either add a new learner or demote a voter in the incoming
// config.
//
// The latter case occurs when the configuration is joint and the peer is a
// voter in the outgoing config. In that case, we do not want to add the peer
// as a learner because then we'd have to track a peer as a voter and learner
// simultaneously. Instead, we add the learner to LearnersNext, so that it will
// be added to Learners the moment the outgoing config is removed by
// LeaveJoint().
func (c Changer) makeLearner(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
pr := prs[id]
if pr == nil {
c.initProgress(cfg, prs, id, true /* isLearner */)
return
}
if pr.IsLearner {
return
}
// Remove any existing voter in the incoming config...
c.remove(cfg, prs, id)
// ... but save the Progress.
prs[id] = pr
// Use LearnersNext if we can't add the learner to Learners directly, i.e.
// if the peer is still tracked as a voter in the outgoing config. It will
// be turned into a learner in LeaveJoint().
//
// Otherwise, add a regular learner right away.
if _, onRight := outgoing(cfg.Voters)[id]; onRight {
nilAwareAdd(&cfg.LearnersNext, id)
} else {
pr.IsLearner = true
nilAwareAdd(&cfg.Learners, id)
}
}
// remove this peer as a voter or learner from the incoming config.
func (c Changer) remove(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
if _, ok := prs[id]; !ok {
return
}
delete(incoming(cfg.Voters), id)
nilAwareDelete(&cfg.Learners, id)
nilAwareDelete(&cfg.LearnersNext, id)
// If the peer is still a voter in the outgoing config, keep the Progress.
if _, onRight := outgoing(cfg.Voters)[id]; !onRight {
delete(prs, id)
}
}
// initProgress initializes a new progress for the given node or learner.
func (c Changer) initProgress(cfg *tracker.Config, prs tracker.ProgressMap, id uint64, isLearner bool) {
if !isLearner {
incoming(cfg.Voters)[id] = struct{}{}
} else {
nilAwareAdd(&cfg.Learners, id)
}
prs[id] = &tracker.Progress{
// Initializing the Progress with the last index means that the follower
// can be probed (with the last index).
//
// TODO(tbg): seems awfully optimistic. Using the first index would be
// better. The general expectation here is that the follower has no log
// at all (and will thus likely need a snapshot), though the app may
// have applied a snapshot out of band before adding the replica (thus
// making the first index the better choice).
Next: c.LastIndex,
Match: 0,
Inflights: tracker.NewInflights(c.Tracker.MaxInflight),
IsLearner: isLearner,
// When a node is first added, we should mark it as recently active.
// Otherwise, CheckQuorum may cause us to step down if it is invoked
// before the added node has had a chance to communicate with us.
RecentActive: true,
}
}
// checkInvariants makes sure that the config and progress are compatible with
// each other. This is used to check both what the Changer is initialized with,
// as well as what it returns.
func checkInvariants(cfg tracker.Config, prs tracker.ProgressMap) error {
// NB: intentionally allow the empty config. In production we'll never see a
// non-empty config (we prevent it from being created) but we will need to
// be able to *create* an initial config, for example during bootstrap (or
// during tests). Instead of having to hand-code this, we allow
// transitioning from an empty config into any other legal and non-empty
// config.
for _, ids := range []map[uint64]struct{}{
cfg.Voters.IDs(),
cfg.Learners,
cfg.LearnersNext,
} {
for id := range ids {
if _, ok := prs[id]; !ok {
return fmt.Errorf("no progress for %d", id)
}
}
}
// Any staged learner was staged because it could not be directly added due
// to a conflicting voter in the outgoing config.
for id := range cfg.LearnersNext {
if _, ok := outgoing(cfg.Voters)[id]; !ok {
return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id)
}
if prs[id].IsLearner {
return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id)
}
}
// Conversely Learners and Voters doesn't intersect at all.
for id := range cfg.Learners {
if _, ok := outgoing(cfg.Voters)[id]; ok {
return fmt.Errorf("%d is in Learners and Voters[1]", id)
}
if _, ok := incoming(cfg.Voters)[id]; ok {
return fmt.Errorf("%d is in Learners and Voters[0]", id)
}
if !prs[id].IsLearner {
return fmt.Errorf("%d is in Learners, but is not marked as learner", id)
}
}
if !joint(cfg) {
// We enforce that empty maps are nil instead of zero.
if outgoing(cfg.Voters) != nil {
return fmt.Errorf("Voters[1] must be nil when not joint")
}
if cfg.LearnersNext != nil {
return fmt.Errorf("LearnersNext must be nil when not joint")
}
if cfg.AutoLeave {
return fmt.Errorf("AutoLeave must be false when not joint")
}
}
return nil
}
// checkAndCopy copies the tracker's config and progress map (deeply enough for
// the purposes of the Changer) and returns those copies. It returns an error
// if checkInvariants does.
func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) {
cfg := c.Tracker.Config.Clone()
prs := tracker.ProgressMap{}
for id, pr := range c.Tracker.Progress {
// A shallow copy is enough because we only mutate the Learner field.
ppr := *pr
prs[id] = &ppr
}
return checkAndReturn(cfg, prs)
}
// checkAndReturn calls checkInvariants on the input and returns either the
// resulting error or the input.
func checkAndReturn(cfg tracker.Config, prs tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) {
if err := checkInvariants(cfg, prs); err != nil {
return tracker.Config{}, tracker.ProgressMap{}, err
}
return cfg, prs, nil
}
// err returns zero values and an error.
func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) {
return tracker.Config{}, nil, err
}
// nilAwareAdd populates a map entry, creating the map if necessary.
func nilAwareAdd(m *map[uint64]struct{}, id uint64) {
if *m == nil {
*m = map[uint64]struct{}{}
}
(*m)[id] = struct{}{}
}
// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
func nilAwareDelete(m *map[uint64]struct{}, id uint64) {
if *m == nil {
return
}
delete(*m, id)
if len(*m) == 0 {
*m = nil
}
}
// symdiff returns the count of the symmetric difference between the sets of
// uint64s, i.e. len( (l - r) \union (r - l)).
func symdiff(l, r map[uint64]struct{}) int {
var n int
pairs := [][2]quorum.MajorityConfig{
{l, r}, // count elems in l but not in r
{r, l}, // count elems in r but not in l
}
for _, p := range pairs {
for id := range p[0] {
if _, ok := p[1][id]; !ok {
n++
}
}
}
return n
}
func joint(cfg tracker.Config) bool {
return len(outgoing(cfg.Voters)) > 0
}
func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] }
func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] }
func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] }
// Describe prints the type and NodeID of the configuration changes as a
// space-delimited string.
func Describe(ccs ...pb.ConfChangeSingle) string {
var buf strings.Builder
for _, cc := range ccs {
if buf.Len() > 0 {
buf.WriteByte(' ')
}
fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID)
}
return buf.String()
}

155
vendor/go.etcd.io/etcd/raft/confchange/restore.go generated vendored Normal file
View File

@ -0,0 +1,155 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package confchange
import (
pb "go.etcd.io/etcd/raft/raftpb"
"go.etcd.io/etcd/raft/tracker"
)
// toConfChangeSingle translates a conf state into 1) a slice of operations creating
// first the config that will become the outgoing one, and then the incoming one, and
// b) another slice that, when applied to the config resulted from 1), represents the
// ConfState.
func toConfChangeSingle(cs pb.ConfState) (out []pb.ConfChangeSingle, in []pb.ConfChangeSingle) {
// Example to follow along this code:
// voters=(1 2 3) learners=(5) outgoing=(1 2 4 6) learners_next=(4)
//
// This means that before entering the joint config, the configuration
// had voters (1 2 4) and perhaps some learners that are already gone.
// The new set of voters is (1 2 3), i.e. (1 2) were kept around, and (4 6)
// are no longer voters; however 4 is poised to become a learner upon leaving
// the joint state.
// We can't tell whether 5 was a learner before entering the joint config,
// but it doesn't matter (we'll pretend that it wasn't).
//
// The code below will construct
// outgoing = add 1; add 2; add 4; add 6
// incoming = remove 1; remove 2; remove 4; remove 6
// add 1; add 2; add 3;
// add-learner 5;
// add-learner 4;
//
// So, when starting with an empty config, after applying 'outgoing' we have
//
// quorum=(1 2 4 6)
//
// From which we enter a joint state via 'incoming'
//
// quorum=(1 2 3)&&(1 2 4 6) learners=(5) learners_next=(4)
//
// as desired.
for _, id := range cs.VotersOutgoing {
// If there are outgoing voters, first add them one by one so that the
// (non-joint) config has them all.
out = append(out, pb.ConfChangeSingle{
Type: pb.ConfChangeAddNode,
NodeID: id,
})
}
// We're done constructing the outgoing slice, now on to the incoming one
// (which will apply on top of the config created by the outgoing slice).
// First, we'll remove all of the outgoing voters.
for _, id := range cs.VotersOutgoing {
in = append(in, pb.ConfChangeSingle{
Type: pb.ConfChangeRemoveNode,
NodeID: id,
})
}
// Then we'll add the incoming voters and learners.
for _, id := range cs.Voters {
in = append(in, pb.ConfChangeSingle{
Type: pb.ConfChangeAddNode,
NodeID: id,
})
}
for _, id := range cs.Learners {
in = append(in, pb.ConfChangeSingle{
Type: pb.ConfChangeAddLearnerNode,
NodeID: id,
})
}
// Same for LearnersNext; these are nodes we want to be learners but which
// are currently voters in the outgoing config.
for _, id := range cs.LearnersNext {
in = append(in, pb.ConfChangeSingle{
Type: pb.ConfChangeAddLearnerNode,
NodeID: id,
})
}
return out, in
}
func chain(chg Changer, ops ...func(Changer) (tracker.Config, tracker.ProgressMap, error)) (tracker.Config, tracker.ProgressMap, error) {
for _, op := range ops {
cfg, prs, err := op(chg)
if err != nil {
return tracker.Config{}, nil, err
}
chg.Tracker.Config = cfg
chg.Tracker.Progress = prs
}
return chg.Tracker.Config, chg.Tracker.Progress, nil
}
// Restore takes a Changer (which must represent an empty configuration), and
// runs a sequence of changes enacting the configuration described in the
// ConfState.
//
// TODO(tbg) it's silly that this takes a Changer. Unravel this by making sure
// the Changer only needs a ProgressMap (not a whole Tracker) at which point
// this can just take LastIndex and MaxInflight directly instead and cook up
// the results from that alone.
func Restore(chg Changer, cs pb.ConfState) (tracker.Config, tracker.ProgressMap, error) {
outgoing, incoming := toConfChangeSingle(cs)
var ops []func(Changer) (tracker.Config, tracker.ProgressMap, error)
if len(outgoing) == 0 {
// No outgoing config, so just apply the incoming changes one by one.
for _, cc := range incoming {
cc := cc // loop-local copy
ops = append(ops, func(chg Changer) (tracker.Config, tracker.ProgressMap, error) {
return chg.Simple(cc)
})
}
} else {
// The ConfState describes a joint configuration.
//
// First, apply all of the changes of the outgoing config one by one, so
// that it temporarily becomes the incoming active config. For example,
// if the config is (1 2 3)&(2 3 4), this will establish (2 3 4)&().
for _, cc := range outgoing {
cc := cc // loop-local copy
ops = append(ops, func(chg Changer) (tracker.Config, tracker.ProgressMap, error) {
return chg.Simple(cc)
})
}
// Now enter the joint state, which rotates the above additions into the
// outgoing config, and adds the incoming config in. Continuing the
// example above, we'd get (1 2 3)&(2 3 4), i.e. the incoming operations
// would be removing 2,3,4 and then adding in 1,2,3 while transitioning
// into a joint state.
ops = append(ops, func(chg Changer) (tracker.Config, tracker.ProgressMap, error) {
return chg.EnterJoint(cs.AutoLeave, incoming...)
})
}
return chain(chg, ops...)
}

300
vendor/go.etcd.io/etcd/raft/doc.go generated vendored Normal file
View File

@ -0,0 +1,300 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Package raft sends and receives messages in the Protocol Buffer format
defined in the raftpb package.
Raft is a protocol with which a cluster of nodes can maintain a replicated state machine.
The state machine is kept in sync through the use of a replicated log.
For more details on Raft, see "In Search of an Understandable Consensus Algorithm"
(https://raft.github.io/raft.pdf) by Diego Ongaro and John Ousterhout.
A simple example application, _raftexample_, is also available to help illustrate
how to use this package in practice:
https://github.com/etcd-io/etcd/tree/master/contrib/raftexample
Usage
The primary object in raft is a Node. You either start a Node from scratch
using raft.StartNode or start a Node from some initial state using raft.RestartNode.
To start a node from scratch:
storage := raft.NewMemoryStorage()
c := &Config{
ID: 0x01,
ElectionTick: 10,
HeartbeatTick: 1,
Storage: storage,
MaxSizePerMsg: 4096,
MaxInflightMsgs: 256,
}
n := raft.StartNode(c, []raft.Peer{{ID: 0x02}, {ID: 0x03}})
To restart a node from previous state:
storage := raft.NewMemoryStorage()
// recover the in-memory storage from persistent
// snapshot, state and entries.
storage.ApplySnapshot(snapshot)
storage.SetHardState(state)
storage.Append(entries)
c := &Config{
ID: 0x01,
ElectionTick: 10,
HeartbeatTick: 1,
Storage: storage,
MaxSizePerMsg: 4096,
MaxInflightMsgs: 256,
}
// restart raft without peer information.
// peer information is already included in the storage.
n := raft.RestartNode(c)
Now that you are holding onto a Node you have a few responsibilities:
First, you must read from the Node.Ready() channel and process the updates
it contains. These steps may be performed in parallel, except as noted in step
2.
1. Write HardState, Entries, and Snapshot to persistent storage if they are
not empty. Note that when writing an Entry with Index i, any
previously-persisted entries with Index >= i must be discarded.
2. Send all Messages to the nodes named in the To field. It is important that
no messages be sent until the latest HardState has been persisted to disk,
and all Entries written by any previous Ready batch (Messages may be sent while
entries from the same batch are being persisted). To reduce the I/O latency, an
optimization can be applied to make leader write to disk in parallel with its
followers (as explained at section 10.2.1 in Raft thesis). If any Message has type
MsgSnap, call Node.ReportSnapshot() after it has been sent (these messages may be
large).
Note: Marshalling messages is not thread-safe; it is important that you
make sure that no new entries are persisted while marshalling.
The easiest way to achieve this is to serialize the messages directly inside
your main raft loop.
3. Apply Snapshot (if any) and CommittedEntries to the state machine.
If any committed Entry has Type EntryConfChange, call Node.ApplyConfChange()
to apply it to the node. The configuration change may be cancelled at this point
by setting the NodeID field to zero before calling ApplyConfChange
(but ApplyConfChange must be called one way or the other, and the decision to cancel
must be based solely on the state machine and not external information such as
the observed health of the node).
4. Call Node.Advance() to signal readiness for the next batch of updates.
This may be done at any time after step 1, although all updates must be processed
in the order they were returned by Ready.
Second, all persisted log entries must be made available via an
implementation of the Storage interface. The provided MemoryStorage
type can be used for this (if you repopulate its state upon a
restart), or you can supply your own disk-backed implementation.
Third, when you receive a message from another node, pass it to Node.Step:
func recvRaftRPC(ctx context.Context, m raftpb.Message) {
n.Step(ctx, m)
}
Finally, you need to call Node.Tick() at regular intervals (probably
via a time.Ticker). Raft has two important timeouts: heartbeat and the
election timeout. However, internally to the raft package time is
represented by an abstract "tick".
The total state machine handling loop will look something like this:
for {
select {
case <-s.Ticker:
n.Tick()
case rd := <-s.Node.Ready():
saveToStorage(rd.State, rd.Entries, rd.Snapshot)
send(rd.Messages)
if !raft.IsEmptySnap(rd.Snapshot) {
processSnapshot(rd.Snapshot)
}
for _, entry := range rd.CommittedEntries {
process(entry)
if entry.Type == raftpb.EntryConfChange {
var cc raftpb.ConfChange
cc.Unmarshal(entry.Data)
s.Node.ApplyConfChange(cc)
}
}
s.Node.Advance()
case <-s.done:
return
}
}
To propose changes to the state machine from your node take your application
data, serialize it into a byte slice and call:
n.Propose(ctx, data)
If the proposal is committed, data will appear in committed entries with type
raftpb.EntryNormal. There is no guarantee that a proposed command will be
committed; you may have to re-propose after a timeout.
To add or remove a node in a cluster, build ConfChange struct 'cc' and call:
n.ProposeConfChange(ctx, cc)
After config change is committed, some committed entry with type
raftpb.EntryConfChange will be returned. You must apply it to node through:
var cc raftpb.ConfChange
cc.Unmarshal(data)
n.ApplyConfChange(cc)
Note: An ID represents a unique node in a cluster for all time. A
given ID MUST be used only once even if the old node has been removed.
This means that for example IP addresses make poor node IDs since they
may be reused. Node IDs must be non-zero.
Implementation notes
This implementation is up to date with the final Raft thesis
(https://github.com/ongardie/dissertation/blob/master/stanford.pdf), although our
implementation of the membership change protocol differs somewhat from
that described in chapter 4. The key invariant that membership changes
happen one node at a time is preserved, but in our implementation the
membership change takes effect when its entry is applied, not when it
is added to the log (so the entry is committed under the old
membership instead of the new). This is equivalent in terms of safety,
since the old and new configurations are guaranteed to overlap.
To ensure that we do not attempt to commit two membership changes at
once by matching log positions (which would be unsafe since they
should have different quorum requirements), we simply disallow any
proposed membership change while any uncommitted change appears in
the leader's log.
This approach introduces a problem when you try to remove a member
from a two-member cluster: If one of the members dies before the
other one receives the commit of the confchange entry, then the member
cannot be removed any more since the cluster cannot make progress.
For this reason it is highly recommended to use three or more nodes in
every cluster.
MessageType
Package raft sends and receives message in Protocol Buffer format (defined
in raftpb package). Each state (follower, candidate, leader) implements its
own 'step' method ('stepFollower', 'stepCandidate', 'stepLeader') when
advancing with the given raftpb.Message. Each step is determined by its
raftpb.MessageType. Note that every step is checked by one common method
'Step' that safety-checks the terms of node and incoming message to prevent
stale log entries:
'MsgHup' is used for election. If a node is a follower or candidate, the
'tick' function in 'raft' struct is set as 'tickElection'. If a follower or
candidate has not received any heartbeat before the election timeout, it
passes 'MsgHup' to its Step method and becomes (or remains) a candidate to
start a new election.
'MsgBeat' is an internal type that signals the leader to send a heartbeat of
the 'MsgHeartbeat' type. If a node is a leader, the 'tick' function in
the 'raft' struct is set as 'tickHeartbeat', and triggers the leader to
send periodic 'MsgHeartbeat' messages to its followers.
'MsgProp' proposes to append data to its log entries. This is a special
type to redirect proposals to leader. Therefore, send method overwrites
raftpb.Message's term with its HardState's term to avoid attaching its
local term to 'MsgProp'. When 'MsgProp' is passed to the leader's 'Step'
method, the leader first calls the 'appendEntry' method to append entries
to its log, and then calls 'bcastAppend' method to send those entries to
its peers. When passed to candidate, 'MsgProp' is dropped. When passed to
follower, 'MsgProp' is stored in follower's mailbox(msgs) by the send
method. It is stored with sender's ID and later forwarded to leader by
rafthttp package.
'MsgApp' contains log entries to replicate. A leader calls bcastAppend,
which calls sendAppend, which sends soon-to-be-replicated logs in 'MsgApp'
type. When 'MsgApp' is passed to candidate's Step method, candidate reverts
back to follower, because it indicates that there is a valid leader sending
'MsgApp' messages. Candidate and follower respond to this message in
'MsgAppResp' type.
'MsgAppResp' is response to log replication request('MsgApp'). When
'MsgApp' is passed to candidate or follower's Step method, it responds by
calling 'handleAppendEntries' method, which sends 'MsgAppResp' to raft
mailbox.
'MsgVote' requests votes for election. When a node is a follower or
candidate and 'MsgHup' is passed to its Step method, then the node calls
'campaign' method to campaign itself to become a leader. Once 'campaign'
method is called, the node becomes candidate and sends 'MsgVote' to peers
in cluster to request votes. When passed to leader or candidate's Step
method and the message's Term is lower than leader's or candidate's,
'MsgVote' will be rejected ('MsgVoteResp' is returned with Reject true).
If leader or candidate receives 'MsgVote' with higher term, it will revert
back to follower. When 'MsgVote' is passed to follower, it votes for the
sender only when sender's last term is greater than MsgVote's term or
sender's last term is equal to MsgVote's term but sender's last committed
index is greater than or equal to follower's.
'MsgVoteResp' contains responses from voting request. When 'MsgVoteResp' is
passed to candidate, the candidate calculates how many votes it has won. If
it's more than majority (quorum), it becomes leader and calls 'bcastAppend'.
If candidate receives majority of votes of denials, it reverts back to
follower.
'MsgPreVote' and 'MsgPreVoteResp' are used in an optional two-phase election
protocol. When Config.PreVote is true, a pre-election is carried out first
(using the same rules as a regular election), and no node increases its term
number unless the pre-election indicates that the campaigning node would win.
This minimizes disruption when a partitioned node rejoins the cluster.
'MsgSnap' requests to install a snapshot message. When a node has just
become a leader or the leader receives 'MsgProp' message, it calls
'bcastAppend' method, which then calls 'sendAppend' method to each
follower. In 'sendAppend', if a leader fails to get term or entries,
the leader requests snapshot by sending 'MsgSnap' type message.
'MsgSnapStatus' tells the result of snapshot install message. When a
follower rejected 'MsgSnap', it indicates the snapshot request with
'MsgSnap' had failed from network issues which causes the network layer
to fail to send out snapshots to its followers. Then leader considers
follower's progress as probe. When 'MsgSnap' were not rejected, it
indicates that the snapshot succeeded and the leader sets follower's
progress to probe and resumes its log replication.
'MsgHeartbeat' sends heartbeat from leader. When 'MsgHeartbeat' is passed
to candidate and message's term is higher than candidate's, the candidate
reverts back to follower and updates its committed index from the one in
this heartbeat. And it sends the message to its mailbox. When
'MsgHeartbeat' is passed to follower's Step method and message's term is
higher than follower's, the follower updates its leaderID with the ID
from the message.
'MsgHeartbeatResp' is a response to 'MsgHeartbeat'. When 'MsgHeartbeatResp'
is passed to leader's Step method, the leader knows which follower
responded. And only when the leader's last committed index is greater than
follower's Match index, the leader runs 'sendAppend` method.
'MsgUnreachable' tells that request(message) wasn't delivered. When
'MsgUnreachable' is passed to leader's Step method, the leader discovers
that the follower that sent this 'MsgUnreachable' is not reachable, often
indicating 'MsgApp' is lost. When follower's progress state is replicate,
the leader sets it back to probe.
*/
package raft

372
vendor/go.etcd.io/etcd/raft/log.go generated vendored Normal file
View File

@ -0,0 +1,372 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"log"
pb "go.etcd.io/etcd/raft/raftpb"
)
type raftLog struct {
// storage contains all stable entries since the last snapshot.
storage Storage
// unstable contains all unstable entries and snapshot.
// they will be saved into storage.
unstable unstable
// committed is the highest log position that is known to be in
// stable storage on a quorum of nodes.
committed uint64
// applied is the highest log position that the application has
// been instructed to apply to its state machine.
// Invariant: applied <= committed
applied uint64
logger Logger
// maxNextEntsSize is the maximum number aggregate byte size of the messages
// returned from calls to nextEnts.
maxNextEntsSize uint64
}
// newLog returns log using the given storage and default options. It
// recovers the log to the state that it just commits and applies the
// latest snapshot.
func newLog(storage Storage, logger Logger) *raftLog {
return newLogWithSize(storage, logger, noLimit)
}
// newLogWithSize returns a log using the given storage and max
// message size.
func newLogWithSize(storage Storage, logger Logger, maxNextEntsSize uint64) *raftLog {
if storage == nil {
log.Panic("storage must not be nil")
}
log := &raftLog{
storage: storage,
logger: logger,
maxNextEntsSize: maxNextEntsSize,
}
firstIndex, err := storage.FirstIndex()
if err != nil {
panic(err) // TODO(bdarnell)
}
lastIndex, err := storage.LastIndex()
if err != nil {
panic(err) // TODO(bdarnell)
}
log.unstable.offset = lastIndex + 1
log.unstable.logger = logger
// Initialize our committed and applied pointers to the time of the last compaction.
log.committed = firstIndex - 1
log.applied = firstIndex - 1
return log
}
func (l *raftLog) String() string {
return fmt.Sprintf("committed=%d, applied=%d, unstable.offset=%d, len(unstable.Entries)=%d", l.committed, l.applied, l.unstable.offset, len(l.unstable.entries))
}
// maybeAppend returns (0, false) if the entries cannot be appended. Otherwise,
// it returns (last index of new entries, true).
func (l *raftLog) maybeAppend(index, logTerm, committed uint64, ents ...pb.Entry) (lastnewi uint64, ok bool) {
if l.matchTerm(index, logTerm) {
lastnewi = index + uint64(len(ents))
ci := l.findConflict(ents)
switch {
case ci == 0:
case ci <= l.committed:
l.logger.Panicf("entry %d conflict with committed entry [committed(%d)]", ci, l.committed)
default:
offset := index + 1
l.append(ents[ci-offset:]...)
}
l.commitTo(min(committed, lastnewi))
return lastnewi, true
}
return 0, false
}
func (l *raftLog) append(ents ...pb.Entry) uint64 {
if len(ents) == 0 {
return l.lastIndex()
}
if after := ents[0].Index - 1; after < l.committed {
l.logger.Panicf("after(%d) is out of range [committed(%d)]", after, l.committed)
}
l.unstable.truncateAndAppend(ents)
return l.lastIndex()
}
// findConflict finds the index of the conflict.
// It returns the first pair of conflicting entries between the existing
// entries and the given entries, if there are any.
// If there is no conflicting entries, and the existing entries contains
// all the given entries, zero will be returned.
// If there is no conflicting entries, but the given entries contains new
// entries, the index of the first new entry will be returned.
// An entry is considered to be conflicting if it has the same index but
// a different term.
// The first entry MUST have an index equal to the argument 'from'.
// The index of the given entries MUST be continuously increasing.
func (l *raftLog) findConflict(ents []pb.Entry) uint64 {
for _, ne := range ents {
if !l.matchTerm(ne.Index, ne.Term) {
if ne.Index <= l.lastIndex() {
l.logger.Infof("found conflict at index %d [existing term: %d, conflicting term: %d]",
ne.Index, l.zeroTermOnErrCompacted(l.term(ne.Index)), ne.Term)
}
return ne.Index
}
}
return 0
}
func (l *raftLog) unstableEntries() []pb.Entry {
if len(l.unstable.entries) == 0 {
return nil
}
return l.unstable.entries
}
// nextEnts returns all the available entries for execution.
// If applied is smaller than the index of snapshot, it returns all committed
// entries after the index of snapshot.
func (l *raftLog) nextEnts() (ents []pb.Entry) {
off := max(l.applied+1, l.firstIndex())
if l.committed+1 > off {
ents, err := l.slice(off, l.committed+1, l.maxNextEntsSize)
if err != nil {
l.logger.Panicf("unexpected error when getting unapplied entries (%v)", err)
}
return ents
}
return nil
}
// hasNextEnts returns if there is any available entries for execution. This
// is a fast check without heavy raftLog.slice() in raftLog.nextEnts().
func (l *raftLog) hasNextEnts() bool {
off := max(l.applied+1, l.firstIndex())
return l.committed+1 > off
}
func (l *raftLog) snapshot() (pb.Snapshot, error) {
if l.unstable.snapshot != nil {
return *l.unstable.snapshot, nil
}
return l.storage.Snapshot()
}
func (l *raftLog) firstIndex() uint64 {
if i, ok := l.unstable.maybeFirstIndex(); ok {
return i
}
index, err := l.storage.FirstIndex()
if err != nil {
panic(err) // TODO(bdarnell)
}
return index
}
func (l *raftLog) lastIndex() uint64 {
if i, ok := l.unstable.maybeLastIndex(); ok {
return i
}
i, err := l.storage.LastIndex()
if err != nil {
panic(err) // TODO(bdarnell)
}
return i
}
func (l *raftLog) commitTo(tocommit uint64) {
// never decrease commit
if l.committed < tocommit {
if l.lastIndex() < tocommit {
l.logger.Panicf("tocommit(%d) is out of range [lastIndex(%d)]. Was the raft log corrupted, truncated, or lost?", tocommit, l.lastIndex())
}
l.committed = tocommit
}
}
func (l *raftLog) appliedTo(i uint64) {
if i == 0 {
return
}
if l.committed < i || i < l.applied {
l.logger.Panicf("applied(%d) is out of range [prevApplied(%d), committed(%d)]", i, l.applied, l.committed)
}
l.applied = i
}
func (l *raftLog) stableTo(i, t uint64) { l.unstable.stableTo(i, t) }
func (l *raftLog) stableSnapTo(i uint64) { l.unstable.stableSnapTo(i) }
func (l *raftLog) lastTerm() uint64 {
t, err := l.term(l.lastIndex())
if err != nil {
l.logger.Panicf("unexpected error when getting the last term (%v)", err)
}
return t
}
func (l *raftLog) term(i uint64) (uint64, error) {
// the valid term range is [index of dummy entry, last index]
dummyIndex := l.firstIndex() - 1
if i < dummyIndex || i > l.lastIndex() {
// TODO: return an error instead?
return 0, nil
}
if t, ok := l.unstable.maybeTerm(i); ok {
return t, nil
}
t, err := l.storage.Term(i)
if err == nil {
return t, nil
}
if err == ErrCompacted || err == ErrUnavailable {
return 0, err
}
panic(err) // TODO(bdarnell)
}
func (l *raftLog) entries(i, maxsize uint64) ([]pb.Entry, error) {
if i > l.lastIndex() {
return nil, nil
}
return l.slice(i, l.lastIndex()+1, maxsize)
}
// allEntries returns all entries in the log.
func (l *raftLog) allEntries() []pb.Entry {
ents, err := l.entries(l.firstIndex(), noLimit)
if err == nil {
return ents
}
if err == ErrCompacted { // try again if there was a racing compaction
return l.allEntries()
}
// TODO (xiangli): handle error?
panic(err)
}
// isUpToDate determines if the given (lastIndex,term) log is more up-to-date
// by comparing the index and term of the last entries in the existing logs.
// If the logs have last entries with different terms, then the log with the
// later term is more up-to-date. If the logs end with the same term, then
// whichever log has the larger lastIndex is more up-to-date. If the logs are
// the same, the given log is up-to-date.
func (l *raftLog) isUpToDate(lasti, term uint64) bool {
return term > l.lastTerm() || (term == l.lastTerm() && lasti >= l.lastIndex())
}
func (l *raftLog) matchTerm(i, term uint64) bool {
t, err := l.term(i)
if err != nil {
return false
}
return t == term
}
func (l *raftLog) maybeCommit(maxIndex, term uint64) bool {
if maxIndex > l.committed && l.zeroTermOnErrCompacted(l.term(maxIndex)) == term {
l.commitTo(maxIndex)
return true
}
return false
}
func (l *raftLog) restore(s pb.Snapshot) {
l.logger.Infof("log [%s] starts to restore snapshot [index: %d, term: %d]", l, s.Metadata.Index, s.Metadata.Term)
l.committed = s.Metadata.Index
l.unstable.restore(s)
}
// slice returns a slice of log entries from lo through hi-1, inclusive.
func (l *raftLog) slice(lo, hi, maxSize uint64) ([]pb.Entry, error) {
err := l.mustCheckOutOfBounds(lo, hi)
if err != nil {
return nil, err
}
if lo == hi {
return nil, nil
}
var ents []pb.Entry
if lo < l.unstable.offset {
storedEnts, err := l.storage.Entries(lo, min(hi, l.unstable.offset), maxSize)
if err == ErrCompacted {
return nil, err
} else if err == ErrUnavailable {
l.logger.Panicf("entries[%d:%d) is unavailable from storage", lo, min(hi, l.unstable.offset))
} else if err != nil {
panic(err) // TODO(bdarnell)
}
// check if ents has reached the size limitation
if uint64(len(storedEnts)) < min(hi, l.unstable.offset)-lo {
return storedEnts, nil
}
ents = storedEnts
}
if hi > l.unstable.offset {
unstable := l.unstable.slice(max(lo, l.unstable.offset), hi)
if len(ents) > 0 {
combined := make([]pb.Entry, len(ents)+len(unstable))
n := copy(combined, ents)
copy(combined[n:], unstable)
ents = combined
} else {
ents = unstable
}
}
return limitSize(ents, maxSize), nil
}
// l.firstIndex <= lo <= hi <= l.firstIndex + len(l.entries)
func (l *raftLog) mustCheckOutOfBounds(lo, hi uint64) error {
if lo > hi {
l.logger.Panicf("invalid slice %d > %d", lo, hi)
}
fi := l.firstIndex()
if lo < fi {
return ErrCompacted
}
length := l.lastIndex() + 1 - fi
if lo < fi || hi > fi+length {
l.logger.Panicf("slice[%d,%d) out of bound [%d,%d]", lo, hi, fi, l.lastIndex())
}
return nil
}
func (l *raftLog) zeroTermOnErrCompacted(t uint64, err error) uint64 {
if err == nil {
return t
}
if err == ErrCompacted {
return 0
}
l.logger.Panicf("unexpected error (%v)", err)
return 0
}

157
vendor/go.etcd.io/etcd/raft/log_unstable.go generated vendored Normal file
View File

@ -0,0 +1,157 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import pb "go.etcd.io/etcd/raft/raftpb"
// unstable.entries[i] has raft log position i+unstable.offset.
// Note that unstable.offset may be less than the highest log
// position in storage; this means that the next write to storage
// might need to truncate the log before persisting unstable.entries.
type unstable struct {
// the incoming unstable snapshot, if any.
snapshot *pb.Snapshot
// all entries that have not yet been written to storage.
entries []pb.Entry
offset uint64
logger Logger
}
// maybeFirstIndex returns the index of the first possible entry in entries
// if it has a snapshot.
func (u *unstable) maybeFirstIndex() (uint64, bool) {
if u.snapshot != nil {
return u.snapshot.Metadata.Index + 1, true
}
return 0, false
}
// maybeLastIndex returns the last index if it has at least one
// unstable entry or snapshot.
func (u *unstable) maybeLastIndex() (uint64, bool) {
if l := len(u.entries); l != 0 {
return u.offset + uint64(l) - 1, true
}
if u.snapshot != nil {
return u.snapshot.Metadata.Index, true
}
return 0, false
}
// maybeTerm returns the term of the entry at index i, if there
// is any.
func (u *unstable) maybeTerm(i uint64) (uint64, bool) {
if i < u.offset {
if u.snapshot != nil && u.snapshot.Metadata.Index == i {
return u.snapshot.Metadata.Term, true
}
return 0, false
}
last, ok := u.maybeLastIndex()
if !ok {
return 0, false
}
if i > last {
return 0, false
}
return u.entries[i-u.offset].Term, true
}
func (u *unstable) stableTo(i, t uint64) {
gt, ok := u.maybeTerm(i)
if !ok {
return
}
// if i < offset, term is matched with the snapshot
// only update the unstable entries if term is matched with
// an unstable entry.
if gt == t && i >= u.offset {
u.entries = u.entries[i+1-u.offset:]
u.offset = i + 1
u.shrinkEntriesArray()
}
}
// shrinkEntriesArray discards the underlying array used by the entries slice
// if most of it isn't being used. This avoids holding references to a bunch of
// potentially large entries that aren't needed anymore. Simply clearing the
// entries wouldn't be safe because clients might still be using them.
func (u *unstable) shrinkEntriesArray() {
// We replace the array if we're using less than half of the space in
// it. This number is fairly arbitrary, chosen as an attempt to balance
// memory usage vs number of allocations. It could probably be improved
// with some focused tuning.
const lenMultiple = 2
if len(u.entries) == 0 {
u.entries = nil
} else if len(u.entries)*lenMultiple < cap(u.entries) {
newEntries := make([]pb.Entry, len(u.entries))
copy(newEntries, u.entries)
u.entries = newEntries
}
}
func (u *unstable) stableSnapTo(i uint64) {
if u.snapshot != nil && u.snapshot.Metadata.Index == i {
u.snapshot = nil
}
}
func (u *unstable) restore(s pb.Snapshot) {
u.offset = s.Metadata.Index + 1
u.entries = nil
u.snapshot = &s
}
func (u *unstable) truncateAndAppend(ents []pb.Entry) {
after := ents[0].Index
switch {
case after == u.offset+uint64(len(u.entries)):
// after is the next index in the u.entries
// directly append
u.entries = append(u.entries, ents...)
case after <= u.offset:
u.logger.Infof("replace the unstable entries from index %d", after)
// The log is being truncated to before our current offset
// portion, so set the offset and replace the entries
u.offset = after
u.entries = ents
default:
// truncate to after and copy to u.entries
// then append
u.logger.Infof("truncate the unstable entries before index %d", after)
u.entries = append([]pb.Entry{}, u.slice(u.offset, after)...)
u.entries = append(u.entries, ents...)
}
}
func (u *unstable) slice(lo uint64, hi uint64) []pb.Entry {
u.mustCheckOutOfBounds(lo, hi)
return u.entries[lo-u.offset : hi-u.offset]
}
// u.offset <= lo <= hi <= u.offset+len(u.entries)
func (u *unstable) mustCheckOutOfBounds(lo, hi uint64) {
if lo > hi {
u.logger.Panicf("invalid unstable.slice %d > %d", lo, hi)
}
upper := u.offset + uint64(len(u.entries))
if lo < u.offset || hi > upper {
u.logger.Panicf("unstable.slice[%d,%d) out of bound [%d,%d]", lo, hi, u.offset, upper)
}
}

132
vendor/go.etcd.io/etcd/raft/logger.go generated vendored Normal file
View File

@ -0,0 +1,132 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"io/ioutil"
"log"
"os"
"sync"
)
type Logger interface {
Debug(v ...interface{})
Debugf(format string, v ...interface{})
Error(v ...interface{})
Errorf(format string, v ...interface{})
Info(v ...interface{})
Infof(format string, v ...interface{})
Warning(v ...interface{})
Warningf(format string, v ...interface{})
Fatal(v ...interface{})
Fatalf(format string, v ...interface{})
Panic(v ...interface{})
Panicf(format string, v ...interface{})
}
func SetLogger(l Logger) {
raftLoggerMu.Lock()
raftLogger = l
raftLoggerMu.Unlock()
}
var (
defaultLogger = &DefaultLogger{Logger: log.New(os.Stderr, "raft", log.LstdFlags)}
discardLogger = &DefaultLogger{Logger: log.New(ioutil.Discard, "", 0)}
raftLoggerMu sync.Mutex
raftLogger = Logger(defaultLogger)
)
const (
calldepth = 2
)
// DefaultLogger is a default implementation of the Logger interface.
type DefaultLogger struct {
*log.Logger
debug bool
}
func (l *DefaultLogger) EnableTimestamps() {
l.SetFlags(l.Flags() | log.Ldate | log.Ltime)
}
func (l *DefaultLogger) EnableDebug() {
l.debug = true
}
func (l *DefaultLogger) Debug(v ...interface{}) {
if l.debug {
l.Output(calldepth, header("DEBUG", fmt.Sprint(v...)))
}
}
func (l *DefaultLogger) Debugf(format string, v ...interface{}) {
if l.debug {
l.Output(calldepth, header("DEBUG", fmt.Sprintf(format, v...)))
}
}
func (l *DefaultLogger) Info(v ...interface{}) {
l.Output(calldepth, header("INFO", fmt.Sprint(v...)))
}
func (l *DefaultLogger) Infof(format string, v ...interface{}) {
l.Output(calldepth, header("INFO", fmt.Sprintf(format, v...)))
}
func (l *DefaultLogger) Error(v ...interface{}) {
l.Output(calldepth, header("ERROR", fmt.Sprint(v...)))
}
func (l *DefaultLogger) Errorf(format string, v ...interface{}) {
l.Output(calldepth, header("ERROR", fmt.Sprintf(format, v...)))
}
func (l *DefaultLogger) Warning(v ...interface{}) {
l.Output(calldepth, header("WARN", fmt.Sprint(v...)))
}
func (l *DefaultLogger) Warningf(format string, v ...interface{}) {
l.Output(calldepth, header("WARN", fmt.Sprintf(format, v...)))
}
func (l *DefaultLogger) Fatal(v ...interface{}) {
l.Output(calldepth, header("FATAL", fmt.Sprint(v...)))
os.Exit(1)
}
func (l *DefaultLogger) Fatalf(format string, v ...interface{}) {
l.Output(calldepth, header("FATAL", fmt.Sprintf(format, v...)))
os.Exit(1)
}
func (l *DefaultLogger) Panic(v ...interface{}) {
l.Logger.Panic(v...)
}
func (l *DefaultLogger) Panicf(format string, v ...interface{}) {
l.Logger.Panicf(format, v...)
}
func header(lvl, msg string) string {
return fmt.Sprintf("%s: %s", lvl, msg)
}

584
vendor/go.etcd.io/etcd/raft/node.go generated vendored Normal file
View File

@ -0,0 +1,584 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"context"
"errors"
pb "go.etcd.io/etcd/raft/raftpb"
)
type SnapshotStatus int
const (
SnapshotFinish SnapshotStatus = 1
SnapshotFailure SnapshotStatus = 2
)
var (
emptyState = pb.HardState{}
// ErrStopped is returned by methods on Nodes that have been stopped.
ErrStopped = errors.New("raft: stopped")
)
// SoftState provides state that is useful for logging and debugging.
// The state is volatile and does not need to be persisted to the WAL.
type SoftState struct {
Lead uint64 // must use atomic operations to access; keep 64-bit aligned.
RaftState StateType
}
func (a *SoftState) equal(b *SoftState) bool {
return a.Lead == b.Lead && a.RaftState == b.RaftState
}
// Ready encapsulates the entries and messages that are ready to read,
// be saved to stable storage, committed or sent to other peers.
// All fields in Ready are read-only.
type Ready struct {
// The current volatile state of a Node.
// SoftState will be nil if there is no update.
// It is not required to consume or store SoftState.
*SoftState
// The current state of a Node to be saved to stable storage BEFORE
// Messages are sent.
// HardState will be equal to empty state if there is no update.
pb.HardState
// ReadStates can be used for node to serve linearizable read requests locally
// when its applied index is greater than the index in ReadState.
// Note that the readState will be returned when raft receives msgReadIndex.
// The returned is only valid for the request that requested to read.
ReadStates []ReadState
// Entries specifies entries to be saved to stable storage BEFORE
// Messages are sent.
Entries []pb.Entry
// Snapshot specifies the snapshot to be saved to stable storage.
Snapshot pb.Snapshot
// CommittedEntries specifies entries to be committed to a
// store/state-machine. These have previously been committed to stable
// store.
CommittedEntries []pb.Entry
// Messages specifies outbound messages to be sent AFTER Entries are
// committed to stable storage.
// If it contains a MsgSnap message, the application MUST report back to raft
// when the snapshot has been received or has failed by calling ReportSnapshot.
Messages []pb.Message
// MustSync indicates whether the HardState and Entries must be synchronously
// written to disk or if an asynchronous write is permissible.
MustSync bool
}
func isHardStateEqual(a, b pb.HardState) bool {
return a.Term == b.Term && a.Vote == b.Vote && a.Commit == b.Commit
}
// IsEmptyHardState returns true if the given HardState is empty.
func IsEmptyHardState(st pb.HardState) bool {
return isHardStateEqual(st, emptyState)
}
// IsEmptySnap returns true if the given Snapshot is empty.
func IsEmptySnap(sp pb.Snapshot) bool {
return sp.Metadata.Index == 0
}
func (rd Ready) containsUpdates() bool {
return rd.SoftState != nil || !IsEmptyHardState(rd.HardState) ||
!IsEmptySnap(rd.Snapshot) || len(rd.Entries) > 0 ||
len(rd.CommittedEntries) > 0 || len(rd.Messages) > 0 || len(rd.ReadStates) != 0
}
// appliedCursor extracts from the Ready the highest index the client has
// applied (once the Ready is confirmed via Advance). If no information is
// contained in the Ready, returns zero.
func (rd Ready) appliedCursor() uint64 {
if n := len(rd.CommittedEntries); n > 0 {
return rd.CommittedEntries[n-1].Index
}
if index := rd.Snapshot.Metadata.Index; index > 0 {
return index
}
return 0
}
// Node represents a node in a raft cluster.
type Node interface {
// Tick increments the internal logical clock for the Node by a single tick. Election
// timeouts and heartbeat timeouts are in units of ticks.
Tick()
// Campaign causes the Node to transition to candidate state and start campaigning to become leader.
Campaign(ctx context.Context) error
// Propose proposes that data be appended to the log. Note that proposals can be lost without
// notice, therefore it is user's job to ensure proposal retries.
Propose(ctx context.Context, data []byte) error
// ProposeConfChange proposes a configuration change. Like any proposal, the
// configuration change may be dropped with or without an error being
// returned. In particular, configuration changes are dropped unless the
// leader has certainty that there is no prior unapplied configuration
// change in its log.
//
// The method accepts either a pb.ConfChange (deprecated) or pb.ConfChangeV2
// message. The latter allows arbitrary configuration changes via joint
// consensus, notably including replacing a voter. Passing a ConfChangeV2
// message is only allowed if all Nodes participating in the cluster run a
// version of this library aware of the V2 API. See pb.ConfChangeV2 for
// usage details and semantics.
ProposeConfChange(ctx context.Context, cc pb.ConfChangeI) error
// Step advances the state machine using the given message. ctx.Err() will be returned, if any.
Step(ctx context.Context, msg pb.Message) error
// Ready returns a channel that returns the current point-in-time state.
// Users of the Node must call Advance after retrieving the state returned by Ready.
//
// NOTE: No committed entries from the next Ready may be applied until all committed entries
// and snapshots from the previous one have finished.
Ready() <-chan Ready
// Advance notifies the Node that the application has saved progress up to the last Ready.
// It prepares the node to return the next available Ready.
//
// The application should generally call Advance after it applies the entries in last Ready.
//
// However, as an optimization, the application may call Advance while it is applying the
// commands. For example. when the last Ready contains a snapshot, the application might take
// a long time to apply the snapshot data. To continue receiving Ready without blocking raft
// progress, it can call Advance before finishing applying the last ready.
Advance()
// ApplyConfChange applies a config change (previously passed to
// ProposeConfChange) to the node. This must be called whenever a config
// change is observed in Ready.CommittedEntries.
//
// Returns an opaque non-nil ConfState protobuf which must be recorded in
// snapshots.
ApplyConfChange(cc pb.ConfChangeI) *pb.ConfState
// TransferLeadership attempts to transfer leadership to the given transferee.
TransferLeadership(ctx context.Context, lead, transferee uint64)
// ReadIndex request a read state. The read state will be set in the ready.
// Read state has a read index. Once the application advances further than the read
// index, any linearizable read requests issued before the read request can be
// processed safely. The read state will have the same rctx attached.
ReadIndex(ctx context.Context, rctx []byte) error
// Status returns the current status of the raft state machine.
Status() Status
// ReportUnreachable reports the given node is not reachable for the last send.
ReportUnreachable(id uint64)
// ReportSnapshot reports the status of the sent snapshot. The id is the raft ID of the follower
// who is meant to receive the snapshot, and the status is SnapshotFinish or SnapshotFailure.
// Calling ReportSnapshot with SnapshotFinish is a no-op. But, any failure in applying a
// snapshot (for e.g., while streaming it from leader to follower), should be reported to the
// leader with SnapshotFailure. When leader sends a snapshot to a follower, it pauses any raft
// log probes until the follower can apply the snapshot and advance its state. If the follower
// can't do that, for e.g., due to a crash, it could end up in a limbo, never getting any
// updates from the leader. Therefore, it is crucial that the application ensures that any
// failure in snapshot sending is caught and reported back to the leader; so it can resume raft
// log probing in the follower.
ReportSnapshot(id uint64, status SnapshotStatus)
// Stop performs any necessary termination of the Node.
Stop()
}
type Peer struct {
ID uint64
Context []byte
}
// StartNode returns a new Node given configuration and a list of raft peers.
// It appends a ConfChangeAddNode entry for each given peer to the initial log.
//
// Peers must not be zero length; call RestartNode in that case.
func StartNode(c *Config, peers []Peer) Node {
if len(peers) == 0 {
panic("no peers given; use RestartNode instead")
}
rn, err := NewRawNode(c)
if err != nil {
panic(err)
}
rn.Bootstrap(peers)
n := newNode(rn)
go n.run()
return &n
}
// RestartNode is similar to StartNode but does not take a list of peers.
// The current membership of the cluster will be restored from the Storage.
// If the caller has an existing state machine, pass in the last log index that
// has been applied to it; otherwise use zero.
func RestartNode(c *Config) Node {
rn, err := NewRawNode(c)
if err != nil {
panic(err)
}
n := newNode(rn)
go n.run()
return &n
}
type msgWithResult struct {
m pb.Message
result chan error
}
// node is the canonical implementation of the Node interface
type node struct {
propc chan msgWithResult
recvc chan pb.Message
confc chan pb.ConfChangeV2
confstatec chan pb.ConfState
readyc chan Ready
advancec chan struct{}
tickc chan struct{}
done chan struct{}
stop chan struct{}
status chan chan Status
rn *RawNode
}
func newNode(rn *RawNode) node {
return node{
propc: make(chan msgWithResult),
recvc: make(chan pb.Message),
confc: make(chan pb.ConfChangeV2),
confstatec: make(chan pb.ConfState),
readyc: make(chan Ready),
advancec: make(chan struct{}),
// make tickc a buffered chan, so raft node can buffer some ticks when the node
// is busy processing raft messages. Raft node will resume process buffered
// ticks when it becomes idle.
tickc: make(chan struct{}, 128),
done: make(chan struct{}),
stop: make(chan struct{}),
status: make(chan chan Status),
rn: rn,
}
}
func (n *node) Stop() {
select {
case n.stop <- struct{}{}:
// Not already stopped, so trigger it
case <-n.done:
// Node has already been stopped - no need to do anything
return
}
// Block until the stop has been acknowledged by run()
<-n.done
}
func (n *node) run() {
var propc chan msgWithResult
var readyc chan Ready
var advancec chan struct{}
var rd Ready
r := n.rn.raft
lead := None
for {
if advancec != nil {
readyc = nil
} else if n.rn.HasReady() {
// Populate a Ready. Note that this Ready is not guaranteed to
// actually be handled. We will arm readyc, but there's no guarantee
// that we will actually send on it. It's possible that we will
// service another channel instead, loop around, and then populate
// the Ready again. We could instead force the previous Ready to be
// handled first, but it's generally good to emit larger Readys plus
// it simplifies testing (by emitting less frequently and more
// predictably).
rd = n.rn.readyWithoutAccept()
readyc = n.readyc
}
if lead != r.lead {
if r.hasLeader() {
if lead == None {
r.logger.Infof("raft.node: %x elected leader %x at term %d", r.id, r.lead, r.Term)
} else {
r.logger.Infof("raft.node: %x changed leader from %x to %x at term %d", r.id, lead, r.lead, r.Term)
}
propc = n.propc
} else {
r.logger.Infof("raft.node: %x lost leader %x at term %d", r.id, lead, r.Term)
propc = nil
}
lead = r.lead
}
select {
// TODO: maybe buffer the config propose if there exists one (the way
// described in raft dissertation)
// Currently it is dropped in Step silently.
case pm := <-propc:
m := pm.m
m.From = r.id
err := r.Step(m)
if pm.result != nil {
pm.result <- err
close(pm.result)
}
case m := <-n.recvc:
// filter out response message from unknown From.
if pr := r.prs.Progress[m.From]; pr != nil || !IsResponseMsg(m.Type) {
r.Step(m)
}
case cc := <-n.confc:
_, okBefore := r.prs.Progress[r.id]
cs := r.applyConfChange(cc)
// If the node was removed, block incoming proposals. Note that we
// only do this if the node was in the config before. Nodes may be
// a member of the group without knowing this (when they're catching
// up on the log and don't have the latest config) and we don't want
// to block the proposal channel in that case.
//
// NB: propc is reset when the leader changes, which, if we learn
// about it, sort of implies that we got readded, maybe? This isn't
// very sound and likely has bugs.
if _, okAfter := r.prs.Progress[r.id]; okBefore && !okAfter {
var found bool
for _, sl := range [][]uint64{cs.Voters, cs.VotersOutgoing} {
for _, id := range sl {
if id == r.id {
found = true
}
}
}
if !found {
propc = nil
}
}
select {
case n.confstatec <- cs:
case <-n.done:
}
case <-n.tickc:
n.rn.Tick()
case readyc <- rd:
n.rn.acceptReady(rd)
advancec = n.advancec
case <-advancec:
n.rn.Advance(rd)
rd = Ready{}
advancec = nil
case c := <-n.status:
c <- getStatus(r)
case <-n.stop:
close(n.done)
return
}
}
}
// Tick increments the internal logical clock for this Node. Election timeouts
// and heartbeat timeouts are in units of ticks.
func (n *node) Tick() {
select {
case n.tickc <- struct{}{}:
case <-n.done:
default:
n.rn.raft.logger.Warningf("%x (leader %v) A tick missed to fire. Node blocks too long!", n.rn.raft.id, n.rn.raft.id == n.rn.raft.lead)
}
}
func (n *node) Campaign(ctx context.Context) error { return n.step(ctx, pb.Message{Type: pb.MsgHup}) }
func (n *node) Propose(ctx context.Context, data []byte) error {
return n.stepWait(ctx, pb.Message{Type: pb.MsgProp, Entries: []pb.Entry{{Data: data}}})
}
func (n *node) Step(ctx context.Context, m pb.Message) error {
// ignore unexpected local messages receiving over network
if IsLocalMsg(m.Type) {
// TODO: return an error?
return nil
}
return n.step(ctx, m)
}
func confChangeToMsg(c pb.ConfChangeI) (pb.Message, error) {
typ, data, err := pb.MarshalConfChange(c)
if err != nil {
return pb.Message{}, err
}
return pb.Message{Type: pb.MsgProp, Entries: []pb.Entry{{Type: typ, Data: data}}}, nil
}
func (n *node) ProposeConfChange(ctx context.Context, cc pb.ConfChangeI) error {
msg, err := confChangeToMsg(cc)
if err != nil {
return err
}
return n.Step(ctx, msg)
}
func (n *node) step(ctx context.Context, m pb.Message) error {
return n.stepWithWaitOption(ctx, m, false)
}
func (n *node) stepWait(ctx context.Context, m pb.Message) error {
return n.stepWithWaitOption(ctx, m, true)
}
// Step advances the state machine using msgs. The ctx.Err() will be returned,
// if any.
func (n *node) stepWithWaitOption(ctx context.Context, m pb.Message, wait bool) error {
if m.Type != pb.MsgProp {
select {
case n.recvc <- m:
return nil
case <-ctx.Done():
return ctx.Err()
case <-n.done:
return ErrStopped
}
}
ch := n.propc
pm := msgWithResult{m: m}
if wait {
pm.result = make(chan error, 1)
}
select {
case ch <- pm:
if !wait {
return nil
}
case <-ctx.Done():
return ctx.Err()
case <-n.done:
return ErrStopped
}
select {
case err := <-pm.result:
if err != nil {
return err
}
case <-ctx.Done():
return ctx.Err()
case <-n.done:
return ErrStopped
}
return nil
}
func (n *node) Ready() <-chan Ready { return n.readyc }
func (n *node) Advance() {
select {
case n.advancec <- struct{}{}:
case <-n.done:
}
}
func (n *node) ApplyConfChange(cc pb.ConfChangeI) *pb.ConfState {
var cs pb.ConfState
select {
case n.confc <- cc.AsV2():
case <-n.done:
}
select {
case cs = <-n.confstatec:
case <-n.done:
}
return &cs
}
func (n *node) Status() Status {
c := make(chan Status)
select {
case n.status <- c:
return <-c
case <-n.done:
return Status{}
}
}
func (n *node) ReportUnreachable(id uint64) {
select {
case n.recvc <- pb.Message{Type: pb.MsgUnreachable, From: id}:
case <-n.done:
}
}
func (n *node) ReportSnapshot(id uint64, status SnapshotStatus) {
rej := status == SnapshotFailure
select {
case n.recvc <- pb.Message{Type: pb.MsgSnapStatus, From: id, Reject: rej}:
case <-n.done:
}
}
func (n *node) TransferLeadership(ctx context.Context, lead, transferee uint64) {
select {
// manually set 'from' and 'to', so that leader can voluntarily transfers its leadership
case n.recvc <- pb.Message{Type: pb.MsgTransferLeader, From: transferee, To: lead}:
case <-n.done:
case <-ctx.Done():
}
}
func (n *node) ReadIndex(ctx context.Context, rctx []byte) error {
return n.step(ctx, pb.Message{Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: rctx}}})
}
func newReady(r *raft, prevSoftSt *SoftState, prevHardSt pb.HardState) Ready {
rd := Ready{
Entries: r.raftLog.unstableEntries(),
CommittedEntries: r.raftLog.nextEnts(),
Messages: r.msgs,
}
if softSt := r.softState(); !softSt.equal(prevSoftSt) {
rd.SoftState = softSt
}
if hardSt := r.hardState(); !isHardStateEqual(hardSt, prevHardSt) {
rd.HardState = hardSt
}
if r.raftLog.unstable.snapshot != nil {
rd.Snapshot = *r.raftLog.unstable.snapshot
}
if len(r.readStates) != 0 {
rd.ReadStates = r.readStates
}
rd.MustSync = MustSync(r.hardState(), prevHardSt, len(rd.Entries))
return rd
}
// MustSync returns true if the hard state and count of Raft entries indicate
// that a synchronous write to persistent storage is required.
func MustSync(st, prevst pb.HardState, entsnum int) bool {
// Persistent state on all servers:
// (Updated on stable storage before responding to RPCs)
// currentTerm
// votedFor
// log entries[]
return entsnum != 0 || st.Vote != prevst.Vote || st.Term != prevst.Term
}

75
vendor/go.etcd.io/etcd/raft/quorum/joint.go generated vendored Normal file
View File

@ -0,0 +1,75 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package quorum
// JointConfig is a configuration of two groups of (possibly overlapping)
// majority configurations. Decisions require the support of both majorities.
type JointConfig [2]MajorityConfig
func (c JointConfig) String() string {
if len(c[1]) > 0 {
return c[0].String() + "&&" + c[1].String()
}
return c[0].String()
}
// IDs returns a newly initialized map representing the set of voters present
// in the joint configuration.
func (c JointConfig) IDs() map[uint64]struct{} {
m := map[uint64]struct{}{}
for _, cc := range c {
for id := range cc {
m[id] = struct{}{}
}
}
return m
}
// Describe returns a (multi-line) representation of the commit indexes for the
// given lookuper.
func (c JointConfig) Describe(l AckedIndexer) string {
return MajorityConfig(c.IDs()).Describe(l)
}
// CommittedIndex returns the largest committed index for the given joint
// quorum. An index is jointly committed if it is committed in both constituent
// majorities.
func (c JointConfig) CommittedIndex(l AckedIndexer) Index {
idx0 := c[0].CommittedIndex(l)
idx1 := c[1].CommittedIndex(l)
if idx0 < idx1 {
return idx0
}
return idx1
}
// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns
// a result indicating whether the vote is pending, lost, or won. A joint quorum
// requires both majority quorums to vote in favor.
func (c JointConfig) VoteResult(votes map[uint64]bool) VoteResult {
r1 := c[0].VoteResult(votes)
r2 := c[1].VoteResult(votes)
if r1 == r2 {
// If they agree, return the agreed state.
return r1
}
if r1 == VoteLost || r2 == VoteLost {
// If either config has lost, loss is the only possible outcome.
return VoteLost
}
// One side won, the other one is pending, so the whole outcome is.
return VotePending
}

210
vendor/go.etcd.io/etcd/raft/quorum/majority.go generated vendored Normal file
View File

@ -0,0 +1,210 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package quorum
import (
"fmt"
"math"
"sort"
"strings"
)
// MajorityConfig is a set of IDs that uses majority quorums to make decisions.
type MajorityConfig map[uint64]struct{}
func (c MajorityConfig) String() string {
sl := make([]uint64, 0, len(c))
for id := range c {
sl = append(sl, id)
}
sort.Slice(sl, func(i, j int) bool { return sl[i] < sl[j] })
var buf strings.Builder
buf.WriteByte('(')
for i := range sl {
if i > 0 {
buf.WriteByte(' ')
}
fmt.Fprint(&buf, sl[i])
}
buf.WriteByte(')')
return buf.String()
}
// Describe returns a (multi-line) representation of the commit indexes for the
// given lookuper.
func (c MajorityConfig) Describe(l AckedIndexer) string {
if len(c) == 0 {
return "<empty majority quorum>"
}
type tup struct {
id uint64
idx Index
ok bool // idx found?
bar int // length of bar displayed for this tup
}
// Below, populate .bar so that the i-th largest commit index has bar i (we
// plot this as sort of a progress bar). The actual code is a bit more
// complicated and also makes sure that equal index => equal bar.
n := len(c)
info := make([]tup, 0, n)
for id := range c {
idx, ok := l.AckedIndex(id)
info = append(info, tup{id: id, idx: idx, ok: ok})
}
// Sort by index
sort.Slice(info, func(i, j int) bool {
if info[i].idx == info[j].idx {
return info[i].id < info[j].id
}
return info[i].idx < info[j].idx
})
// Populate .bar.
for i := range info {
if i > 0 && info[i-1].idx < info[i].idx {
info[i].bar = i
}
}
// Sort by ID.
sort.Slice(info, func(i, j int) bool {
return info[i].id < info[j].id
})
var buf strings.Builder
// Print.
fmt.Fprint(&buf, strings.Repeat(" ", n)+" idx\n")
for i := range info {
bar := info[i].bar
if !info[i].ok {
fmt.Fprint(&buf, "?"+strings.Repeat(" ", n))
} else {
fmt.Fprint(&buf, strings.Repeat("x", bar)+">"+strings.Repeat(" ", n-bar))
}
fmt.Fprintf(&buf, " %5d (id=%d)\n", info[i].idx, info[i].id)
}
return buf.String()
}
// Slice returns the MajorityConfig as a sorted slice.
func (c MajorityConfig) Slice() []uint64 {
var sl []uint64
for id := range c {
sl = append(sl, id)
}
sort.Slice(sl, func(i, j int) bool { return sl[i] < sl[j] })
return sl
}
func insertionSort(sl []uint64) {
a, b := 0, len(sl)
for i := a + 1; i < b; i++ {
for j := i; j > a && sl[j] < sl[j-1]; j-- {
sl[j], sl[j-1] = sl[j-1], sl[j]
}
}
}
// CommittedIndex computes the committed index from those supplied via the
// provided AckedIndexer (for the active config).
func (c MajorityConfig) CommittedIndex(l AckedIndexer) Index {
n := len(c)
if n == 0 {
// This plays well with joint quorums which, when one half is the zero
// MajorityConfig, should behave like the other half.
return math.MaxUint64
}
// Use an on-stack slice to collect the committed indexes when n <= 7
// (otherwise we alloc). The alternative is to stash a slice on
// MajorityConfig, but this impairs usability (as is, MajorityConfig is just
// a map, and that's nice). The assumption is that running with a
// replication factor of >7 is rare, and in cases in which it happens
// performance is a lesser concern (additionally the performance
// implications of an allocation here are far from drastic).
var stk [7]uint64
var srt []uint64
if len(stk) >= n {
srt = stk[:n]
} else {
srt = make([]uint64, n)
}
{
// Fill the slice with the indexes observed. Any unused slots will be
// left as zero; these correspond to voters that may report in, but
// haven't yet. We fill from the right (since the zeroes will end up on
// the left after sorting below anyway).
i := n - 1
for id := range c {
if idx, ok := l.AckedIndex(id); ok {
srt[i] = uint64(idx)
i--
}
}
}
// Sort by index. Use a bespoke algorithm (copied from the stdlib's sort
// package) to keep srt on the stack.
insertionSort(srt)
// The smallest index into the array for which the value is acked by a
// quorum. In other words, from the end of the slice, move n/2+1 to the
// left (accounting for zero-indexing).
pos := n - (n/2 + 1)
return Index(srt[pos])
}
// VoteResult takes a mapping of voters to yes/no (true/false) votes and returns
// a result indicating whether the vote is pending (i.e. neither a quorum of
// yes/no has been reached), won (a quorum of yes has been reached), or lost (a
// quorum of no has been reached).
func (c MajorityConfig) VoteResult(votes map[uint64]bool) VoteResult {
if len(c) == 0 {
// By convention, the elections on an empty config win. This comes in
// handy with joint quorums because it'll make a half-populated joint
// quorum behave like a majority quorum.
return VoteWon
}
ny := [2]int{} // vote counts for no and yes, respectively
var missing int
for id := range c {
v, ok := votes[id]
if !ok {
missing++
continue
}
if v {
ny[1]++
} else {
ny[0]++
}
}
q := len(c)/2 + 1
if ny[1] >= q {
return VoteWon
}
if ny[1]+missing >= q {
return VotePending
}
return VoteLost
}

58
vendor/go.etcd.io/etcd/raft/quorum/quorum.go generated vendored Normal file
View File

@ -0,0 +1,58 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package quorum
import (
"math"
"strconv"
)
// Index is a Raft log position.
type Index uint64
func (i Index) String() string {
if i == math.MaxUint64 {
return "∞"
}
return strconv.FormatUint(uint64(i), 10)
}
// AckedIndexer allows looking up a commit index for a given ID of a voter
// from a corresponding MajorityConfig.
type AckedIndexer interface {
AckedIndex(voterID uint64) (idx Index, found bool)
}
type mapAckIndexer map[uint64]Index
func (m mapAckIndexer) AckedIndex(id uint64) (Index, bool) {
idx, ok := m[id]
return idx, ok
}
// VoteResult indicates the outcome of a vote.
//
//go:generate stringer -type=VoteResult
type VoteResult uint8
const (
// VotePending indicates that the decision of the vote depends on future
// votes, i.e. neither "yes" or "no" has reached quorum yet.
VotePending VoteResult = 1 + iota
// VoteLost indicates that the quorum has voted "no".
VoteLost
// VoteWon indicates that the quorum has voted "yes".
VoteWon
)

View File

@ -0,0 +1,26 @@
// Code generated by "stringer -type=VoteResult"; DO NOT EDIT.
package quorum
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[VotePending-1]
_ = x[VoteLost-2]
_ = x[VoteWon-3]
}
const _VoteResult_name = "VotePendingVoteLostVoteWon"
var _VoteResult_index = [...]uint8{0, 11, 19, 26}
func (i VoteResult) String() string {
i -= 1
if i >= VoteResult(len(_VoteResult_index)-1) {
return "VoteResult(" + strconv.FormatInt(int64(i+1), 10) + ")"
}
return _VoteResult_name[_VoteResult_index[i]:_VoteResult_index[i+1]]
}

1656
vendor/go.etcd.io/etcd/raft/raft.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

170
vendor/go.etcd.io/etcd/raft/raftpb/confchange.go generated vendored Normal file
View File

@ -0,0 +1,170 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raftpb
import (
"fmt"
"strconv"
"strings"
"github.com/gogo/protobuf/proto"
)
// ConfChangeI abstracts over ConfChangeV2 and (legacy) ConfChange to allow
// treating them in a unified manner.
type ConfChangeI interface {
AsV2() ConfChangeV2
AsV1() (ConfChange, bool)
}
// MarshalConfChange calls Marshal on the underlying ConfChange or ConfChangeV2
// and returns the result along with the corresponding EntryType.
func MarshalConfChange(c ConfChangeI) (EntryType, []byte, error) {
var typ EntryType
var ccdata []byte
var err error
if ccv1, ok := c.AsV1(); ok {
typ = EntryConfChange
ccdata, err = ccv1.Marshal()
} else {
ccv2 := c.AsV2()
typ = EntryConfChangeV2
ccdata, err = ccv2.Marshal()
}
return typ, ccdata, err
}
// AsV2 returns a V2 configuration change carrying out the same operation.
func (c ConfChange) AsV2() ConfChangeV2 {
return ConfChangeV2{
Changes: []ConfChangeSingle{{
Type: c.Type,
NodeID: c.NodeID,
}},
Context: c.Context,
}
}
// AsV1 returns the ConfChange and true.
func (c ConfChange) AsV1() (ConfChange, bool) {
return c, true
}
// AsV2 is the identity.
func (c ConfChangeV2) AsV2() ConfChangeV2 { return c }
// AsV1 returns ConfChange{} and false.
func (c ConfChangeV2) AsV1() (ConfChange, bool) { return ConfChange{}, false }
// EnterJoint returns two bools. The second bool is true if and only if this
// config change will use Joint Consensus, which is the case if it contains more
// than one change or if the use of Joint Consensus was requested explicitly.
// The first bool can only be true if second one is, and indicates whether the
// Joint State will be left automatically.
func (c *ConfChangeV2) EnterJoint() (autoLeave bool, ok bool) {
// NB: in theory, more config changes could qualify for the "simple"
// protocol but it depends on the config on top of which the changes apply.
// For example, adding two learners is not OK if both nodes are part of the
// base config (i.e. two voters are turned into learners in the process of
// applying the conf change). In practice, these distinctions should not
// matter, so we keep it simple and use Joint Consensus liberally.
if c.Transition != ConfChangeTransitionAuto || len(c.Changes) > 1 {
// Use Joint Consensus.
var autoLeave bool
switch c.Transition {
case ConfChangeTransitionAuto:
autoLeave = true
case ConfChangeTransitionJointImplicit:
autoLeave = true
case ConfChangeTransitionJointExplicit:
default:
panic(fmt.Sprintf("unknown transition: %+v", c))
}
return autoLeave, true
}
return false, false
}
// LeaveJoint is true if the configuration change leaves a joint configuration.
// This is the case if the ConfChangeV2 is zero, with the possible exception of
// the Context field.
func (c *ConfChangeV2) LeaveJoint() bool {
cpy := *c
cpy.Context = nil
return proto.Equal(&cpy, &ConfChangeV2{})
}
// ConfChangesFromString parses a Space-delimited sequence of operations into a
// slice of ConfChangeSingle. The supported operations are:
// - vn: make n a voter,
// - ln: make n a learner,
// - rn: remove n, and
// - un: update n.
func ConfChangesFromString(s string) ([]ConfChangeSingle, error) {
var ccs []ConfChangeSingle
toks := strings.Split(strings.TrimSpace(s), " ")
if toks[0] == "" {
toks = nil
}
for _, tok := range toks {
if len(tok) < 2 {
return nil, fmt.Errorf("unknown token %s", tok)
}
var cc ConfChangeSingle
switch tok[0] {
case 'v':
cc.Type = ConfChangeAddNode
case 'l':
cc.Type = ConfChangeAddLearnerNode
case 'r':
cc.Type = ConfChangeRemoveNode
case 'u':
cc.Type = ConfChangeUpdateNode
default:
return nil, fmt.Errorf("unknown input: %s", tok)
}
id, err := strconv.ParseUint(tok[1:], 10, 64)
if err != nil {
return nil, err
}
cc.NodeID = id
ccs = append(ccs, cc)
}
return ccs, nil
}
// ConfChangesToString is the inverse to ConfChangesFromString.
func ConfChangesToString(ccs []ConfChangeSingle) string {
var buf strings.Builder
for i, cc := range ccs {
if i > 0 {
buf.WriteByte(' ')
}
switch cc.Type {
case ConfChangeAddNode:
buf.WriteByte('v')
case ConfChangeAddLearnerNode:
buf.WriteByte('l')
case ConfChangeRemoveNode:
buf.WriteByte('r')
case ConfChangeUpdateNode:
buf.WriteByte('u')
default:
buf.WriteString("unknown")
}
fmt.Fprintf(&buf, "%d", cc.NodeID)
}
return buf.String()
}

45
vendor/go.etcd.io/etcd/raft/raftpb/confstate.go generated vendored Normal file
View File

@ -0,0 +1,45 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raftpb
import (
"fmt"
"reflect"
"sort"
)
// Equivalent returns a nil error if the inputs describe the same configuration.
// On mismatch, returns a descriptive error showing the differences.
func (cs ConfState) Equivalent(cs2 ConfState) error {
cs1 := cs
orig1, orig2 := cs1, cs2
s := func(sl *[]uint64) {
*sl = append([]uint64(nil), *sl...)
sort.Slice(*sl, func(i, j int) bool { return (*sl)[i] < (*sl)[j] })
}
for _, cs := range []*ConfState{&cs1, &cs2} {
s(&cs.Voters)
s(&cs.Learners)
s(&cs.VotersOutgoing)
s(&cs.LearnersNext)
cs.XXX_unrecognized = nil
}
if !reflect.DeepEqual(cs1, cs2) {
return fmt.Errorf("ConfStates not equivalent after sorting:\n%+#v\n%+#v\nInputs were:\n%+#v\n%+#v", cs1, cs2, orig1, orig2)
}
return nil
}

2646
vendor/go.etcd.io/etcd/raft/raftpb/raft.pb.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

239
vendor/go.etcd.io/etcd/raft/rawnode.go generated vendored Normal file
View File

@ -0,0 +1,239 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
pb "go.etcd.io/etcd/raft/raftpb"
"go.etcd.io/etcd/raft/tracker"
)
// ErrStepLocalMsg is returned when try to step a local raft message
var ErrStepLocalMsg = errors.New("raft: cannot step raft local message")
// ErrStepPeerNotFound is returned when try to step a response message
// but there is no peer found in raft.prs for that node.
var ErrStepPeerNotFound = errors.New("raft: cannot step as peer not found")
// RawNode is a thread-unsafe Node.
// The methods of this struct correspond to the methods of Node and are described
// more fully there.
type RawNode struct {
raft *raft
prevSoftSt *SoftState
prevHardSt pb.HardState
}
// NewRawNode instantiates a RawNode from the given configuration.
//
// See Bootstrap() for bootstrapping an initial state; this replaces the former
// 'peers' argument to this method (with identical behavior). However, It is
// recommended that instead of calling Bootstrap, applications bootstrap their
// state manually by setting up a Storage that has a first index > 1 and which
// stores the desired ConfState as its InitialState.
func NewRawNode(config *Config) (*RawNode, error) {
r := newRaft(config)
rn := &RawNode{
raft: r,
}
rn.prevSoftSt = r.softState()
rn.prevHardSt = r.hardState()
return rn, nil
}
// Tick advances the internal logical clock by a single tick.
func (rn *RawNode) Tick() {
rn.raft.tick()
}
// TickQuiesced advances the internal logical clock by a single tick without
// performing any other state machine processing. It allows the caller to avoid
// periodic heartbeats and elections when all of the peers in a Raft group are
// known to be at the same state. Expected usage is to periodically invoke Tick
// or TickQuiesced depending on whether the group is "active" or "quiesced".
//
// WARNING: Be very careful about using this method as it subverts the Raft
// state machine. You should probably be using Tick instead.
func (rn *RawNode) TickQuiesced() {
rn.raft.electionElapsed++
}
// Campaign causes this RawNode to transition to candidate state.
func (rn *RawNode) Campaign() error {
return rn.raft.Step(pb.Message{
Type: pb.MsgHup,
})
}
// Propose proposes data be appended to the raft log.
func (rn *RawNode) Propose(data []byte) error {
return rn.raft.Step(pb.Message{
Type: pb.MsgProp,
From: rn.raft.id,
Entries: []pb.Entry{
{Data: data},
}})
}
// ProposeConfChange proposes a config change. See (Node).ProposeConfChange for
// details.
func (rn *RawNode) ProposeConfChange(cc pb.ConfChangeI) error {
m, err := confChangeToMsg(cc)
if err != nil {
return err
}
return rn.raft.Step(m)
}
// ApplyConfChange applies a config change to the local node.
func (rn *RawNode) ApplyConfChange(cc pb.ConfChangeI) *pb.ConfState {
cs := rn.raft.applyConfChange(cc.AsV2())
return &cs
}
// Step advances the state machine using the given message.
func (rn *RawNode) Step(m pb.Message) error {
// ignore unexpected local messages receiving over network
if IsLocalMsg(m.Type) {
return ErrStepLocalMsg
}
if pr := rn.raft.prs.Progress[m.From]; pr != nil || !IsResponseMsg(m.Type) {
return rn.raft.Step(m)
}
return ErrStepPeerNotFound
}
// Ready returns the outstanding work that the application needs to handle. This
// includes appending and applying entries or a snapshot, updating the HardState,
// and sending messages. The returned Ready() *must* be handled and subsequently
// passed back via Advance().
func (rn *RawNode) Ready() Ready {
rd := rn.readyWithoutAccept()
rn.acceptReady(rd)
return rd
}
// readyWithoutAccept returns a Ready. This is a read-only operation, i.e. there
// is no obligation that the Ready must be handled.
func (rn *RawNode) readyWithoutAccept() Ready {
return newReady(rn.raft, rn.prevSoftSt, rn.prevHardSt)
}
// acceptReady is called when the consumer of the RawNode has decided to go
// ahead and handle a Ready. Nothing must alter the state of the RawNode between
// this call and the prior call to Ready().
func (rn *RawNode) acceptReady(rd Ready) {
if rd.SoftState != nil {
rn.prevSoftSt = rd.SoftState
}
if len(rd.ReadStates) != 0 {
rn.raft.readStates = nil
}
rn.raft.msgs = nil
}
// HasReady called when RawNode user need to check if any Ready pending.
// Checking logic in this method should be consistent with Ready.containsUpdates().
func (rn *RawNode) HasReady() bool {
r := rn.raft
if !r.softState().equal(rn.prevSoftSt) {
return true
}
if hardSt := r.hardState(); !IsEmptyHardState(hardSt) && !isHardStateEqual(hardSt, rn.prevHardSt) {
return true
}
if r.raftLog.unstable.snapshot != nil && !IsEmptySnap(*r.raftLog.unstable.snapshot) {
return true
}
if len(r.msgs) > 0 || len(r.raftLog.unstableEntries()) > 0 || r.raftLog.hasNextEnts() {
return true
}
if len(r.readStates) != 0 {
return true
}
return false
}
// Advance notifies the RawNode that the application has applied and saved progress in the
// last Ready results.
func (rn *RawNode) Advance(rd Ready) {
if !IsEmptyHardState(rd.HardState) {
rn.prevHardSt = rd.HardState
}
rn.raft.advance(rd)
}
// Status returns the current status of the given group. This allocates, see
// BasicStatus and WithProgress for allocation-friendlier choices.
func (rn *RawNode) Status() Status {
status := getStatus(rn.raft)
return status
}
// BasicStatus returns a BasicStatus. Notably this does not contain the
// Progress map; see WithProgress for an allocation-free way to inspect it.
func (rn *RawNode) BasicStatus() BasicStatus {
return getBasicStatus(rn.raft)
}
// ProgressType indicates the type of replica a Progress corresponds to.
type ProgressType byte
const (
// ProgressTypePeer accompanies a Progress for a regular peer replica.
ProgressTypePeer ProgressType = iota
// ProgressTypeLearner accompanies a Progress for a learner replica.
ProgressTypeLearner
)
// WithProgress is a helper to introspect the Progress for this node and its
// peers.
func (rn *RawNode) WithProgress(visitor func(id uint64, typ ProgressType, pr tracker.Progress)) {
rn.raft.prs.Visit(func(id uint64, pr *tracker.Progress) {
typ := ProgressTypePeer
if pr.IsLearner {
typ = ProgressTypeLearner
}
p := *pr
p.Inflights = nil
visitor(id, typ, p)
})
}
// ReportUnreachable reports the given node is not reachable for the last send.
func (rn *RawNode) ReportUnreachable(id uint64) {
_ = rn.raft.Step(pb.Message{Type: pb.MsgUnreachable, From: id})
}
// ReportSnapshot reports the status of the sent snapshot.
func (rn *RawNode) ReportSnapshot(id uint64, status SnapshotStatus) {
rej := status == SnapshotFailure
_ = rn.raft.Step(pb.Message{Type: pb.MsgSnapStatus, From: id, Reject: rej})
}
// TransferLeader tries to transfer leadership to the given transferee.
func (rn *RawNode) TransferLeader(transferee uint64) {
_ = rn.raft.Step(pb.Message{Type: pb.MsgTransferLeader, From: transferee})
}
// ReadIndex requests a read state. The read state will be set in ready.
// Read State has a read index. Once the application advances further than the read
// index, any linearizable read requests issued before the read request can be
// processed safely. The read state will have the same rctx attached.
func (rn *RawNode) ReadIndex(rctx []byte) {
_ = rn.raft.Step(pb.Message{Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: rctx}}})
}

121
vendor/go.etcd.io/etcd/raft/read_only.go generated vendored Normal file
View File

@ -0,0 +1,121 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import pb "go.etcd.io/etcd/raft/raftpb"
// ReadState provides state for read only query.
// It's caller's responsibility to call ReadIndex first before getting
// this state from ready, it's also caller's duty to differentiate if this
// state is what it requests through RequestCtx, eg. given a unique id as
// RequestCtx
type ReadState struct {
Index uint64
RequestCtx []byte
}
type readIndexStatus struct {
req pb.Message
index uint64
// NB: this never records 'false', but it's more convenient to use this
// instead of a map[uint64]struct{} due to the API of quorum.VoteResult. If
// this becomes performance sensitive enough (doubtful), quorum.VoteResult
// can change to an API that is closer to that of CommittedIndex.
acks map[uint64]bool
}
type readOnly struct {
option ReadOnlyOption
pendingReadIndex map[string]*readIndexStatus
readIndexQueue []string
}
func newReadOnly(option ReadOnlyOption) *readOnly {
return &readOnly{
option: option,
pendingReadIndex: make(map[string]*readIndexStatus),
}
}
// addRequest adds a read only reuqest into readonly struct.
// `index` is the commit index of the raft state machine when it received
// the read only request.
// `m` is the original read only request message from the local or remote node.
func (ro *readOnly) addRequest(index uint64, m pb.Message) {
s := string(m.Entries[0].Data)
if _, ok := ro.pendingReadIndex[s]; ok {
return
}
ro.pendingReadIndex[s] = &readIndexStatus{index: index, req: m, acks: make(map[uint64]bool)}
ro.readIndexQueue = append(ro.readIndexQueue, s)
}
// recvAck notifies the readonly struct that the raft state machine received
// an acknowledgment of the heartbeat that attached with the read only request
// context.
func (ro *readOnly) recvAck(id uint64, context []byte) map[uint64]bool {
rs, ok := ro.pendingReadIndex[string(context)]
if !ok {
return nil
}
rs.acks[id] = true
return rs.acks
}
// advance advances the read only request queue kept by the readonly struct.
// It dequeues the requests until it finds the read only request that has
// the same context as the given `m`.
func (ro *readOnly) advance(m pb.Message) []*readIndexStatus {
var (
i int
found bool
)
ctx := string(m.Context)
rss := []*readIndexStatus{}
for _, okctx := range ro.readIndexQueue {
i++
rs, ok := ro.pendingReadIndex[okctx]
if !ok {
panic("cannot find corresponding read state from pending map")
}
rss = append(rss, rs)
if okctx == ctx {
found = true
break
}
}
if found {
ro.readIndexQueue = ro.readIndexQueue[i:]
for _, rs := range rss {
delete(ro.pendingReadIndex, string(rs.req.Entries[0].Data))
}
return rss
}
return nil
}
// lastPendingRequestCtx returns the context of the last pending read only
// request in readonly struct.
func (ro *readOnly) lastPendingRequestCtx() string {
if len(ro.readIndexQueue) == 0 {
return ""
}
return ro.readIndexQueue[len(ro.readIndexQueue)-1]
}

106
vendor/go.etcd.io/etcd/raft/status.go generated vendored Normal file
View File

@ -0,0 +1,106 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
pb "go.etcd.io/etcd/raft/raftpb"
"go.etcd.io/etcd/raft/tracker"
)
// Status contains information about this Raft peer and its view of the system.
// The Progress is only populated on the leader.
type Status struct {
BasicStatus
Config tracker.Config
Progress map[uint64]tracker.Progress
}
// BasicStatus contains basic information about the Raft peer. It does not allocate.
type BasicStatus struct {
ID uint64
pb.HardState
SoftState
Applied uint64
LeadTransferee uint64
}
func getProgressCopy(r *raft) map[uint64]tracker.Progress {
m := make(map[uint64]tracker.Progress)
r.prs.Visit(func(id uint64, pr *tracker.Progress) {
var p tracker.Progress
p = *pr
p.Inflights = pr.Inflights.Clone()
pr = nil
m[id] = p
})
return m
}
func getBasicStatus(r *raft) BasicStatus {
s := BasicStatus{
ID: r.id,
LeadTransferee: r.leadTransferee,
}
s.HardState = r.hardState()
s.SoftState = *r.softState()
s.Applied = r.raftLog.applied
return s
}
// getStatus gets a copy of the current raft status.
func getStatus(r *raft) Status {
var s Status
s.BasicStatus = getBasicStatus(r)
if s.RaftState == StateLeader {
s.Progress = getProgressCopy(r)
}
s.Config = r.prs.Config.Clone()
return s
}
// MarshalJSON translates the raft status into JSON.
// TODO: try to simplify this by introducing ID type into raft
func (s Status) MarshalJSON() ([]byte, error) {
j := fmt.Sprintf(`{"id":"%x","term":%d,"vote":"%x","commit":%d,"lead":"%x","raftState":%q,"applied":%d,"progress":{`,
s.ID, s.Term, s.Vote, s.Commit, s.Lead, s.RaftState, s.Applied)
if len(s.Progress) == 0 {
j += "},"
} else {
for k, v := range s.Progress {
subj := fmt.Sprintf(`"%x":{"match":%d,"next":%d,"state":%q},`, k, v.Match, v.Next, v.State)
j += subj
}
// remove the trailing ","
j = j[:len(j)-1] + "},"
}
j += fmt.Sprintf(`"leadtransferee":"%x"}`, s.LeadTransferee)
return []byte(j), nil
}
func (s Status) String() string {
b, err := s.MarshalJSON()
if err != nil {
raftLogger.Panicf("unexpected error: %v", err)
}
return string(b)
}

273
vendor/go.etcd.io/etcd/raft/storage.go generated vendored Normal file
View File

@ -0,0 +1,273 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
"sync"
pb "go.etcd.io/etcd/raft/raftpb"
)
// ErrCompacted is returned by Storage.Entries/Compact when a requested
// index is unavailable because it predates the last snapshot.
var ErrCompacted = errors.New("requested index is unavailable due to compaction")
// ErrSnapOutOfDate is returned by Storage.CreateSnapshot when a requested
// index is older than the existing snapshot.
var ErrSnapOutOfDate = errors.New("requested index is older than the existing snapshot")
// ErrUnavailable is returned by Storage interface when the requested log entries
// are unavailable.
var ErrUnavailable = errors.New("requested entry at index is unavailable")
// ErrSnapshotTemporarilyUnavailable is returned by the Storage interface when the required
// snapshot is temporarily unavailable.
var ErrSnapshotTemporarilyUnavailable = errors.New("snapshot is temporarily unavailable")
// Storage is an interface that may be implemented by the application
// to retrieve log entries from storage.
//
// If any Storage method returns an error, the raft instance will
// become inoperable and refuse to participate in elections; the
// application is responsible for cleanup and recovery in this case.
type Storage interface {
// TODO(tbg): split this into two interfaces, LogStorage and StateStorage.
// InitialState returns the saved HardState and ConfState information.
InitialState() (pb.HardState, pb.ConfState, error)
// Entries returns a slice of log entries in the range [lo,hi).
// MaxSize limits the total size of the log entries returned, but
// Entries returns at least one entry if any.
Entries(lo, hi, maxSize uint64) ([]pb.Entry, error)
// Term returns the term of entry i, which must be in the range
// [FirstIndex()-1, LastIndex()]. The term of the entry before
// FirstIndex is retained for matching purposes even though the
// rest of that entry may not be available.
Term(i uint64) (uint64, error)
// LastIndex returns the index of the last entry in the log.
LastIndex() (uint64, error)
// FirstIndex returns the index of the first log entry that is
// possibly available via Entries (older entries have been incorporated
// into the latest Snapshot; if storage only contains the dummy entry the
// first log entry is not available).
FirstIndex() (uint64, error)
// Snapshot returns the most recent snapshot.
// If snapshot is temporarily unavailable, it should return ErrSnapshotTemporarilyUnavailable,
// so raft state machine could know that Storage needs some time to prepare
// snapshot and call Snapshot later.
Snapshot() (pb.Snapshot, error)
}
// MemoryStorage implements the Storage interface backed by an
// in-memory array.
type MemoryStorage struct {
// Protects access to all fields. Most methods of MemoryStorage are
// run on the raft goroutine, but Append() is run on an application
// goroutine.
sync.Mutex
hardState pb.HardState
snapshot pb.Snapshot
// ents[i] has raft log position i+snapshot.Metadata.Index
ents []pb.Entry
}
// NewMemoryStorage creates an empty MemoryStorage.
func NewMemoryStorage() *MemoryStorage {
return &MemoryStorage{
// When starting from scratch populate the list with a dummy entry at term zero.
ents: make([]pb.Entry, 1),
}
}
// InitialState implements the Storage interface.
func (ms *MemoryStorage) InitialState() (pb.HardState, pb.ConfState, error) {
return ms.hardState, ms.snapshot.Metadata.ConfState, nil
}
// SetHardState saves the current HardState.
func (ms *MemoryStorage) SetHardState(st pb.HardState) error {
ms.Lock()
defer ms.Unlock()
ms.hardState = st
return nil
}
// Entries implements the Storage interface.
func (ms *MemoryStorage) Entries(lo, hi, maxSize uint64) ([]pb.Entry, error) {
ms.Lock()
defer ms.Unlock()
offset := ms.ents[0].Index
if lo <= offset {
return nil, ErrCompacted
}
if hi > ms.lastIndex()+1 {
raftLogger.Panicf("entries' hi(%d) is out of bound lastindex(%d)", hi, ms.lastIndex())
}
// only contains dummy entries.
if len(ms.ents) == 1 {
return nil, ErrUnavailable
}
ents := ms.ents[lo-offset : hi-offset]
return limitSize(ents, maxSize), nil
}
// Term implements the Storage interface.
func (ms *MemoryStorage) Term(i uint64) (uint64, error) {
ms.Lock()
defer ms.Unlock()
offset := ms.ents[0].Index
if i < offset {
return 0, ErrCompacted
}
if int(i-offset) >= len(ms.ents) {
return 0, ErrUnavailable
}
return ms.ents[i-offset].Term, nil
}
// LastIndex implements the Storage interface.
func (ms *MemoryStorage) LastIndex() (uint64, error) {
ms.Lock()
defer ms.Unlock()
return ms.lastIndex(), nil
}
func (ms *MemoryStorage) lastIndex() uint64 {
return ms.ents[0].Index + uint64(len(ms.ents)) - 1
}
// FirstIndex implements the Storage interface.
func (ms *MemoryStorage) FirstIndex() (uint64, error) {
ms.Lock()
defer ms.Unlock()
return ms.firstIndex(), nil
}
func (ms *MemoryStorage) firstIndex() uint64 {
return ms.ents[0].Index + 1
}
// Snapshot implements the Storage interface.
func (ms *MemoryStorage) Snapshot() (pb.Snapshot, error) {
ms.Lock()
defer ms.Unlock()
return ms.snapshot, nil
}
// ApplySnapshot overwrites the contents of this Storage object with
// those of the given snapshot.
func (ms *MemoryStorage) ApplySnapshot(snap pb.Snapshot) error {
ms.Lock()
defer ms.Unlock()
//handle check for old snapshot being applied
msIndex := ms.snapshot.Metadata.Index
snapIndex := snap.Metadata.Index
if msIndex >= snapIndex {
return ErrSnapOutOfDate
}
ms.snapshot = snap
ms.ents = []pb.Entry{{Term: snap.Metadata.Term, Index: snap.Metadata.Index}}
return nil
}
// CreateSnapshot makes a snapshot which can be retrieved with Snapshot() and
// can be used to reconstruct the state at that point.
// If any configuration changes have been made since the last compaction,
// the result of the last ApplyConfChange must be passed in.
func (ms *MemoryStorage) CreateSnapshot(i uint64, cs *pb.ConfState, data []byte) (pb.Snapshot, error) {
ms.Lock()
defer ms.Unlock()
if i <= ms.snapshot.Metadata.Index {
return pb.Snapshot{}, ErrSnapOutOfDate
}
offset := ms.ents[0].Index
if i > ms.lastIndex() {
raftLogger.Panicf("snapshot %d is out of bound lastindex(%d)", i, ms.lastIndex())
}
ms.snapshot.Metadata.Index = i
ms.snapshot.Metadata.Term = ms.ents[i-offset].Term
if cs != nil {
ms.snapshot.Metadata.ConfState = *cs
}
ms.snapshot.Data = data
return ms.snapshot, nil
}
// Compact discards all log entries prior to compactIndex.
// It is the application's responsibility to not attempt to compact an index
// greater than raftLog.applied.
func (ms *MemoryStorage) Compact(compactIndex uint64) error {
ms.Lock()
defer ms.Unlock()
offset := ms.ents[0].Index
if compactIndex <= offset {
return ErrCompacted
}
if compactIndex > ms.lastIndex() {
raftLogger.Panicf("compact %d is out of bound lastindex(%d)", compactIndex, ms.lastIndex())
}
i := compactIndex - offset
ents := make([]pb.Entry, 1, 1+uint64(len(ms.ents))-i)
ents[0].Index = ms.ents[i].Index
ents[0].Term = ms.ents[i].Term
ents = append(ents, ms.ents[i+1:]...)
ms.ents = ents
return nil
}
// Append the new entries to storage.
// TODO (xiangli): ensure the entries are continuous and
// entries[0].Index > ms.entries[0].Index
func (ms *MemoryStorage) Append(entries []pb.Entry) error {
if len(entries) == 0 {
return nil
}
ms.Lock()
defer ms.Unlock()
first := ms.firstIndex()
last := entries[0].Index + uint64(len(entries)) - 1
// shortcut if there is no new entry.
if last < first {
return nil
}
// truncate compacted entries
if first > entries[0].Index {
entries = entries[first-entries[0].Index:]
}
offset := entries[0].Index - ms.ents[0].Index
switch {
case uint64(len(ms.ents)) > offset:
ms.ents = append([]pb.Entry{}, ms.ents[:offset]...)
ms.ents = append(ms.ents, entries...)
case uint64(len(ms.ents)) == offset:
ms.ents = append(ms.ents, entries...)
default:
raftLogger.Panicf("missing log entry [last: %d, append at: %d]",
ms.lastIndex(), entries[0].Index)
}
return nil
}

132
vendor/go.etcd.io/etcd/raft/tracker/inflights.go generated vendored Normal file
View File

@ -0,0 +1,132 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tracker
// Inflights limits the number of MsgApp (represented by the largest index
// contained within) sent to followers but not yet acknowledged by them. Callers
// use Full() to check whether more messages can be sent, call Add() whenever
// they are sending a new append, and release "quota" via FreeLE() whenever an
// ack is received.
type Inflights struct {
// the starting index in the buffer
start int
// number of inflights in the buffer
count int
// the size of the buffer
size int
// buffer contains the index of the last entry
// inside one message.
buffer []uint64
}
// NewInflights sets up an Inflights that allows up to 'size' inflight messages.
func NewInflights(size int) *Inflights {
return &Inflights{
size: size,
}
}
// Clone returns an *Inflights that is identical to but shares no memory with
// the receiver.
func (in *Inflights) Clone() *Inflights {
ins := *in
ins.buffer = append([]uint64(nil), in.buffer...)
return &ins
}
// Add notifies the Inflights that a new message with the given index is being
// dispatched. Full() must be called prior to Add() to verify that there is room
// for one more message, and consecutive calls to add Add() must provide a
// monotonic sequence of indexes.
func (in *Inflights) Add(inflight uint64) {
if in.Full() {
panic("cannot add into a Full inflights")
}
next := in.start + in.count
size := in.size
if next >= size {
next -= size
}
if next >= len(in.buffer) {
in.grow()
}
in.buffer[next] = inflight
in.count++
}
// grow the inflight buffer by doubling up to inflights.size. We grow on demand
// instead of preallocating to inflights.size to handle systems which have
// thousands of Raft groups per process.
func (in *Inflights) grow() {
newSize := len(in.buffer) * 2
if newSize == 0 {
newSize = 1
} else if newSize > in.size {
newSize = in.size
}
newBuffer := make([]uint64, newSize)
copy(newBuffer, in.buffer)
in.buffer = newBuffer
}
// FreeLE frees the inflights smaller or equal to the given `to` flight.
func (in *Inflights) FreeLE(to uint64) {
if in.count == 0 || to < in.buffer[in.start] {
// out of the left side of the window
return
}
idx := in.start
var i int
for i = 0; i < in.count; i++ {
if to < in.buffer[idx] { // found the first large inflight
break
}
// increase index and maybe rotate
size := in.size
if idx++; idx >= size {
idx -= size
}
}
// free i inflights and set new start index
in.count -= i
in.start = idx
if in.count == 0 {
// inflights is empty, reset the start index so that we don't grow the
// buffer unnecessarily.
in.start = 0
}
}
// FreeFirstOne releases the first inflight. This is a no-op if nothing is
// inflight.
func (in *Inflights) FreeFirstOne() { in.FreeLE(in.buffer[in.start]) }
// Full returns true if no more messages can be sent at the moment.
func (in *Inflights) Full() bool {
return in.count == in.size
}
// Count returns the number of inflight messages.
func (in *Inflights) Count() int { return in.count }
// reset frees all inflights.
func (in *Inflights) reset() {
in.count = 0
in.start = 0
}

259
vendor/go.etcd.io/etcd/raft/tracker/progress.go generated vendored Normal file
View File

@ -0,0 +1,259 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tracker
import (
"fmt"
"sort"
"strings"
)
// Progress represents a followers progress in the view of the leader. Leader
// maintains progresses of all followers, and sends entries to the follower
// based on its progress.
//
// NB(tbg): Progress is basically a state machine whose transitions are mostly
// strewn around `*raft.raft`. Additionally, some fields are only used when in a
// certain State. All of this isn't ideal.
type Progress struct {
Match, Next uint64
// State defines how the leader should interact with the follower.
//
// When in StateProbe, leader sends at most one replication message
// per heartbeat interval. It also probes actual progress of the follower.
//
// When in StateReplicate, leader optimistically increases next
// to the latest entry sent after sending replication message. This is
// an optimized state for fast replicating log entries to the follower.
//
// When in StateSnapshot, leader should have sent out snapshot
// before and stops sending any replication message.
State StateType
// PendingSnapshot is used in StateSnapshot.
// If there is a pending snapshot, the pendingSnapshot will be set to the
// index of the snapshot. If pendingSnapshot is set, the replication process of
// this Progress will be paused. raft will not resend snapshot until the pending one
// is reported to be failed.
PendingSnapshot uint64
// RecentActive is true if the progress is recently active. Receiving any messages
// from the corresponding follower indicates the progress is active.
// RecentActive can be reset to false after an election timeout.
//
// TODO(tbg): the leader should always have this set to true.
RecentActive bool
// ProbeSent is used while this follower is in StateProbe. When ProbeSent is
// true, raft should pause sending replication message to this peer until
// ProbeSent is reset. See ProbeAcked() and IsPaused().
ProbeSent bool
// Inflights is a sliding window for the inflight messages.
// Each inflight message contains one or more log entries.
// The max number of entries per message is defined in raft config as MaxSizePerMsg.
// Thus inflight effectively limits both the number of inflight messages
// and the bandwidth each Progress can use.
// When inflights is Full, no more message should be sent.
// When a leader sends out a message, the index of the last
// entry should be added to inflights. The index MUST be added
// into inflights in order.
// When a leader receives a reply, the previous inflights should
// be freed by calling inflights.FreeLE with the index of the last
// received entry.
Inflights *Inflights
// IsLearner is true if this progress is tracked for a learner.
IsLearner bool
}
// ResetState moves the Progress into the specified State, resetting ProbeSent,
// PendingSnapshot, and Inflights.
func (pr *Progress) ResetState(state StateType) {
pr.ProbeSent = false
pr.PendingSnapshot = 0
pr.State = state
pr.Inflights.reset()
}
func max(a, b uint64) uint64 {
if a > b {
return a
}
return b
}
func min(a, b uint64) uint64 {
if a > b {
return b
}
return a
}
// ProbeAcked is called when this peer has accepted an append. It resets
// ProbeSent to signal that additional append messages should be sent without
// further delay.
func (pr *Progress) ProbeAcked() {
pr.ProbeSent = false
}
// BecomeProbe transitions into StateProbe. Next is reset to Match+1 or,
// optionally and if larger, the index of the pending snapshot.
func (pr *Progress) BecomeProbe() {
// If the original state is StateSnapshot, progress knows that
// the pending snapshot has been sent to this peer successfully, then
// probes from pendingSnapshot + 1.
if pr.State == StateSnapshot {
pendingSnapshot := pr.PendingSnapshot
pr.ResetState(StateProbe)
pr.Next = max(pr.Match+1, pendingSnapshot+1)
} else {
pr.ResetState(StateProbe)
pr.Next = pr.Match + 1
}
}
// BecomeReplicate transitions into StateReplicate, resetting Next to Match+1.
func (pr *Progress) BecomeReplicate() {
pr.ResetState(StateReplicate)
pr.Next = pr.Match + 1
}
// BecomeSnapshot moves the Progress to StateSnapshot with the specified pending
// snapshot index.
func (pr *Progress) BecomeSnapshot(snapshoti uint64) {
pr.ResetState(StateSnapshot)
pr.PendingSnapshot = snapshoti
}
// MaybeUpdate is called when an MsgAppResp arrives from the follower, with the
// index acked by it. The method returns false if the given n index comes from
// an outdated message. Otherwise it updates the progress and returns true.
func (pr *Progress) MaybeUpdate(n uint64) bool {
var updated bool
if pr.Match < n {
pr.Match = n
updated = true
pr.ProbeAcked()
}
if pr.Next < n+1 {
pr.Next = n + 1
}
return updated
}
// OptimisticUpdate signals that appends all the way up to and including index n
// are in-flight. As a result, Next is increased to n+1.
func (pr *Progress) OptimisticUpdate(n uint64) { pr.Next = n + 1 }
// MaybeDecrTo adjusts the Progress to the receipt of a MsgApp rejection. The
// arguments are the index the follower rejected to append to its log, and its
// last index.
//
// Rejections can happen spuriously as messages are sent out of order or
// duplicated. In such cases, the rejection pertains to an index that the
// Progress already knows were previously acknowledged, and false is returned
// without changing the Progress.
//
// If the rejection is genuine, Next is lowered sensibly, and the Progress is
// cleared for sending log entries.
func (pr *Progress) MaybeDecrTo(rejected, last uint64) bool {
if pr.State == StateReplicate {
// The rejection must be stale if the progress has matched and "rejected"
// is smaller than "match".
if rejected <= pr.Match {
return false
}
// Directly decrease next to match + 1.
//
// TODO(tbg): why not use last if it's larger?
pr.Next = pr.Match + 1
return true
}
// The rejection must be stale if "rejected" does not match next - 1. This
// is because non-replicating followers are probed one entry at a time.
if pr.Next-1 != rejected {
return false
}
if pr.Next = min(rejected, last+1); pr.Next < 1 {
pr.Next = 1
}
pr.ProbeSent = false
return true
}
// IsPaused returns whether sending log entries to this node has been throttled.
// This is done when a node has rejected recent MsgApps, is currently waiting
// for a snapshot, or has reached the MaxInflightMsgs limit. In normal
// operation, this is false. A throttled node will be contacted less frequently
// until it has reached a state in which it's able to accept a steady stream of
// log entries again.
func (pr *Progress) IsPaused() bool {
switch pr.State {
case StateProbe:
return pr.ProbeSent
case StateReplicate:
return pr.Inflights.Full()
case StateSnapshot:
return true
default:
panic("unexpected state")
}
}
func (pr *Progress) String() string {
var buf strings.Builder
fmt.Fprintf(&buf, "%s match=%d next=%d", pr.State, pr.Match, pr.Next)
if pr.IsLearner {
fmt.Fprint(&buf, " learner")
}
if pr.IsPaused() {
fmt.Fprint(&buf, " paused")
}
if pr.PendingSnapshot > 0 {
fmt.Fprintf(&buf, " pendingSnap=%d", pr.PendingSnapshot)
}
if !pr.RecentActive {
fmt.Fprintf(&buf, " inactive")
}
if n := pr.Inflights.Count(); n > 0 {
fmt.Fprintf(&buf, " inflight=%d", n)
if pr.Inflights.Full() {
fmt.Fprint(&buf, "[full]")
}
}
return buf.String()
}
// ProgressMap is a map of *Progress.
type ProgressMap map[uint64]*Progress
// String prints the ProgressMap in sorted key order, one Progress per line.
func (m ProgressMap) String() string {
ids := make([]uint64, 0, len(m))
for k := range m {
ids = append(ids, k)
}
sort.Slice(ids, func(i, j int) bool {
return ids[i] < ids[j]
})
var buf strings.Builder
for _, id := range ids {
fmt.Fprintf(&buf, "%d: %s\n", id, m[id])
}
return buf.String()
}

42
vendor/go.etcd.io/etcd/raft/tracker/state.go generated vendored Normal file
View File

@ -0,0 +1,42 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tracker
// StateType is the state of a tracked follower.
type StateType uint64
const (
// StateProbe indicates a follower whose last index isn't known. Such a
// follower is "probed" (i.e. an append sent periodically) to narrow down
// its last index. In the ideal (and common) case, only one round of probing
// is necessary as the follower will react with a hint. Followers that are
// probed over extended periods of time are often offline.
StateProbe StateType = iota
// StateReplicate is the state steady in which a follower eagerly receives
// log entries to append to its log.
StateReplicate
// StateSnapshot indicates a follower that needs log entries not available
// from the leader's Raft log. Such a follower needs a full snapshot to
// return to StateReplicate.
StateSnapshot
)
var prstmap = [...]string{
"StateProbe",
"StateReplicate",
"StateSnapshot",
}
func (st StateType) String() string { return prstmap[uint64(st)] }

288
vendor/go.etcd.io/etcd/raft/tracker/tracker.go generated vendored Normal file
View File

@ -0,0 +1,288 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tracker
import (
"fmt"
"sort"
"strings"
"go.etcd.io/etcd/raft/quorum"
pb "go.etcd.io/etcd/raft/raftpb"
)
// Config reflects the configuration tracked in a ProgressTracker.
type Config struct {
Voters quorum.JointConfig
// AutoLeave is true if the configuration is joint and a transition to the
// incoming configuration should be carried out automatically by Raft when
// this is possible. If false, the configuration will be joint until the
// application initiates the transition manually.
AutoLeave bool
// Learners is a set of IDs corresponding to the learners active in the
// current configuration.
//
// Invariant: Learners and Voters does not intersect, i.e. if a peer is in
// either half of the joint config, it can't be a learner; if it is a
// learner it can't be in either half of the joint config. This invariant
// simplifies the implementation since it allows peers to have clarity about
// its current role without taking into account joint consensus.
Learners map[uint64]struct{}
// When we turn a voter into a learner during a joint consensus transition,
// we cannot add the learner directly when entering the joint state. This is
// because this would violate the invariant that the intersection of
// voters and learners is empty. For example, assume a Voter is removed and
// immediately re-added as a learner (or in other words, it is demoted):
//
// Initially, the configuration will be
//
// voters: {1 2 3}
// learners: {}
//
// and we want to demote 3. Entering the joint configuration, we naively get
//
// voters: {1 2} & {1 2 3}
// learners: {3}
//
// but this violates the invariant (3 is both voter and learner). Instead,
// we get
//
// voters: {1 2} & {1 2 3}
// learners: {}
// next_learners: {3}
//
// Where 3 is now still purely a voter, but we are remembering the intention
// to make it a learner upon transitioning into the final configuration:
//
// voters: {1 2}
// learners: {3}
// next_learners: {}
//
// Note that next_learners is not used while adding a learner that is not
// also a voter in the joint config. In this case, the learner is added
// right away when entering the joint configuration, so that it is caught up
// as soon as possible.
LearnersNext map[uint64]struct{}
}
func (c Config) String() string {
var buf strings.Builder
fmt.Fprintf(&buf, "voters=%s", c.Voters)
if c.Learners != nil {
fmt.Fprintf(&buf, " learners=%s", quorum.MajorityConfig(c.Learners).String())
}
if c.LearnersNext != nil {
fmt.Fprintf(&buf, " learners_next=%s", quorum.MajorityConfig(c.LearnersNext).String())
}
if c.AutoLeave {
fmt.Fprintf(&buf, " autoleave")
}
return buf.String()
}
// Clone returns a copy of the Config that shares no memory with the original.
func (c *Config) Clone() Config {
clone := func(m map[uint64]struct{}) map[uint64]struct{} {
if m == nil {
return nil
}
mm := make(map[uint64]struct{}, len(m))
for k := range m {
mm[k] = struct{}{}
}
return mm
}
return Config{
Voters: quorum.JointConfig{clone(c.Voters[0]), clone(c.Voters[1])},
Learners: clone(c.Learners),
LearnersNext: clone(c.LearnersNext),
}
}
// ProgressTracker tracks the currently active configuration and the information
// known about the nodes and learners in it. In particular, it tracks the match
// index for each peer which in turn allows reasoning about the committed index.
type ProgressTracker struct {
Config
Progress ProgressMap
Votes map[uint64]bool
MaxInflight int
}
// MakeProgressTracker initializes a ProgressTracker.
func MakeProgressTracker(maxInflight int) ProgressTracker {
p := ProgressTracker{
MaxInflight: maxInflight,
Config: Config{
Voters: quorum.JointConfig{
quorum.MajorityConfig{},
nil, // only populated when used
},
Learners: nil, // only populated when used
LearnersNext: nil, // only populated when used
},
Votes: map[uint64]bool{},
Progress: map[uint64]*Progress{},
}
return p
}
// ConfState returns a ConfState representing the active configuration.
func (p *ProgressTracker) ConfState() pb.ConfState {
return pb.ConfState{
Voters: p.Voters[0].Slice(),
VotersOutgoing: p.Voters[1].Slice(),
Learners: quorum.MajorityConfig(p.Learners).Slice(),
LearnersNext: quorum.MajorityConfig(p.LearnersNext).Slice(),
AutoLeave: p.AutoLeave,
}
}
// IsSingleton returns true if (and only if) there is only one voting member
// (i.e. the leader) in the current configuration.
func (p *ProgressTracker) IsSingleton() bool {
return len(p.Voters[0]) == 1 && len(p.Voters[1]) == 0
}
type matchAckIndexer map[uint64]*Progress
var _ quorum.AckedIndexer = matchAckIndexer(nil)
// AckedIndex implements IndexLookuper.
func (l matchAckIndexer) AckedIndex(id uint64) (quorum.Index, bool) {
pr, ok := l[id]
if !ok {
return 0, false
}
return quorum.Index(pr.Match), true
}
// Committed returns the largest log index known to be committed based on what
// the voting members of the group have acknowledged.
func (p *ProgressTracker) Committed() uint64 {
return uint64(p.Voters.CommittedIndex(matchAckIndexer(p.Progress)))
}
func insertionSort(sl []uint64) {
a, b := 0, len(sl)
for i := a + 1; i < b; i++ {
for j := i; j > a && sl[j] < sl[j-1]; j-- {
sl[j], sl[j-1] = sl[j-1], sl[j]
}
}
}
// Visit invokes the supplied closure for all tracked progresses in stable order.
func (p *ProgressTracker) Visit(f func(id uint64, pr *Progress)) {
n := len(p.Progress)
// We need to sort the IDs and don't want to allocate since this is hot code.
// The optimization here mirrors that in `(MajorityConfig).CommittedIndex`,
// see there for details.
var sl [7]uint64
ids := sl[:]
if len(sl) >= n {
ids = sl[:n]
} else {
ids = make([]uint64, n)
}
for id := range p.Progress {
n--
ids[n] = id
}
insertionSort(ids)
for _, id := range ids {
f(id, p.Progress[id])
}
}
// QuorumActive returns true if the quorum is active from the view of the local
// raft state machine. Otherwise, it returns false.
func (p *ProgressTracker) QuorumActive() bool {
votes := map[uint64]bool{}
p.Visit(func(id uint64, pr *Progress) {
if pr.IsLearner {
return
}
votes[id] = pr.RecentActive
})
return p.Voters.VoteResult(votes) == quorum.VoteWon
}
// VoterNodes returns a sorted slice of voters.
func (p *ProgressTracker) VoterNodes() []uint64 {
m := p.Voters.IDs()
nodes := make([]uint64, 0, len(m))
for id := range m {
nodes = append(nodes, id)
}
sort.Slice(nodes, func(i, j int) bool { return nodes[i] < nodes[j] })
return nodes
}
// LearnerNodes returns a sorted slice of learners.
func (p *ProgressTracker) LearnerNodes() []uint64 {
if len(p.Learners) == 0 {
return nil
}
nodes := make([]uint64, 0, len(p.Learners))
for id := range p.Learners {
nodes = append(nodes, id)
}
sort.Slice(nodes, func(i, j int) bool { return nodes[i] < nodes[j] })
return nodes
}
// ResetVotes prepares for a new round of vote counting via recordVote.
func (p *ProgressTracker) ResetVotes() {
p.Votes = map[uint64]bool{}
}
// RecordVote records that the node with the given id voted for this Raft
// instance if v == true (and declined it otherwise).
func (p *ProgressTracker) RecordVote(id uint64, v bool) {
_, ok := p.Votes[id]
if !ok {
p.Votes[id] = v
}
}
// TallyVotes returns the number of granted and rejected Votes, and whether the
// election outcome is known.
func (p *ProgressTracker) TallyVotes() (granted int, rejected int, _ quorum.VoteResult) {
// Make sure to populate granted/rejected correctly even if the Votes slice
// contains members no longer part of the configuration. This doesn't really
// matter in the way the numbers are used (they're informational), but might
// as well get it right.
for id, pr := range p.Progress {
if pr.IsLearner {
continue
}
v, voted := p.Votes[id]
if !voted {
continue
}
if v {
granted++
} else {
rejected++
}
}
result := p.Voters.VoteResult(p.Votes)
return granted, rejected, result
}

233
vendor/go.etcd.io/etcd/raft/util.go generated vendored Normal file
View File

@ -0,0 +1,233 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"bytes"
"fmt"
"strings"
pb "go.etcd.io/etcd/raft/raftpb"
)
func (st StateType) MarshalJSON() ([]byte, error) {
return []byte(fmt.Sprintf("%q", st.String())), nil
}
func min(a, b uint64) uint64 {
if a > b {
return b
}
return a
}
func max(a, b uint64) uint64 {
if a > b {
return a
}
return b
}
func IsLocalMsg(msgt pb.MessageType) bool {
return msgt == pb.MsgHup || msgt == pb.MsgBeat || msgt == pb.MsgUnreachable ||
msgt == pb.MsgSnapStatus || msgt == pb.MsgCheckQuorum
}
func IsResponseMsg(msgt pb.MessageType) bool {
return msgt == pb.MsgAppResp || msgt == pb.MsgVoteResp || msgt == pb.MsgHeartbeatResp || msgt == pb.MsgUnreachable || msgt == pb.MsgPreVoteResp
}
// voteResponseType maps vote and prevote message types to their corresponding responses.
func voteRespMsgType(msgt pb.MessageType) pb.MessageType {
switch msgt {
case pb.MsgVote:
return pb.MsgVoteResp
case pb.MsgPreVote:
return pb.MsgPreVoteResp
default:
panic(fmt.Sprintf("not a vote message: %s", msgt))
}
}
func DescribeHardState(hs pb.HardState) string {
var buf strings.Builder
fmt.Fprintf(&buf, "Term:%d", hs.Term)
if hs.Vote != 0 {
fmt.Fprintf(&buf, " Vote:%d", hs.Vote)
}
fmt.Fprintf(&buf, " Commit:%d", hs.Commit)
return buf.String()
}
func DescribeSoftState(ss SoftState) string {
return fmt.Sprintf("Lead:%d State:%s", ss.Lead, ss.RaftState)
}
func DescribeConfState(state pb.ConfState) string {
return fmt.Sprintf(
"Voters:%v VotersOutgoing:%v Learners:%v LearnersNext:%v AutoLeave:%v",
state.Voters, state.VotersOutgoing, state.Learners, state.LearnersNext, state.AutoLeave,
)
}
func DescribeSnapshot(snap pb.Snapshot) string {
m := snap.Metadata
return fmt.Sprintf("Index:%d Term:%d ConfState:%s", m.Index, m.Term, DescribeConfState(m.ConfState))
}
func DescribeReady(rd Ready, f EntryFormatter) string {
var buf strings.Builder
if rd.SoftState != nil {
fmt.Fprint(&buf, DescribeSoftState(*rd.SoftState))
buf.WriteByte('\n')
}
if !IsEmptyHardState(rd.HardState) {
fmt.Fprintf(&buf, "HardState %s", DescribeHardState(rd.HardState))
buf.WriteByte('\n')
}
if len(rd.ReadStates) > 0 {
fmt.Fprintf(&buf, "ReadStates %v\n", rd.ReadStates)
}
if len(rd.Entries) > 0 {
buf.WriteString("Entries:\n")
fmt.Fprint(&buf, DescribeEntries(rd.Entries, f))
}
if !IsEmptySnap(rd.Snapshot) {
fmt.Fprintf(&buf, "Snapshot %s\n", DescribeSnapshot(rd.Snapshot))
}
if len(rd.CommittedEntries) > 0 {
buf.WriteString("CommittedEntries:\n")
fmt.Fprint(&buf, DescribeEntries(rd.CommittedEntries, f))
}
if len(rd.Messages) > 0 {
buf.WriteString("Messages:\n")
for _, msg := range rd.Messages {
fmt.Fprint(&buf, DescribeMessage(msg, f))
buf.WriteByte('\n')
}
}
if buf.Len() > 0 {
return fmt.Sprintf("Ready MustSync=%t:\n%s", rd.MustSync, buf.String())
}
return "<empty Ready>"
}
// EntryFormatter can be implemented by the application to provide human-readable formatting
// of entry data. Nil is a valid EntryFormatter and will use a default format.
type EntryFormatter func([]byte) string
// DescribeMessage returns a concise human-readable description of a
// Message for debugging.
func DescribeMessage(m pb.Message, f EntryFormatter) string {
var buf bytes.Buffer
fmt.Fprintf(&buf, "%x->%x %v Term:%d Log:%d/%d", m.From, m.To, m.Type, m.Term, m.LogTerm, m.Index)
if m.Reject {
fmt.Fprintf(&buf, " Rejected (Hint: %d)", m.RejectHint)
}
if m.Commit != 0 {
fmt.Fprintf(&buf, " Commit:%d", m.Commit)
}
if len(m.Entries) > 0 {
fmt.Fprintf(&buf, " Entries:[")
for i, e := range m.Entries {
if i != 0 {
buf.WriteString(", ")
}
buf.WriteString(DescribeEntry(e, f))
}
fmt.Fprintf(&buf, "]")
}
if !IsEmptySnap(m.Snapshot) {
fmt.Fprintf(&buf, " Snapshot: %s", DescribeSnapshot(m.Snapshot))
}
return buf.String()
}
// PayloadSize is the size of the payload of this Entry. Notably, it does not
// depend on its Index or Term.
func PayloadSize(e pb.Entry) int {
return len(e.Data)
}
// DescribeEntry returns a concise human-readable description of an
// Entry for debugging.
func DescribeEntry(e pb.Entry, f EntryFormatter) string {
if f == nil {
f = func(data []byte) string { return fmt.Sprintf("%q", data) }
}
formatConfChange := func(cc pb.ConfChangeI) string {
// TODO(tbg): give the EntryFormatter a type argument so that it gets
// a chance to expose the Context.
return pb.ConfChangesToString(cc.AsV2().Changes)
}
var formatted string
switch e.Type {
case pb.EntryNormal:
formatted = f(e.Data)
case pb.EntryConfChange:
var cc pb.ConfChange
if err := cc.Unmarshal(e.Data); err != nil {
formatted = err.Error()
} else {
formatted = formatConfChange(cc)
}
case pb.EntryConfChangeV2:
var cc pb.ConfChangeV2
if err := cc.Unmarshal(e.Data); err != nil {
formatted = err.Error()
} else {
formatted = formatConfChange(cc)
}
}
if formatted != "" {
formatted = " " + formatted
}
return fmt.Sprintf("%d/%d %s%s", e.Term, e.Index, e.Type, formatted)
}
// DescribeEntries calls DescribeEntry for each Entry, adding a newline to
// each.
func DescribeEntries(ents []pb.Entry, f EntryFormatter) string {
var buf bytes.Buffer
for _, e := range ents {
_, _ = buf.WriteString(DescribeEntry(e, f) + "\n")
}
return buf.String()
}
func limitSize(ents []pb.Entry, maxSize uint64) []pb.Entry {
if len(ents) == 0 {
return ents
}
size := ents[0].Size()
var limit int
for limit = 1; limit < len(ents); limit++ {
size += ents[limit].Size()
if uint64(size) > maxSize {
break
}
}
return ents[:limit]
}
func assertConfStatesEquivalent(l Logger, cs1, cs2 pb.ConfState) {
err := cs1.Equivalent(cs2)
if err == nil {
return
}
l.Panic(err)
}