mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-06-13 10:33:35 +00:00
Migrate from dep to go module
Signed-off-by: Madhu Rajanna <madhupr007@gmail.com>
This commit is contained in:
committed by
mergify[bot]
parent
a9174dd953
commit
d5a0606c33
1
vendor/go.etcd.io/etcd/Documentation/README.md
generated
vendored
1
vendor/go.etcd.io/etcd/Documentation/README.md
generated
vendored
@ -1 +0,0 @@
|
||||
docs.md
|
42
vendor/go.etcd.io/etcd/auth/authpb/auth.proto
generated
vendored
Normal file
42
vendor/go.etcd.io/etcd/auth/authpb/auth.proto
generated
vendored
Normal file
@ -0,0 +1,42 @@
|
||||
syntax = "proto3";
|
||||
package authpb;
|
||||
|
||||
import "gogoproto/gogo.proto";
|
||||
|
||||
option (gogoproto.marshaler_all) = true;
|
||||
option (gogoproto.sizer_all) = true;
|
||||
option (gogoproto.unmarshaler_all) = true;
|
||||
option (gogoproto.goproto_getters_all) = false;
|
||||
option (gogoproto.goproto_enum_prefix_all) = false;
|
||||
|
||||
message UserAddOptions {
|
||||
bool no_password = 1;
|
||||
};
|
||||
|
||||
// User is a single entry in the bucket authUsers
|
||||
message User {
|
||||
bytes name = 1;
|
||||
bytes password = 2;
|
||||
repeated string roles = 3;
|
||||
UserAddOptions options = 4;
|
||||
}
|
||||
|
||||
// Permission is a single entity
|
||||
message Permission {
|
||||
enum Type {
|
||||
READ = 0;
|
||||
WRITE = 1;
|
||||
READWRITE = 2;
|
||||
}
|
||||
Type permType = 1;
|
||||
|
||||
bytes key = 2;
|
||||
bytes range_end = 3;
|
||||
}
|
||||
|
||||
// Role is a single entry in the bucket authRoles
|
||||
message Role {
|
||||
bytes name = 1;
|
||||
|
||||
repeated Permission keyPermission = 2;
|
||||
}
|
85
vendor/go.etcd.io/etcd/clientv3/README.md
generated
vendored
Normal file
85
vendor/go.etcd.io/etcd/clientv3/README.md
generated
vendored
Normal file
@ -0,0 +1,85 @@
|
||||
# etcd/clientv3
|
||||
|
||||
[](https://etcd.io/docs)
|
||||
[](https://godoc.org/go.etcd.io/etcd/clientv3)
|
||||
|
||||
`etcd/clientv3` is the official Go etcd client for v3.
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
go get go.etcd.io/etcd/clientv3
|
||||
```
|
||||
|
||||
## Get started
|
||||
|
||||
Create client using `clientv3.New`:
|
||||
|
||||
```go
|
||||
cli, err := clientv3.New(clientv3.Config{
|
||||
Endpoints: []string{"localhost:2379", "localhost:22379", "localhost:32379"},
|
||||
DialTimeout: 5 * time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
// handle error!
|
||||
}
|
||||
defer cli.Close()
|
||||
```
|
||||
|
||||
etcd v3 uses [`gRPC`](https://www.grpc.io) for remote procedure calls. And `clientv3` uses
|
||||
[`grpc-go`](https://github.com/grpc/grpc-go) to connect to etcd. Make sure to close the client after using it.
|
||||
If the client is not closed, the connection will have leaky goroutines. To specify client request timeout,
|
||||
pass `context.WithTimeout` to APIs:
|
||||
|
||||
```go
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
resp, err := cli.Put(ctx, "sample_key", "sample_value")
|
||||
cancel()
|
||||
if err != nil {
|
||||
// handle error!
|
||||
}
|
||||
// use the response
|
||||
```
|
||||
|
||||
For full compatibility, it is recommended to vendor builds using etcd's vendored packages, using tools like `golang/dep`, as in [vendor directories](https://golang.org/cmd/go/#hdr-Vendor_Directories).
|
||||
|
||||
## Error Handling
|
||||
|
||||
etcd client returns 2 types of errors:
|
||||
|
||||
1. context error: canceled or deadline exceeded.
|
||||
2. gRPC error: see [api/v3rpc/rpctypes](https://godoc.org/go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes).
|
||||
|
||||
Here is the example code to handle client errors:
|
||||
|
||||
```go
|
||||
resp, err := cli.Put(ctx, "", "")
|
||||
if err != nil {
|
||||
switch err {
|
||||
case context.Canceled:
|
||||
log.Fatalf("ctx is canceled by another routine: %v", err)
|
||||
case context.DeadlineExceeded:
|
||||
log.Fatalf("ctx is attached with a deadline is exceeded: %v", err)
|
||||
case rpctypes.ErrEmptyKey:
|
||||
log.Fatalf("client-side error: %v", err)
|
||||
default:
|
||||
log.Fatalf("bad cluster endpoints, which are not etcd servers: %v", err)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Metrics
|
||||
|
||||
The etcd client optionally exposes RPC metrics through [go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus). See the [examples](https://github.com/etcd-io/etcd/blob/master/clientv3/example_metrics_test.go).
|
||||
|
||||
## Namespacing
|
||||
|
||||
The [namespace](https://godoc.org/go.etcd.io/etcd/clientv3/namespace) package provides `clientv3` interface wrappers to transparently isolate client requests to a user-defined prefix.
|
||||
|
||||
## Request size limit
|
||||
|
||||
Client request size limit is configurable via `clientv3.Config.MaxCallSendMsgSize` and `MaxCallRecvMsgSize` in bytes. If none given, client request send limit defaults to 2 MiB including gRPC overhead bytes. And receive limit defaults to `math.MaxInt32`.
|
||||
|
||||
## Examples
|
||||
|
||||
More code examples can be found at [GoDoc](https://godoc.org/go.etcd.io/etcd/clientv3).
|
34
vendor/go.etcd.io/etcd/etcdserver/etcdserverpb/etcdserver.proto
generated
vendored
Normal file
34
vendor/go.etcd.io/etcd/etcdserver/etcdserverpb/etcdserver.proto
generated
vendored
Normal file
@ -0,0 +1,34 @@
|
||||
syntax = "proto2";
|
||||
package etcdserverpb;
|
||||
|
||||
import "gogoproto/gogo.proto";
|
||||
|
||||
option (gogoproto.marshaler_all) = true;
|
||||
option (gogoproto.sizer_all) = true;
|
||||
option (gogoproto.unmarshaler_all) = true;
|
||||
option (gogoproto.goproto_getters_all) = false;
|
||||
|
||||
message Request {
|
||||
optional uint64 ID = 1 [(gogoproto.nullable) = false];
|
||||
optional string Method = 2 [(gogoproto.nullable) = false];
|
||||
optional string Path = 3 [(gogoproto.nullable) = false];
|
||||
optional string Val = 4 [(gogoproto.nullable) = false];
|
||||
optional bool Dir = 5 [(gogoproto.nullable) = false];
|
||||
optional string PrevValue = 6 [(gogoproto.nullable) = false];
|
||||
optional uint64 PrevIndex = 7 [(gogoproto.nullable) = false];
|
||||
optional bool PrevExist = 8 [(gogoproto.nullable) = true];
|
||||
optional int64 Expiration = 9 [(gogoproto.nullable) = false];
|
||||
optional bool Wait = 10 [(gogoproto.nullable) = false];
|
||||
optional uint64 Since = 11 [(gogoproto.nullable) = false];
|
||||
optional bool Recursive = 12 [(gogoproto.nullable) = false];
|
||||
optional bool Sorted = 13 [(gogoproto.nullable) = false];
|
||||
optional bool Quorum = 14 [(gogoproto.nullable) = false];
|
||||
optional int64 Time = 15 [(gogoproto.nullable) = false];
|
||||
optional bool Stream = 16 [(gogoproto.nullable) = false];
|
||||
optional bool Refresh = 17 [(gogoproto.nullable) = true];
|
||||
}
|
||||
|
||||
message Metadata {
|
||||
optional uint64 NodeID = 1 [(gogoproto.nullable) = false];
|
||||
optional uint64 ClusterID = 2 [(gogoproto.nullable) = false];
|
||||
}
|
75
vendor/go.etcd.io/etcd/etcdserver/etcdserverpb/raft_internal.proto
generated
vendored
Normal file
75
vendor/go.etcd.io/etcd/etcdserver/etcdserverpb/raft_internal.proto
generated
vendored
Normal file
@ -0,0 +1,75 @@
|
||||
syntax = "proto3";
|
||||
package etcdserverpb;
|
||||
|
||||
import "gogoproto/gogo.proto";
|
||||
import "etcdserver.proto";
|
||||
import "rpc.proto";
|
||||
|
||||
option (gogoproto.marshaler_all) = true;
|
||||
option (gogoproto.sizer_all) = true;
|
||||
option (gogoproto.unmarshaler_all) = true;
|
||||
option (gogoproto.goproto_getters_all) = false;
|
||||
|
||||
message RequestHeader {
|
||||
uint64 ID = 1;
|
||||
// username is a username that is associated with an auth token of gRPC connection
|
||||
string username = 2;
|
||||
// auth_revision is a revision number of auth.authStore. It is not related to mvcc
|
||||
uint64 auth_revision = 3;
|
||||
}
|
||||
|
||||
// An InternalRaftRequest is the union of all requests which can be
|
||||
// sent via raft.
|
||||
message InternalRaftRequest {
|
||||
RequestHeader header = 100;
|
||||
uint64 ID = 1;
|
||||
|
||||
Request v2 = 2;
|
||||
|
||||
RangeRequest range = 3;
|
||||
PutRequest put = 4;
|
||||
DeleteRangeRequest delete_range = 5;
|
||||
TxnRequest txn = 6;
|
||||
CompactionRequest compaction = 7;
|
||||
|
||||
LeaseGrantRequest lease_grant = 8;
|
||||
LeaseRevokeRequest lease_revoke = 9;
|
||||
|
||||
AlarmRequest alarm = 10;
|
||||
|
||||
LeaseCheckpointRequest lease_checkpoint = 11;
|
||||
|
||||
AuthEnableRequest auth_enable = 1000;
|
||||
AuthDisableRequest auth_disable = 1011;
|
||||
|
||||
InternalAuthenticateRequest authenticate = 1012;
|
||||
|
||||
AuthUserAddRequest auth_user_add = 1100;
|
||||
AuthUserDeleteRequest auth_user_delete = 1101;
|
||||
AuthUserGetRequest auth_user_get = 1102;
|
||||
AuthUserChangePasswordRequest auth_user_change_password = 1103;
|
||||
AuthUserGrantRoleRequest auth_user_grant_role = 1104;
|
||||
AuthUserRevokeRoleRequest auth_user_revoke_role = 1105;
|
||||
AuthUserListRequest auth_user_list = 1106;
|
||||
AuthRoleListRequest auth_role_list = 1107;
|
||||
|
||||
AuthRoleAddRequest auth_role_add = 1200;
|
||||
AuthRoleDeleteRequest auth_role_delete = 1201;
|
||||
AuthRoleGetRequest auth_role_get = 1202;
|
||||
AuthRoleGrantPermissionRequest auth_role_grant_permission = 1203;
|
||||
AuthRoleRevokePermissionRequest auth_role_revoke_permission = 1204;
|
||||
}
|
||||
|
||||
message EmptyResponse {
|
||||
}
|
||||
|
||||
// What is the difference between AuthenticateRequest (defined in rpc.proto) and InternalAuthenticateRequest?
|
||||
// InternalAuthenticateRequest has a member that is filled by etcdserver and shouldn't be user-facing.
|
||||
// For avoiding misusage the field, we have an internal version of AuthenticateRequest.
|
||||
message InternalAuthenticateRequest {
|
||||
string name = 1;
|
||||
string password = 2;
|
||||
|
||||
// simple_token is generated in API layer (etcdserver/v3_server.go)
|
||||
string simple_token = 3;
|
||||
}
|
1146
vendor/go.etcd.io/etcd/etcdserver/etcdserverpb/rpc.proto
generated
vendored
Normal file
1146
vendor/go.etcd.io/etcd/etcdserver/etcdserverpb/rpc.proto
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
49
vendor/go.etcd.io/etcd/mvcc/mvccpb/kv.proto
generated
vendored
Normal file
49
vendor/go.etcd.io/etcd/mvcc/mvccpb/kv.proto
generated
vendored
Normal file
@ -0,0 +1,49 @@
|
||||
syntax = "proto3";
|
||||
package mvccpb;
|
||||
|
||||
import "gogoproto/gogo.proto";
|
||||
|
||||
option (gogoproto.marshaler_all) = true;
|
||||
option (gogoproto.sizer_all) = true;
|
||||
option (gogoproto.unmarshaler_all) = true;
|
||||
option (gogoproto.goproto_getters_all) = false;
|
||||
option (gogoproto.goproto_enum_prefix_all) = false;
|
||||
|
||||
message KeyValue {
|
||||
// key is the key in bytes. An empty key is not allowed.
|
||||
bytes key = 1;
|
||||
// create_revision is the revision of last creation on this key.
|
||||
int64 create_revision = 2;
|
||||
// mod_revision is the revision of last modification on this key.
|
||||
int64 mod_revision = 3;
|
||||
// version is the version of the key. A deletion resets
|
||||
// the version to zero and any modification of the key
|
||||
// increases its version.
|
||||
int64 version = 4;
|
||||
// value is the value held by the key, in bytes.
|
||||
bytes value = 5;
|
||||
// lease is the ID of the lease that attached to key.
|
||||
// When the attached lease expires, the key will be deleted.
|
||||
// If lease is 0, then no lease is attached to the key.
|
||||
int64 lease = 6;
|
||||
}
|
||||
|
||||
message Event {
|
||||
enum EventType {
|
||||
PUT = 0;
|
||||
DELETE = 1;
|
||||
}
|
||||
// type is the kind of event. If type is a PUT, it indicates
|
||||
// new data has been stored to the key. If type is a DELETE,
|
||||
// it indicates the key was deleted.
|
||||
EventType type = 1;
|
||||
// kv holds the KeyValue for the event.
|
||||
// A PUT event contains current kv pair.
|
||||
// A PUT event with kv.Version=1 indicates the creation of a key.
|
||||
// A DELETE/EXPIRE event contains the deleted key with
|
||||
// its modification revision set to the revision of deletion.
|
||||
KeyValue kv = 2;
|
||||
|
||||
// prev_kv holds the key-value pair before the event happens.
|
||||
KeyValue prev_kv = 3;
|
||||
}
|
19
vendor/go.etcd.io/etcd/raft/OWNERS
generated
vendored
Normal file
19
vendor/go.etcd.io/etcd/raft/OWNERS
generated
vendored
Normal file
@ -0,0 +1,19 @@
|
||||
approvers:
|
||||
- heyitsanthony
|
||||
- philips
|
||||
- fanminshi
|
||||
- gyuho
|
||||
- mitake
|
||||
- jpbetz
|
||||
- xiang90
|
||||
- bdarnell
|
||||
reviewers:
|
||||
- heyitsanthony
|
||||
- philips
|
||||
- fanminshi
|
||||
- gyuho
|
||||
- mitake
|
||||
- jpbetz
|
||||
- xiang90
|
||||
- bdarnell
|
||||
- tschottdorf
|
197
vendor/go.etcd.io/etcd/raft/README.md
generated
vendored
Normal file
197
vendor/go.etcd.io/etcd/raft/README.md
generated
vendored
Normal file
@ -0,0 +1,197 @@
|
||||
# Raft library
|
||||
|
||||
Raft is a protocol with which a cluster of nodes can maintain a replicated state machine.
|
||||
The state machine is kept in sync through the use of a replicated log.
|
||||
For more details on Raft, see "In Search of an Understandable Consensus Algorithm"
|
||||
(https://raft.github.io/raft.pdf) by Diego Ongaro and John Ousterhout.
|
||||
|
||||
This Raft library is stable and feature complete. As of 2016, it is **the most widely used** Raft library in production, serving tens of thousands clusters each day. It powers distributed systems such as etcd, Kubernetes, Docker Swarm, Cloud Foundry Diego, CockroachDB, TiDB, Project Calico, Flannel, and more.
|
||||
|
||||
Most Raft implementations have a monolithic design, including storage handling, messaging serialization, and network transport. This library instead follows a minimalistic design philosophy by only implementing the core raft algorithm. This minimalism buys flexibility, determinism, and performance.
|
||||
|
||||
To keep the codebase small as well as provide flexibility, the library only implements the Raft algorithm; both network and disk IO are left to the user. Library users must implement their own transportation layer for message passing between Raft peers over the wire. Similarly, users must implement their own storage layer to persist the Raft log and state.
|
||||
|
||||
In order to easily test the Raft library, its behavior should be deterministic. To achieve this determinism, the library models Raft as a state machine. The state machine takes a `Message` as input. A message can either be a local timer update or a network message sent from a remote peer. The state machine's output is a 3-tuple `{[]Messages, []LogEntries, NextState}` consisting of an array of `Messages`, `log entries`, and `Raft state changes`. For state machines with the same state, the same state machine input should always generate the same state machine output.
|
||||
|
||||
A simple example application, _raftexample_, is also available to help illustrate how to use this package in practice: https://github.com/etcd-io/etcd/tree/master/contrib/raftexample
|
||||
|
||||
# Features
|
||||
|
||||
This raft implementation is a full feature implementation of Raft protocol. Features includes:
|
||||
|
||||
- Leader election
|
||||
- Log replication
|
||||
- Log compaction
|
||||
- Membership changes
|
||||
- Leadership transfer extension
|
||||
- Efficient linearizable read-only queries served by both the leader and followers
|
||||
- leader checks with quorum and bypasses Raft log before processing read-only queries
|
||||
- followers asks leader to get a safe read index before processing read-only queries
|
||||
- More efficient lease-based linearizable read-only queries served by both the leader and followers
|
||||
- leader bypasses Raft log and processing read-only queries locally
|
||||
- followers asks leader to get a safe read index before processing read-only queries
|
||||
- this approach relies on the clock of the all the machines in raft group
|
||||
|
||||
This raft implementation also includes a few optional enhancements:
|
||||
|
||||
- Optimistic pipelining to reduce log replication latency
|
||||
- Flow control for log replication
|
||||
- Batching Raft messages to reduce synchronized network I/O calls
|
||||
- Batching log entries to reduce disk synchronized I/O
|
||||
- Writing to leader's disk in parallel
|
||||
- Internal proposal redirection from followers to leader
|
||||
- Automatic stepping down when the leader loses quorum
|
||||
- Protection against unbounded log growth when quorum is lost
|
||||
|
||||
## Notable Users
|
||||
|
||||
- [cockroachdb](https://github.com/cockroachdb/cockroach) A Scalable, Survivable, Strongly-Consistent SQL Database
|
||||
- [dgraph](https://github.com/dgraph-io/dgraph) A Scalable, Distributed, Low Latency, High Throughput Graph Database
|
||||
- [etcd](https://github.com/etcd-io/etcd) A distributed reliable key-value store
|
||||
- [tikv](https://github.com/pingcap/tikv) A Distributed transactional key value database powered by Rust and Raft
|
||||
- [swarmkit](https://github.com/docker/swarmkit) A toolkit for orchestrating distributed systems at any scale.
|
||||
- [chain core](https://github.com/chain/chain) Software for operating permissioned, multi-asset blockchain networks
|
||||
|
||||
## Usage
|
||||
|
||||
The primary object in raft is a Node. Either start a Node from scratch using raft.StartNode or start a Node from some initial state using raft.RestartNode.
|
||||
|
||||
To start a three-node cluster
|
||||
```go
|
||||
storage := raft.NewMemoryStorage()
|
||||
c := &Config{
|
||||
ID: 0x01,
|
||||
ElectionTick: 10,
|
||||
HeartbeatTick: 1,
|
||||
Storage: storage,
|
||||
MaxSizePerMsg: 4096,
|
||||
MaxInflightMsgs: 256,
|
||||
}
|
||||
// Set peer list to the other nodes in the cluster.
|
||||
// Note that they need to be started separately as well.
|
||||
n := raft.StartNode(c, []raft.Peer{{ID: 0x02}, {ID: 0x03}})
|
||||
```
|
||||
|
||||
Start a single node cluster, like so:
|
||||
```go
|
||||
// Create storage and config as shown above.
|
||||
// Set peer list to itself, so this node can become the leader of this single-node cluster.
|
||||
peers := []raft.Peer{{ID: 0x01}}
|
||||
n := raft.StartNode(c, peers)
|
||||
```
|
||||
|
||||
To allow a new node to join this cluster, do not pass in any peers. First, add the node to the existing cluster by calling `ProposeConfChange` on any existing node inside the cluster. Then, start the node with an empty peer list, like so:
|
||||
```go
|
||||
// Create storage and config as shown above.
|
||||
n := raft.StartNode(c, nil)
|
||||
```
|
||||
|
||||
To restart a node from previous state:
|
||||
```go
|
||||
storage := raft.NewMemoryStorage()
|
||||
|
||||
// Recover the in-memory storage from persistent snapshot, state and entries.
|
||||
storage.ApplySnapshot(snapshot)
|
||||
storage.SetHardState(state)
|
||||
storage.Append(entries)
|
||||
|
||||
c := &Config{
|
||||
ID: 0x01,
|
||||
ElectionTick: 10,
|
||||
HeartbeatTick: 1,
|
||||
Storage: storage,
|
||||
MaxSizePerMsg: 4096,
|
||||
MaxInflightMsgs: 256,
|
||||
}
|
||||
|
||||
// Restart raft without peer information.
|
||||
// Peer information is already included in the storage.
|
||||
n := raft.RestartNode(c)
|
||||
```
|
||||
|
||||
After creating a Node, the user has a few responsibilities:
|
||||
|
||||
First, read from the Node.Ready() channel and process the updates it contains. These steps may be performed in parallel, except as noted in step 2.
|
||||
|
||||
1. Write Entries, HardState and Snapshot to persistent storage in order, i.e. Entries first, then HardState and Snapshot if they are not empty. If persistent storage supports atomic writes then all of them can be written together. Note that when writing an Entry with Index i, any previously-persisted entries with Index >= i must be discarded.
|
||||
|
||||
2. Send all Messages to the nodes named in the To field. It is important that no messages be sent until the latest HardState has been persisted to disk, and all Entries written by any previous Ready batch (Messages may be sent while entries from the same batch are being persisted). To reduce the I/O latency, an optimization can be applied to make leader write to disk in parallel with its followers (as explained at section 10.2.1 in Raft thesis). If any Message has type MsgSnap, call Node.ReportSnapshot() after it has been sent (these messages may be large). Note: Marshalling messages is not thread-safe; it is important to make sure that no new entries are persisted while marshalling. The easiest way to achieve this is to serialise the messages directly inside the main raft loop.
|
||||
|
||||
3. Apply Snapshot (if any) and CommittedEntries to the state machine. If any committed Entry has Type EntryConfChange, call Node.ApplyConfChange() to apply it to the node. The configuration change may be cancelled at this point by setting the NodeID field to zero before calling ApplyConfChange (but ApplyConfChange must be called one way or the other, and the decision to cancel must be based solely on the state machine and not external information such as the observed health of the node).
|
||||
|
||||
4. Call Node.Advance() to signal readiness for the next batch of updates. This may be done at any time after step 1, although all updates must be processed in the order they were returned by Ready.
|
||||
|
||||
Second, all persisted log entries must be made available via an implementation of the Storage interface. The provided MemoryStorage type can be used for this (if repopulating its state upon a restart), or a custom disk-backed implementation can be supplied.
|
||||
|
||||
Third, after receiving a message from another node, pass it to Node.Step:
|
||||
|
||||
```go
|
||||
func recvRaftRPC(ctx context.Context, m raftpb.Message) {
|
||||
n.Step(ctx, m)
|
||||
}
|
||||
```
|
||||
|
||||
Finally, call `Node.Tick()` at regular intervals (probably via a `time.Ticker`). Raft has two important timeouts: heartbeat and the election timeout. However, internally to the raft package time is represented by an abstract "tick".
|
||||
|
||||
The total state machine handling loop will look something like this:
|
||||
|
||||
```go
|
||||
for {
|
||||
select {
|
||||
case <-s.Ticker:
|
||||
n.Tick()
|
||||
case rd := <-s.Node.Ready():
|
||||
saveToStorage(rd.HardState, rd.Entries, rd.Snapshot)
|
||||
send(rd.Messages)
|
||||
if !raft.IsEmptySnap(rd.Snapshot) {
|
||||
processSnapshot(rd.Snapshot)
|
||||
}
|
||||
for _, entry := range rd.CommittedEntries {
|
||||
process(entry)
|
||||
if entry.Type == raftpb.EntryConfChange {
|
||||
var cc raftpb.ConfChange
|
||||
cc.Unmarshal(entry.Data)
|
||||
s.Node.ApplyConfChange(cc)
|
||||
}
|
||||
}
|
||||
s.Node.Advance()
|
||||
case <-s.done:
|
||||
return
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
To propose changes to the state machine from the node to take application data, serialize it into a byte slice and call:
|
||||
|
||||
```go
|
||||
n.Propose(ctx, data)
|
||||
```
|
||||
|
||||
If the proposal is committed, data will appear in committed entries with type raftpb.EntryNormal. There is no guarantee that a proposed command will be committed; the command may have to be reproposed after a timeout.
|
||||
|
||||
To add or remove node in a cluster, build ConfChange struct 'cc' and call:
|
||||
|
||||
```go
|
||||
n.ProposeConfChange(ctx, cc)
|
||||
```
|
||||
|
||||
After config change is committed, some committed entry with type raftpb.EntryConfChange will be returned. This must be applied to node through:
|
||||
|
||||
```go
|
||||
var cc raftpb.ConfChange
|
||||
cc.Unmarshal(data)
|
||||
n.ApplyConfChange(cc)
|
||||
```
|
||||
|
||||
Note: An ID represents a unique node in a cluster for all time. A
|
||||
given ID MUST be used only once even if the old node has been removed.
|
||||
This means that for example IP addresses make poor node IDs since they
|
||||
may be reused. Node IDs must be non-zero.
|
||||
|
||||
## Implementation notes
|
||||
|
||||
This implementation is up to date with the final Raft thesis (https://github.com/ongardie/dissertation/blob/master/stanford.pdf), although this implementation of the membership change protocol differs somewhat from that described in chapter 4. The key invariant that membership changes happen one node at a time is preserved, but in our implementation the membership change takes effect when its entry is applied, not when it is added to the log (so the entry is committed under the old membership instead of the new). This is equivalent in terms of safety, since the old and new configurations are guaranteed to overlap.
|
||||
|
||||
To ensure there is no attempt to commit two membership changes at once by matching log positions (which would be unsafe since they should have different quorum requirements), any proposed membership change is simply disallowed while any uncommitted change appears in the leader's log.
|
||||
|
||||
This approach introduces a problem when removing a member from a two-member cluster: If one of the members dies before the other one receives the commit of the confchange entry, then the member cannot be removed any more since the cluster cannot make progress. For this reason it is highly recommended to use three or more nodes in every cluster.
|
57
vendor/go.etcd.io/etcd/raft/design.md
generated
vendored
Normal file
57
vendor/go.etcd.io/etcd/raft/design.md
generated
vendored
Normal file
@ -0,0 +1,57 @@
|
||||
## Progress
|
||||
|
||||
Progress represents a follower’s progress in the view of the leader. Leader maintains progresses of all followers, and sends `replication message` to the follower based on its progress.
|
||||
|
||||
`replication message` is a `msgApp` with log entries.
|
||||
|
||||
A progress has two attribute: `match` and `next`. `match` is the index of the highest known matched entry. If leader knows nothing about follower’s replication status, `match` is set to zero. `next` is the index of the first entry that will be replicated to the follower. Leader puts entries from `next` to its latest one in next `replication message`.
|
||||
|
||||
A progress is in one of the three state: `probe`, `replicate`, `snapshot`.
|
||||
|
||||
```
|
||||
+--------------------------------------------------------+
|
||||
| send snapshot |
|
||||
| |
|
||||
+---------+----------+ +----------v---------+
|
||||
+---> probe | | snapshot |
|
||||
| | max inflight = 1 <----------------------------------+ max inflight = 0 |
|
||||
| +---------+----------+ +--------------------+
|
||||
| | 1. snapshot success
|
||||
| | (next=snapshot.index + 1)
|
||||
| | 2. snapshot failure
|
||||
| | (no change)
|
||||
| | 3. receives msgAppResp(rej=false&&index>lastsnap.index)
|
||||
| | (match=m.index,next=match+1)
|
||||
receives msgAppResp(rej=true)
|
||||
(next=match+1)| |
|
||||
| |
|
||||
| |
|
||||
| | receives msgAppResp(rej=false&&index>match)
|
||||
| | (match=m.index,next=match+1)
|
||||
| |
|
||||
| |
|
||||
| |
|
||||
| +---------v----------+
|
||||
| | replicate |
|
||||
+---+ max inflight = n |
|
||||
+--------------------+
|
||||
```
|
||||
|
||||
When the progress of a follower is in `probe` state, leader sends at most one `replication message` per heartbeat interval. The leader sends `replication message` slowly and probing the actual progress of the follower. A `msgHeartbeatResp` or a `msgAppResp` with reject might trigger the sending of the next `replication message`.
|
||||
|
||||
When the progress of a follower is in `replicate` state, leader sends `replication message`, then optimistically increases `next` to the latest entry sent. This is an optimized state for fast replicating log entries to the follower.
|
||||
|
||||
When the progress of a follower is in `snapshot` state, leader stops sending any `replication message`.
|
||||
|
||||
A newly elected leader sets the progresses of all the followers to `probe` state with `match` = 0 and `next` = last index. The leader slowly (at most once per heartbeat) sends `replication message` to the follower and probes its progress.
|
||||
|
||||
A progress changes to `replicate` when the follower replies with a non-rejection `msgAppResp`, which implies that it has matched the index sent. At this point, leader starts to stream log entries to the follower fast. The progress will fall back to `probe` when the follower replies a rejection `msgAppResp` or the link layer reports the follower is unreachable. We aggressively reset `next` to `match`+1 since if we receive any `msgAppResp` soon, both `match` and `next` will increase directly to the `index` in `msgAppResp`. (We might end up with sending some duplicate entries when aggressively reset `next` too low. see open question)
|
||||
|
||||
A progress changes from `probe` to `snapshot` when the follower falls very far behind and requires a snapshot. After sending `msgSnap`, the leader waits until the success, failure or abortion of the previous snapshot sent. The progress will go back to `probe` after the sending result is applied.
|
||||
|
||||
### Flow Control
|
||||
|
||||
1. limit the max size of message sent per message. Max should be configurable.
|
||||
Lower the cost at probing state as we limit the size per message; lower the penalty when aggressively decreased to a too low `next`
|
||||
|
||||
2. limit the # of in flight messages < N when in `replicate` state. N should be configurable. Most implementation will have a sending buffer on top of its actual network transport layer (not blocking raft node). We want to make sure raft does not overflow that buffer, which can cause message dropping and triggering a bunch of unnecessary resending repeatedly.
|
177
vendor/go.etcd.io/etcd/raft/raftpb/raft.proto
generated
vendored
Normal file
177
vendor/go.etcd.io/etcd/raft/raftpb/raft.proto
generated
vendored
Normal file
@ -0,0 +1,177 @@
|
||||
syntax = "proto2";
|
||||
package raftpb;
|
||||
|
||||
import "gogoproto/gogo.proto";
|
||||
|
||||
option (gogoproto.marshaler_all) = true;
|
||||
option (gogoproto.sizer_all) = true;
|
||||
option (gogoproto.unmarshaler_all) = true;
|
||||
option (gogoproto.goproto_getters_all) = false;
|
||||
option (gogoproto.goproto_enum_prefix_all) = false;
|
||||
|
||||
enum EntryType {
|
||||
EntryNormal = 0;
|
||||
EntryConfChange = 1; // corresponds to pb.ConfChange
|
||||
EntryConfChangeV2 = 2; // corresponds to pb.ConfChangeV2
|
||||
}
|
||||
|
||||
message Entry {
|
||||
optional uint64 Term = 2 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations
|
||||
optional uint64 Index = 3 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations
|
||||
optional EntryType Type = 1 [(gogoproto.nullable) = false];
|
||||
optional bytes Data = 4;
|
||||
}
|
||||
|
||||
message SnapshotMetadata {
|
||||
optional ConfState conf_state = 1 [(gogoproto.nullable) = false];
|
||||
optional uint64 index = 2 [(gogoproto.nullable) = false];
|
||||
optional uint64 term = 3 [(gogoproto.nullable) = false];
|
||||
}
|
||||
|
||||
message Snapshot {
|
||||
optional bytes data = 1;
|
||||
optional SnapshotMetadata metadata = 2 [(gogoproto.nullable) = false];
|
||||
}
|
||||
|
||||
enum MessageType {
|
||||
MsgHup = 0;
|
||||
MsgBeat = 1;
|
||||
MsgProp = 2;
|
||||
MsgApp = 3;
|
||||
MsgAppResp = 4;
|
||||
MsgVote = 5;
|
||||
MsgVoteResp = 6;
|
||||
MsgSnap = 7;
|
||||
MsgHeartbeat = 8;
|
||||
MsgHeartbeatResp = 9;
|
||||
MsgUnreachable = 10;
|
||||
MsgSnapStatus = 11;
|
||||
MsgCheckQuorum = 12;
|
||||
MsgTransferLeader = 13;
|
||||
MsgTimeoutNow = 14;
|
||||
MsgReadIndex = 15;
|
||||
MsgReadIndexResp = 16;
|
||||
MsgPreVote = 17;
|
||||
MsgPreVoteResp = 18;
|
||||
}
|
||||
|
||||
message Message {
|
||||
optional MessageType type = 1 [(gogoproto.nullable) = false];
|
||||
optional uint64 to = 2 [(gogoproto.nullable) = false];
|
||||
optional uint64 from = 3 [(gogoproto.nullable) = false];
|
||||
optional uint64 term = 4 [(gogoproto.nullable) = false];
|
||||
optional uint64 logTerm = 5 [(gogoproto.nullable) = false];
|
||||
optional uint64 index = 6 [(gogoproto.nullable) = false];
|
||||
repeated Entry entries = 7 [(gogoproto.nullable) = false];
|
||||
optional uint64 commit = 8 [(gogoproto.nullable) = false];
|
||||
optional Snapshot snapshot = 9 [(gogoproto.nullable) = false];
|
||||
optional bool reject = 10 [(gogoproto.nullable) = false];
|
||||
optional uint64 rejectHint = 11 [(gogoproto.nullable) = false];
|
||||
optional bytes context = 12;
|
||||
}
|
||||
|
||||
message HardState {
|
||||
optional uint64 term = 1 [(gogoproto.nullable) = false];
|
||||
optional uint64 vote = 2 [(gogoproto.nullable) = false];
|
||||
optional uint64 commit = 3 [(gogoproto.nullable) = false];
|
||||
}
|
||||
|
||||
// ConfChangeTransition specifies the behavior of a configuration change with
|
||||
// respect to joint consensus.
|
||||
enum ConfChangeTransition {
|
||||
// Automatically use the simple protocol if possible, otherwise fall back
|
||||
// to ConfChangeJointImplicit. Most applications will want to use this.
|
||||
ConfChangeTransitionAuto = 0;
|
||||
// Use joint consensus unconditionally, and transition out of them
|
||||
// automatically (by proposing a zero configuration change).
|
||||
//
|
||||
// This option is suitable for applications that want to minimize the time
|
||||
// spent in the joint configuration and do not store the joint configuration
|
||||
// in the state machine (outside of InitialState).
|
||||
ConfChangeTransitionJointImplicit = 1;
|
||||
// Use joint consensus and remain in the joint configuration until the
|
||||
// application proposes a no-op configuration change. This is suitable for
|
||||
// applications that want to explicitly control the transitions, for example
|
||||
// to use a custom payload (via the Context field).
|
||||
ConfChangeTransitionJointExplicit = 2;
|
||||
}
|
||||
|
||||
message ConfState {
|
||||
// The voters in the incoming config. (If the configuration is not joint,
|
||||
// then the outgoing config is empty).
|
||||
repeated uint64 voters = 1;
|
||||
// The learners in the incoming config.
|
||||
repeated uint64 learners = 2;
|
||||
// The voters in the outgoing config.
|
||||
repeated uint64 voters_outgoing = 3;
|
||||
// The nodes that will become learners when the outgoing config is removed.
|
||||
// These nodes are necessarily currently in nodes_joint (or they would have
|
||||
// been added to the incoming config right away).
|
||||
repeated uint64 learners_next = 4;
|
||||
// If set, the config is joint and Raft will automatically transition into
|
||||
// the final config (i.e. remove the outgoing config) when this is safe.
|
||||
optional bool auto_leave = 5 [(gogoproto.nullable) = false];
|
||||
}
|
||||
|
||||
enum ConfChangeType {
|
||||
ConfChangeAddNode = 0;
|
||||
ConfChangeRemoveNode = 1;
|
||||
ConfChangeUpdateNode = 2;
|
||||
ConfChangeAddLearnerNode = 3;
|
||||
}
|
||||
|
||||
message ConfChange {
|
||||
optional ConfChangeType type = 2 [(gogoproto.nullable) = false];
|
||||
optional uint64 node_id = 3 [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID" ];
|
||||
optional bytes context = 4;
|
||||
|
||||
// NB: this is used only by etcd to thread through a unique identifier.
|
||||
// Ideally it should really use the Context instead. No counterpart to
|
||||
// this field exists in ConfChangeV2.
|
||||
optional uint64 id = 1 [(gogoproto.nullable) = false, (gogoproto.customname) = "ID" ];
|
||||
}
|
||||
|
||||
// ConfChangeSingle is an individual configuration change operation. Multiple
|
||||
// such operations can be carried out atomically via a ConfChangeV2.
|
||||
message ConfChangeSingle {
|
||||
optional ConfChangeType type = 1 [(gogoproto.nullable) = false];
|
||||
optional uint64 node_id = 2 [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"];
|
||||
}
|
||||
|
||||
// ConfChangeV2 messages initiate configuration changes. They support both the
|
||||
// simple "one at a time" membership change protocol and full Joint Consensus
|
||||
// allowing for arbitrary changes in membership.
|
||||
//
|
||||
// The supplied context is treated as an opaque payload and can be used to
|
||||
// attach an action on the state machine to the application of the config change
|
||||
// proposal. Note that contrary to Joint Consensus as outlined in the Raft
|
||||
// paper[1], configuration changes become active when they are *applied* to the
|
||||
// state machine (not when they are appended to the log).
|
||||
//
|
||||
// The simple protocol can be used whenever only a single change is made.
|
||||
//
|
||||
// Non-simple changes require the use of Joint Consensus, for which two
|
||||
// configuration changes are run. The first configuration change specifies the
|
||||
// desired changes and transitions the Raft group into the joint configuration,
|
||||
// in which quorum requires a majority of both the pre-changes and post-changes
|
||||
// configuration. Joint Consensus avoids entering fragile intermediate
|
||||
// configurations that could compromise survivability. For example, without the
|
||||
// use of Joint Consensus and running across three availability zones with a
|
||||
// replication factor of three, it is not possible to replace a voter without
|
||||
// entering an intermediate configuration that does not survive the outage of
|
||||
// one availability zone.
|
||||
//
|
||||
// The provided ConfChangeTransition specifies how (and whether) Joint Consensus
|
||||
// is used, and assigns the task of leaving the joint configuration either to
|
||||
// Raft or the application. Leaving the joint configuration is accomplished by
|
||||
// proposing a ConfChangeV2 with only and optionally the Context field
|
||||
// populated.
|
||||
//
|
||||
// For details on Raft membership changes, see:
|
||||
//
|
||||
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
|
||||
message ConfChangeV2 {
|
||||
optional ConfChangeTransition transition = 1 [(gogoproto.nullable) = false];
|
||||
repeated ConfChangeSingle changes = 2 [(gogoproto.nullable) = false];
|
||||
optional bytes context = 3;
|
||||
}
|
Reference in New Issue
Block a user