diff --git a/docs/metrics.md b/docs/metrics.md index 8f19c8d2b..c4f45b0a5 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -22,8 +22,8 @@ curl -X GET http://10.109.65.142:8080/metrics 2>/dev/null | grep csi csi_liveness 1 ``` -Promethues can be deployed through the promethues operator described [here](https://coreos.com/operators/prometheus/docs/latest/user-guides/getting-started.html). -The [service-monitor](../deploy/service-monitor.yaml) will tell promethues how +Prometheus can be deployed through the prometheus operator described [here](https://coreos.com/operators/prometheus/docs/latest/user-guides/getting-started.html). +The [service-monitor](../deploy/service-monitor.yaml) will tell prometheus how to pull metrics out of CSI. Each CSI pod has a service to expose the endpoint to prometheus. By default, rbd diff --git a/examples/rbd/storageclass.yaml b/examples/rbd/storageclass.yaml index 601a6696a..5d2fc5eba 100644 --- a/examples/rbd/storageclass.yaml +++ b/examples/rbd/storageclass.yaml @@ -125,7 +125,7 @@ parameters: # "file": Enable file encryption on the mounted filesystem # "block": Encrypt RBD block device # When unspecified assume type "block". "file" and "block" are - # mutally exclusive. + # mutually exclusive. # encryptionType: "block" # (optional) Use external key management system for encryption passphrases by diff --git a/internal/csi-addons/cephfs/network_fence.go b/internal/csi-addons/cephfs/network_fence.go index ecef1a4e0..f480939ed 100644 --- a/internal/csi-addons/cephfs/network_fence.go +++ b/internal/csi-addons/cephfs/network_fence.go @@ -111,7 +111,7 @@ func (fcs *FenceControllerServer) UnfenceClusterNetwork( return nil, status.Error(codes.Internal, err.Error()) } - err = nwFence.RemoveNetworkFence(ctx) + err = nwFence.RemoveClientEviction(ctx) if err != nil { return nil, status.Errorf(codes.Internal, "failed to unfence CIDR block %q: %s", nwFence.Cidr, err.Error()) } diff --git a/internal/csi-addons/networkfence/fencing.go b/internal/csi-addons/networkfence/fencing.go index d49c621e3..1f1eff72a 100644 --- a/internal/csi-addons/networkfence/fencing.go +++ b/internal/csi-addons/networkfence/fencing.go @@ -52,6 +52,13 @@ type activeClient struct { Inst string `json:"inst"` } +// IPWithNonce represents the structure of an IP with nonce +// as listed by Ceph OSD blocklist. +type IPWithNonce struct { + IP string `json:"ip"` + Nonce string `json:"nonce"` +} + // NewNetworkFence returns a networkFence struct object from the Network fence/unfence request. func NewNetworkFence( ctx context.Context, @@ -256,9 +263,7 @@ func (ac *activeClient) fetchID() (int, error) { } // AddClientEviction blocks access for all the IPs in the CIDR block -// using client eviction. -// blocks the active clients listed in cidr, and the IPs -// for whom there is no active client present too. +// using client eviction, it also blocks the entire CIDR. func (nf *NetworkFence) AddClientEviction(ctx context.Context) error { evictedIPs := make(map[string]bool) // fetch active clients @@ -269,13 +274,15 @@ func (nf *NetworkFence) AddClientEviction(ctx context.Context) error { // iterate through CIDR blocks and check if any active client matches for _, cidr := range nf.Cidr { for _, client := range activeClients { - clientIP, err := client.fetchIP() + var clientIP string + clientIP, err = client.fetchIP() if err != nil { return fmt.Errorf("error fetching client IP: %w", err) } // check if the clientIP is in the CIDR block if isIPInCIDR(ctx, clientIP, cidr) { - clientID, err := client.fetchID() + var clientID int + clientID, err = client.fetchID() if err != nil { return fmt.Errorf("error fetching client ID: %w", err) } @@ -291,25 +298,10 @@ func (nf *NetworkFence) AddClientEviction(ctx context.Context) error { } } - // blocklist the IPs in CIDR without any active clients - for _, cidr := range nf.Cidr { - // check if the CIDR is evicted - // fetch the list of IPs from a CIDR block - hosts, err := getIPRange(cidr) - if err != nil { - return fmt.Errorf("failed to convert CIDR block %s to corresponding IP range: %w", cidr, err) - } - - // add ceph blocklist for each IP in the range mentioned by the CIDR - for _, host := range hosts { - if evictedIPs[host] { - continue - } - err = nf.addCephBlocklist(ctx, host, false) - if err != nil { - return err - } - } + // add the range based blocklist for CIDR + err = nf.AddNetworkFence(ctx) + if err != nil { + return err } return nil @@ -358,7 +350,8 @@ func GetCIDR(cidrs Cidrs) ([]string, error) { } // removeCephBlocklist removes an IP from ceph osd blocklist. -func (nf *NetworkFence) removeCephBlocklist(ctx context.Context, ip string, useRange bool) error { +// the value of nonce is ignored if useRange is true. +func (nf *NetworkFence) removeCephBlocklist(ctx context.Context, ip, nonce string, useRange bool) error { arg := []string{ "--id", nf.cr.ID, "--keyfile=" + nf.cr.KeyFile, @@ -368,7 +361,15 @@ func (nf *NetworkFence) removeCephBlocklist(ctx context.Context, ip string, useR if useRange { cmd = append(cmd, "range") } - cmd = append(cmd, "rm", ip) + + // If nonce is not empty and we are not using + // range based blocks, we need to add the nonce + if nonce != "" && !useRange { + cmd = append(cmd, "rm", fmt.Sprintf("%s:0/%s", ip, nonce)) + } else { + cmd = append(cmd, "rm", ip) + } + cmd = append(cmd, arg...) _, stdErr, err := util.ExecCommand(ctx, "ceph", cmd...) @@ -396,7 +397,7 @@ func (nf *NetworkFence) RemoveNetworkFence(ctx context.Context) error { // try range blocklist cmd, if invalid fallback to // iterating through IP range. if hasBlocklistRangeSupport { - err := nf.removeCephBlocklist(ctx, cidr, true) + err := nf.removeCephBlocklist(ctx, cidr, "", true) if err == nil { continue } @@ -412,7 +413,11 @@ func (nf *NetworkFence) RemoveNetworkFence(ctx context.Context) error { } // remove ceph blocklist for each IP in the range mentioned by the CIDR for _, host := range hosts { - err := nf.removeCephBlocklist(ctx, host, false) + // 0 is used as nonce here to tell ceph + // to remove the blocklist entry matching: :0/0 + // it is same as telling ceph to remove just the IP + // without specifying any port or nonce with it. + err := nf.removeCephBlocklist(ctx, host, "0", false) if err != nil { return err } @@ -421,3 +426,103 @@ func (nf *NetworkFence) RemoveNetworkFence(ctx context.Context) error { return nil } + +func (nf *NetworkFence) RemoveClientEviction(ctx context.Context) error { + // Remove the CIDR block first + err := nf.RemoveNetworkFence(ctx) + if err != nil { + return err + } + + // Get the ceph blocklist + blocklist, err := nf.getCephBlocklist(ctx) + if err != nil { + return err + } + + // For each CIDR block, remove the IPs in the blocklist + // that fall under the CIDR with nonce + for _, cidr := range nf.Cidr { + hosts := nf.parseBlocklistForCIDR(ctx, blocklist, cidr) + log.DebugLog(ctx, "parsed blocklist for CIDR %s: %+v", cidr, hosts) + + for _, host := range hosts { + err := nf.removeCephBlocklist(ctx, host.IP, host.Nonce, false) + if err != nil { + return err + } + } + } + + return nil +} + +// getCephBlocklist fetches the ceph blocklist and returns it as a string. +func (nf *NetworkFence) getCephBlocklist(ctx context.Context) (string, error) { + arg := []string{ + "--id", nf.cr.ID, + "--keyfile=" + nf.cr.KeyFile, + "-m", nf.Monitors, + } + // FIXME: replace the ceph command with go-ceph API in future + cmd := []string{"osd", "blocklist", "ls"} + cmd = append(cmd, arg...) + stdout, stdErr, err := util.ExecCommandWithTimeout(ctx, 2*time.Minute, "ceph", cmd...) + if err != nil { + return "", fmt.Errorf("failed to get the ceph blocklist: %w, stderr: %q", err, stdErr) + } + + return stdout, nil +} + +// parseBlocklistEntry parses a single entry from the ceph blocklist +// and returns the IPWithNonce. +func (nf *NetworkFence) parseBlocklistEntry(entry string) IPWithNonce { + parts := strings.Fields(entry) + if len(parts) == 0 { + return IPWithNonce{} + } + + ipPortNonce := strings.SplitN(parts[0], "/", 2) + if len(ipPortNonce) != 2 { + return IPWithNonce{} + } + + ipPort := ipPortNonce[0] + nonce := ipPortNonce[1] + + lastColonIndex := strings.LastIndex(ipPortNonce[0], ":") + if lastColonIndex == -1 { + return IPWithNonce{} + } + + if len(ipPort) <= lastColonIndex { + return IPWithNonce{} + } + ip := ipPort[:lastColonIndex] + + return IPWithNonce{IP: ip, Nonce: nonce} +} + +// parseBlocklistForCIDR scans the blocklist for the given CIDR and returns +// the list of IPs that lie within the CIDR along with their nonce. +func (nf *NetworkFence) parseBlocklistForCIDR(ctx context.Context, blocklist, cidr string) []IPWithNonce { + blocklistEntries := strings.Split(blocklist, "\n") + + matchingHosts := make([]IPWithNonce, 0) + for _, entry := range blocklistEntries { + entry = strings.TrimSpace(entry) + + // Skip unrelated ranged blocks and invalid entries + if strings.Contains(entry, "cidr") || !strings.Contains(entry, "/") { + continue + } + + blockedHost := nf.parseBlocklistEntry(entry) + if isIPInCIDR(ctx, blockedHost.IP, cidr) { + matchingHosts = append(matchingHosts, blockedHost) + } + } + + return matchingHosts +} diff --git a/internal/csi-addons/networkfence/fencing_test.go b/internal/csi-addons/networkfence/fencing_test.go index f7bfa797a..4bd7636ac 100644 --- a/internal/csi-addons/networkfence/fencing_test.go +++ b/internal/csi-addons/networkfence/fencing_test.go @@ -17,6 +17,7 @@ limitations under the License. package networkfence import ( + "context" "testing" "github.com/stretchr/testify/require" @@ -138,3 +139,121 @@ func TestFetchID(t *testing.T) { }) } } + +func TestParseBlocklistEntry(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input string + expected IPWithNonce + }{ + { + name: "Valid IP and nonce", + input: "192.168.1.1:6789/abcdef123456", + expected: IPWithNonce{IP: "192.168.1.1", Nonce: "abcdef123456"}, + }, + { + name: "IPv6 address with full notation", + input: "2001:0db8:0000:0000:0000:8a2e:0370:7334:6789/abc123", + expected: IPWithNonce{IP: "2001:0db8:0000:0000:0000:8a2e:0370:7334", Nonce: "abc123"}, + }, + { + name: "IPv6 address with compressed zeros", + input: "2001:db8::1428:57ab:6789/def456", + expected: IPWithNonce{IP: "2001:db8::1428:57ab", Nonce: "def456"}, + }, + { + name: "IPv6 loopback address", + input: "::1:6789/ghi789", + expected: IPWithNonce{IP: "::1", Nonce: "ghi789"}, + }, + { + name: "IPv6 address with IPv4 mapping", + input: "::ffff:192.0.2.128:6789/jkl012", + expected: IPWithNonce{IP: "::ffff:192.0.2.128", Nonce: "jkl012"}, + }, + { + name: "IP without port", + input: "10.0.0.1/nonce123", + expected: IPWithNonce{}, + }, + { + name: "Extra whitespace", + input: " 172.16.0.1:1234/abc123 extra info ", + expected: IPWithNonce{IP: "172.16.0.1", Nonce: "abc123"}, + }, + } + + nf := &NetworkFence{} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + result := nf.parseBlocklistEntry(tt.input) + require.Equal(t, tt.expected, result) + }) + } +} + +func TestParseBlocklistForCIDR(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + blocklist string + cidr string + expected []IPWithNonce + }{ + { + name: "Single IPv4 in CIDR", + blocklist: `192.168.1.1:0/1234567 expires 2023-07-01 10:00:00.000000 +listed 1 entries`, + cidr: "192.168.1.0/24", + expected: []IPWithNonce{{IP: "192.168.1.1", Nonce: "1234567"}}, + }, + { + name: "Multiple IPv4 in CIDR", + blocklist: `192.168.1.1:0/1234567 expires 2023-07-01 10:00:00.000000 +192.168.1.2:0/7654321 expires 2023-07-01 11:00:00.000000 +192.168.2.1:0/abcdefg expires 2023-07-01 12:00:00.000000 +listed 3 entries`, + cidr: "192.168.1.0/24", + expected: []IPWithNonce{ + {IP: "192.168.1.1", Nonce: "1234567"}, + {IP: "192.168.1.2", Nonce: "7654321"}, + }, + }, + { + name: "IPv6 in CIDR", + blocklist: `2001:db8::1:0/fedcba expires 2023-07-01 10:00:00.000000 +2001:db8::2:0/abcdef expires 2023-07-01 11:00:00.000000 +listed 2 entries`, + cidr: "2001:db8::/64", + expected: []IPWithNonce{{IP: "2001:db8::1", Nonce: "fedcba"}, {IP: "2001:db8::2", Nonce: "abcdef"}}, + }, + { + name: "Empty blocklist", + blocklist: `listed 0 entries`, + cidr: "192.168.1.0/24", + expected: []IPWithNonce{}, + }, + { + name: "No matching IPs", + blocklist: `10.0.0.1:0/1234567 expires 2023-07-01 10:00:00.000000 +listed 1 entries`, + cidr: "192.168.1.0/24", + expected: []IPWithNonce{}, + }, + } + + nf := &NetworkFence{} + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + result := nf.parseBlocklistForCIDR(context.TODO(), tc.blocklist, tc.cidr) + require.Equal(t, tc.expected, result) + }) + } +} diff --git a/internal/health-checker/checker.go b/internal/health-checker/checker.go index 5eef779b5..14322b2d3 100644 --- a/internal/health-checker/checker.go +++ b/internal/health-checker/checker.go @@ -37,7 +37,7 @@ type checker struct { // timeout contains the delay (interval + timeout) timeout time.Duration - // mutex protects against concurrent access to healty, err and + // mutex protects against concurrent access to healthy, err and // lastUpdate mutex *sync.RWMutex diff --git a/internal/util/crushlocation_test.go b/internal/util/crushlocation_test.go index 4eced910c..4567de351 100644 --- a/internal/util/crushlocation_test.go +++ b/internal/util/crushlocation_test.go @@ -60,7 +60,7 @@ func Test_getCrushLocationMap(t *testing.T) { want: map[string]string{"zone": "zone1"}, }, { - name: "multuple matching crushlocation and node labels", + name: "multiple matching crushlocation and node labels", args: input{ crushLocationLabels: "topology.io/zone,topology.io/rack", nodeLabels: map[string]string{