mirror of
https://github.com/ceph/ceph-csi.git
synced 2025-01-30 16:49:29 +00:00
441 lines
15 KiB
Go
441 lines
15 KiB
Go
|
//go:build linux
|
||
|
|
||
|
// Copyright (C) 2024 SUSE LLC. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
package securejoin
|
||
|
|
||
|
import (
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"os"
|
||
|
"runtime"
|
||
|
"strconv"
|
||
|
"sync"
|
||
|
|
||
|
"golang.org/x/sys/unix"
|
||
|
)
|
||
|
|
||
|
func fstat(f *os.File) (unix.Stat_t, error) {
|
||
|
var stat unix.Stat_t
|
||
|
if err := unix.Fstat(int(f.Fd()), &stat); err != nil {
|
||
|
return stat, &os.PathError{Op: "fstat", Path: f.Name(), Err: err}
|
||
|
}
|
||
|
return stat, nil
|
||
|
}
|
||
|
|
||
|
func fstatfs(f *os.File) (unix.Statfs_t, error) {
|
||
|
var statfs unix.Statfs_t
|
||
|
if err := unix.Fstatfs(int(f.Fd()), &statfs); err != nil {
|
||
|
return statfs, &os.PathError{Op: "fstatfs", Path: f.Name(), Err: err}
|
||
|
}
|
||
|
return statfs, nil
|
||
|
}
|
||
|
|
||
|
// The kernel guarantees that the root inode of a procfs mount has an
|
||
|
// f_type of PROC_SUPER_MAGIC and st_ino of PROC_ROOT_INO.
|
||
|
const (
|
||
|
procSuperMagic = 0x9fa0 // PROC_SUPER_MAGIC
|
||
|
procRootIno = 1 // PROC_ROOT_INO
|
||
|
)
|
||
|
|
||
|
func verifyProcRoot(procRoot *os.File) error {
|
||
|
if statfs, err := fstatfs(procRoot); err != nil {
|
||
|
return err
|
||
|
} else if statfs.Type != procSuperMagic {
|
||
|
return fmt.Errorf("%w: incorrect procfs root filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
|
||
|
}
|
||
|
if stat, err := fstat(procRoot); err != nil {
|
||
|
return err
|
||
|
} else if stat.Ino != procRootIno {
|
||
|
return fmt.Errorf("%w: incorrect procfs root inode number %d", errUnsafeProcfs, stat.Ino)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
var hasNewMountApi = sync.OnceValue(func() bool {
|
||
|
// All of the pieces of the new mount API we use (fsopen, fsconfig,
|
||
|
// fsmount, open_tree) were added together in Linux 5.1[1,2], so we can
|
||
|
// just check for one of the syscalls and the others should also be
|
||
|
// available.
|
||
|
//
|
||
|
// Just try to use open_tree(2) to open a file without OPEN_TREE_CLONE.
|
||
|
// This is equivalent to openat(2), but tells us if open_tree is
|
||
|
// available (and thus all of the other basic new mount API syscalls).
|
||
|
// open_tree(2) is most light-weight syscall to test here.
|
||
|
//
|
||
|
// [1]: merge commit 400913252d09
|
||
|
// [2]: <https://lore.kernel.org/lkml/153754740781.17872.7869536526927736855.stgit@warthog.procyon.org.uk/>
|
||
|
fd, err := unix.OpenTree(-int(unix.EBADF), "/", unix.OPEN_TREE_CLOEXEC)
|
||
|
if err != nil {
|
||
|
return false
|
||
|
}
|
||
|
_ = unix.Close(fd)
|
||
|
return true
|
||
|
})
|
||
|
|
||
|
func fsopen(fsName string, flags int) (*os.File, error) {
|
||
|
// Make sure we always set O_CLOEXEC.
|
||
|
flags |= unix.FSOPEN_CLOEXEC
|
||
|
fd, err := unix.Fsopen(fsName, flags)
|
||
|
if err != nil {
|
||
|
return nil, os.NewSyscallError("fsopen "+fsName, err)
|
||
|
}
|
||
|
return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
|
||
|
}
|
||
|
|
||
|
func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
|
||
|
// Make sure we always set O_CLOEXEC.
|
||
|
flags |= unix.FSMOUNT_CLOEXEC
|
||
|
fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
|
||
|
if err != nil {
|
||
|
return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
|
||
|
}
|
||
|
return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
|
||
|
}
|
||
|
|
||
|
func newPrivateProcMount() (*os.File, error) {
|
||
|
procfsCtx, err := fsopen("proc", unix.FSOPEN_CLOEXEC)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
defer procfsCtx.Close()
|
||
|
|
||
|
// Try to configure hidepid=ptraceable,subset=pid if possible, but ignore errors.
|
||
|
_ = unix.FsconfigSetString(int(procfsCtx.Fd()), "hidepid", "ptraceable")
|
||
|
_ = unix.FsconfigSetString(int(procfsCtx.Fd()), "subset", "pid")
|
||
|
|
||
|
// Get an actual handle.
|
||
|
if err := unix.FsconfigCreate(int(procfsCtx.Fd())); err != nil {
|
||
|
return nil, os.NewSyscallError("fsconfig create procfs", err)
|
||
|
}
|
||
|
return fsmount(procfsCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID)
|
||
|
}
|
||
|
|
||
|
func openTree(dir *os.File, path string, flags uint) (*os.File, error) {
|
||
|
dirFd := -int(unix.EBADF)
|
||
|
dirName := "."
|
||
|
if dir != nil {
|
||
|
dirFd = int(dir.Fd())
|
||
|
dirName = dir.Name()
|
||
|
}
|
||
|
// Make sure we always set O_CLOEXEC.
|
||
|
flags |= unix.OPEN_TREE_CLOEXEC
|
||
|
fd, err := unix.OpenTree(dirFd, path, flags)
|
||
|
if err != nil {
|
||
|
return nil, &os.PathError{Op: "open_tree", Path: path, Err: err}
|
||
|
}
|
||
|
return os.NewFile(uintptr(fd), dirName+"/"+path), nil
|
||
|
}
|
||
|
|
||
|
func clonePrivateProcMount() (_ *os.File, Err error) {
|
||
|
// Try to make a clone without using AT_RECURSIVE if we can. If this works,
|
||
|
// we can be sure there are no over-mounts and so if the root is valid then
|
||
|
// we're golden. Otherwise, we have to deal with over-mounts.
|
||
|
procfsHandle, err := openTree(nil, "/proc", unix.OPEN_TREE_CLONE)
|
||
|
if err != nil || hookForcePrivateProcRootOpenTreeAtRecursive(procfsHandle) {
|
||
|
procfsHandle, err = openTree(nil, "/proc", unix.OPEN_TREE_CLONE|unix.AT_RECURSIVE)
|
||
|
}
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("creating a detached procfs clone: %w", err)
|
||
|
}
|
||
|
defer func() {
|
||
|
if Err != nil {
|
||
|
_ = procfsHandle.Close()
|
||
|
}
|
||
|
}()
|
||
|
if err := verifyProcRoot(procfsHandle); err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
return procfsHandle, nil
|
||
|
}
|
||
|
|
||
|
func privateProcRoot() (*os.File, error) {
|
||
|
if !hasNewMountApi() || hookForceGetProcRootUnsafe() {
|
||
|
return nil, fmt.Errorf("new mount api: %w", unix.ENOTSUP)
|
||
|
}
|
||
|
// Try to create a new procfs mount from scratch if we can. This ensures we
|
||
|
// can get a procfs mount even if /proc is fake (for whatever reason).
|
||
|
procRoot, err := newPrivateProcMount()
|
||
|
if err != nil || hookForcePrivateProcRootOpenTree(procRoot) {
|
||
|
// Try to clone /proc then...
|
||
|
procRoot, err = clonePrivateProcMount()
|
||
|
}
|
||
|
return procRoot, err
|
||
|
}
|
||
|
|
||
|
func unsafeHostProcRoot() (_ *os.File, Err error) {
|
||
|
procRoot, err := os.OpenFile("/proc", unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
defer func() {
|
||
|
if Err != nil {
|
||
|
_ = procRoot.Close()
|
||
|
}
|
||
|
}()
|
||
|
if err := verifyProcRoot(procRoot); err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
return procRoot, nil
|
||
|
}
|
||
|
|
||
|
func doGetProcRoot() (*os.File, error) {
|
||
|
procRoot, err := privateProcRoot()
|
||
|
if err != nil {
|
||
|
// Fall back to using a /proc handle if making a private mount failed.
|
||
|
// If we have openat2, at least we can avoid some kinds of over-mount
|
||
|
// attacks, but without openat2 there's not much we can do.
|
||
|
procRoot, err = unsafeHostProcRoot()
|
||
|
}
|
||
|
return procRoot, err
|
||
|
}
|
||
|
|
||
|
var getProcRoot = sync.OnceValues(func() (*os.File, error) {
|
||
|
return doGetProcRoot()
|
||
|
})
|
||
|
|
||
|
var hasProcThreadSelf = sync.OnceValue(func() bool {
|
||
|
return unix.Access("/proc/thread-self/", unix.F_OK) == nil
|
||
|
})
|
||
|
|
||
|
var errUnsafeProcfs = errors.New("unsafe procfs detected")
|
||
|
|
||
|
type procThreadSelfCloser func()
|
||
|
|
||
|
// procThreadSelf returns a handle to /proc/thread-self/<subpath> (or an
|
||
|
// equivalent handle on older kernels where /proc/thread-self doesn't exist).
|
||
|
// Once finished with the handle, you must call the returned closer function
|
||
|
// (runtime.UnlockOSThread). You must not pass the returned *os.File to other
|
||
|
// Go threads or use the handle after calling the closer.
|
||
|
//
|
||
|
// This is similar to ProcThreadSelf from runc, but with extra hardening
|
||
|
// applied and using *os.File.
|
||
|
func procThreadSelf(procRoot *os.File, subpath string) (_ *os.File, _ procThreadSelfCloser, Err error) {
|
||
|
// We need to lock our thread until the caller is done with the handle
|
||
|
// because between getting the handle and using it we could get interrupted
|
||
|
// by the Go runtime and hit the case where the underlying thread is
|
||
|
// swapped out and the original thread is killed, resulting in
|
||
|
// pull-your-hair-out-hard-to-debug issues in the caller.
|
||
|
runtime.LockOSThread()
|
||
|
defer func() {
|
||
|
if Err != nil {
|
||
|
runtime.UnlockOSThread()
|
||
|
}
|
||
|
}()
|
||
|
|
||
|
// Figure out what prefix we want to use.
|
||
|
threadSelf := "thread-self/"
|
||
|
if !hasProcThreadSelf() || hookForceProcSelfTask() {
|
||
|
/// Pre-3.17 kernels don't have /proc/thread-self, so do it manually.
|
||
|
threadSelf = "self/task/" + strconv.Itoa(unix.Gettid()) + "/"
|
||
|
if _, err := fstatatFile(procRoot, threadSelf, unix.AT_SYMLINK_NOFOLLOW); err != nil || hookForceProcSelf() {
|
||
|
// In this case, we running in a pid namespace that doesn't match
|
||
|
// the /proc mount we have. This can happen inside runc.
|
||
|
//
|
||
|
// Unfortunately, there is no nice way to get the correct TID to
|
||
|
// use here because of the age of the kernel, so we have to just
|
||
|
// use /proc/self and hope that it works.
|
||
|
threadSelf = "self/"
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Grab the handle.
|
||
|
var (
|
||
|
handle *os.File
|
||
|
err error
|
||
|
)
|
||
|
if hasOpenat2() {
|
||
|
// We prefer being able to use RESOLVE_NO_XDEV if we can, to be
|
||
|
// absolutely sure we are operating on a clean /proc handle that
|
||
|
// doesn't have any cheeky overmounts that could trick us (including
|
||
|
// symlink mounts on top of /proc/thread-self). RESOLVE_BENEATH isn't
|
||
|
// strictly needed, but just use it since we have it.
|
||
|
//
|
||
|
// NOTE: /proc/self is technically a magic-link (the contents of the
|
||
|
// symlink are generated dynamically), but it doesn't use
|
||
|
// nd_jump_link() so RESOLVE_NO_MAGICLINKS allows it.
|
||
|
//
|
||
|
// NOTE: We MUST NOT use RESOLVE_IN_ROOT here, as openat2File uses
|
||
|
// procSelfFdReadlink to clean up the returned f.Name() if we use
|
||
|
// RESOLVE_IN_ROOT (which would lead to an infinite recursion).
|
||
|
handle, err = openat2File(procRoot, threadSelf+subpath, &unix.OpenHow{
|
||
|
Flags: unix.O_PATH | unix.O_NOFOLLOW | unix.O_CLOEXEC,
|
||
|
Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_MAGICLINKS,
|
||
|
})
|
||
|
if err != nil {
|
||
|
return nil, nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
|
||
|
}
|
||
|
} else {
|
||
|
handle, err = openatFile(procRoot, threadSelf+subpath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
|
||
|
if err != nil {
|
||
|
return nil, nil, fmt.Errorf("%w: %w", errUnsafeProcfs, err)
|
||
|
}
|
||
|
defer func() {
|
||
|
if Err != nil {
|
||
|
_ = handle.Close()
|
||
|
}
|
||
|
}()
|
||
|
// We can't detect bind-mounts of different parts of procfs on top of
|
||
|
// /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we
|
||
|
// aren't on the wrong filesystem here.
|
||
|
if statfs, err := fstatfs(handle); err != nil {
|
||
|
return nil, nil, err
|
||
|
} else if statfs.Type != procSuperMagic {
|
||
|
return nil, nil, fmt.Errorf("%w: incorrect /proc/self/fd filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
|
||
|
}
|
||
|
}
|
||
|
return handle, runtime.UnlockOSThread, nil
|
||
|
}
|
||
|
|
||
|
var hasStatxMountId = sync.OnceValue(func() bool {
|
||
|
var (
|
||
|
stx unix.Statx_t
|
||
|
// We don't care which mount ID we get. The kernel will give us the
|
||
|
// unique one if it is supported.
|
||
|
wantStxMask uint32 = unix.STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID
|
||
|
)
|
||
|
err := unix.Statx(-int(unix.EBADF), "/", 0, int(wantStxMask), &stx)
|
||
|
return err == nil && stx.Mask&wantStxMask != 0
|
||
|
})
|
||
|
|
||
|
func getMountId(dir *os.File, path string) (uint64, error) {
|
||
|
// If we don't have statx(STATX_MNT_ID*) support, we can't do anything.
|
||
|
if !hasStatxMountId() {
|
||
|
return 0, nil
|
||
|
}
|
||
|
|
||
|
var (
|
||
|
stx unix.Statx_t
|
||
|
// We don't care which mount ID we get. The kernel will give us the
|
||
|
// unique one if it is supported.
|
||
|
wantStxMask uint32 = unix.STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID
|
||
|
)
|
||
|
|
||
|
err := unix.Statx(int(dir.Fd()), path, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW, int(wantStxMask), &stx)
|
||
|
if stx.Mask&wantStxMask == 0 {
|
||
|
// It's not a kernel limitation, for some reason we couldn't get a
|
||
|
// mount ID. Assume it's some kind of attack.
|
||
|
err = fmt.Errorf("%w: could not get mount id", errUnsafeProcfs)
|
||
|
}
|
||
|
if err != nil {
|
||
|
return 0, &os.PathError{Op: "statx(STATX_MNT_ID_...)", Path: dir.Name() + "/" + path, Err: err}
|
||
|
}
|
||
|
return stx.Mnt_id, nil
|
||
|
}
|
||
|
|
||
|
func checkSymlinkOvermount(procRoot *os.File, dir *os.File, path string) error {
|
||
|
// Get the mntId of our procfs handle.
|
||
|
expectedMountId, err := getMountId(procRoot, "")
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
// Get the mntId of the target magic-link.
|
||
|
gotMountId, err := getMountId(dir, path)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
// As long as the directory mount is alive, even with wrapping mount IDs,
|
||
|
// we would expect to see a different mount ID here. (Of course, if we're
|
||
|
// using unsafeHostProcRoot() then an attaker could change this after we
|
||
|
// did this check.)
|
||
|
if expectedMountId != gotMountId {
|
||
|
return fmt.Errorf("%w: symlink %s/%s has an overmount obscuring the real link (mount ids do not match %d != %d)", errUnsafeProcfs, dir.Name(), path, expectedMountId, gotMountId)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func doRawProcSelfFdReadlink(procRoot *os.File, fd int) (string, error) {
|
||
|
fdPath := fmt.Sprintf("fd/%d", fd)
|
||
|
procFdLink, closer, err := procThreadSelf(procRoot, fdPath)
|
||
|
if err != nil {
|
||
|
return "", fmt.Errorf("get safe /proc/thread-self/%s handle: %w", fdPath, err)
|
||
|
}
|
||
|
defer procFdLink.Close()
|
||
|
defer closer()
|
||
|
|
||
|
// Try to detect if there is a mount on top of the magic-link. Since we use the handle directly
|
||
|
// provide to the closure. If the closure uses the handle directly, this
|
||
|
// should be safe in general (a mount on top of the path afterwards would
|
||
|
// not affect the handle itself) and will definitely be safe if we are
|
||
|
// using privateProcRoot() (at least since Linux 5.12[1], when anonymous
|
||
|
// mount namespaces were completely isolated from external mounts including
|
||
|
// mount propagation events).
|
||
|
//
|
||
|
// [1]: Linux commit ee2e3f50629f ("mount: fix mounting of detached mounts
|
||
|
// onto targets that reside on shared mounts").
|
||
|
if err := checkSymlinkOvermount(procRoot, procFdLink, ""); err != nil {
|
||
|
return "", fmt.Errorf("check safety of /proc/thread-self/fd/%d magiclink: %w", fd, err)
|
||
|
}
|
||
|
|
||
|
// readlinkat implies AT_EMPTY_PATH since Linux 2.6.39. See Linux commit
|
||
|
// 65cfc6722361 ("readlinkat(), fchownat() and fstatat() with empty
|
||
|
// relative pathnames").
|
||
|
return readlinkatFile(procFdLink, "")
|
||
|
}
|
||
|
|
||
|
func rawProcSelfFdReadlink(fd int) (string, error) {
|
||
|
procRoot, err := getProcRoot()
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
return doRawProcSelfFdReadlink(procRoot, fd)
|
||
|
}
|
||
|
|
||
|
func procSelfFdReadlink(f *os.File) (string, error) {
|
||
|
return rawProcSelfFdReadlink(int(f.Fd()))
|
||
|
}
|
||
|
|
||
|
var (
|
||
|
errPossibleBreakout = errors.New("possible breakout detected")
|
||
|
errInvalidDirectory = errors.New("wandered into deleted directory")
|
||
|
errDeletedInode = errors.New("cannot verify path of deleted inode")
|
||
|
)
|
||
|
|
||
|
func isDeadInode(file *os.File) error {
|
||
|
// If the nlink of a file drops to 0, there is an attacker deleting
|
||
|
// directories during our walk, which could result in weird /proc values.
|
||
|
// It's better to error out in this case.
|
||
|
stat, err := fstat(file)
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("check for dead inode: %w", err)
|
||
|
}
|
||
|
if stat.Nlink == 0 {
|
||
|
err := errDeletedInode
|
||
|
if stat.Mode&unix.S_IFMT == unix.S_IFDIR {
|
||
|
err = errInvalidDirectory
|
||
|
}
|
||
|
return fmt.Errorf("%w %q", err, file.Name())
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func checkProcSelfFdPath(path string, file *os.File) error {
|
||
|
if err := isDeadInode(file); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
actualPath, err := procSelfFdReadlink(file)
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("get path of handle: %w", err)
|
||
|
}
|
||
|
if actualPath != path {
|
||
|
return fmt.Errorf("%w: handle path %q doesn't match expected path %q", errPossibleBreakout, actualPath, path)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// Test hooks used in the procfs tests to verify that the fallback logic works.
|
||
|
// See testing_mocks_linux_test.go and procfs_linux_test.go for more details.
|
||
|
var (
|
||
|
hookForcePrivateProcRootOpenTree = hookDummyFile
|
||
|
hookForcePrivateProcRootOpenTreeAtRecursive = hookDummyFile
|
||
|
hookForceGetProcRootUnsafe = hookDummy
|
||
|
|
||
|
hookForceProcSelfTask = hookDummy
|
||
|
hookForceProcSelf = hookDummy
|
||
|
)
|
||
|
|
||
|
func hookDummy() bool { return false }
|
||
|
func hookDummyFile(_ *os.File) bool { return false }
|