linux-stable/include/linux/user_namespace.h
Serge E. Hallyn db2e718a47 capabilities: require CAP_SETFCAP to map uid 0
cap_setfcap is required to create file capabilities.

Since commit 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"),
a process running as uid 0 but without cap_setfcap is able to work
around this as follows: unshare a new user namespace which maps parent
uid 0 into the child namespace.

While this task will not have new capabilities against the parent
namespace, there is a loophole due to the way namespaced file
capabilities are represented as xattrs.  File capabilities valid in
userns 1 are distinguished from file capabilities valid in userns 2 by
the kuid which underlies uid 0.  Therefore the restricted root process
can unshare a new self-mapping namespace, add a namespaced file
capability onto a file, then use that file capability in the parent
namespace.

To prevent that, do not allow mapping parent uid 0 if the process which
opened the uid_map file does not have CAP_SETFCAP, which is the
capability for setting file capabilities.

As a further wrinkle: a task can unshare its user namespace, then open
its uid_map file itself, and map (only) its own uid.  In this case we do
not have the credential from before unshare, which was potentially more
restricted.  So, when creating a user namespace, we record whether the
creator had CAP_SETFCAP.  Then we can use that during map_write().

With this patch:

1. Unprivileged user can still unshare -Ur

   ubuntu@caps:~$ unshare -Ur
   root@caps:~# logout

2. Root user can still unshare -Ur

   ubuntu@caps:~$ sudo bash
   root@caps:/home/ubuntu# unshare -Ur
   root@caps:/home/ubuntu# logout

3. Root user without CAP_SETFCAP cannot unshare -Ur:

   root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
   root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
   unable to set CAP_SETFCAP effective capability: Operation not permitted
   root@caps:/home/ubuntu# unshare -Ur
   unshare: write failed /proc/self/uid_map: Operation not permitted

Note: an alternative solution would be to allow uid 0 mappings by
processes without CAP_SETFCAP, but to prevent such a namespace from
writing any file capabilities.  This approach can be seen at [1].

Background history: commit 95ebabde382 ("capabilities: Don't allow
writing ambiguous v3 file capabilities") tried to fix the issue by
preventing v3 fscaps to be written to disk when the root uid would map
to the same uid in nested user namespaces.  This led to regressions for
various workloads.  For example, see [2].  Ultimately this is a valid
use-case we have to support meaning we had to revert this change in
3b0c2d3eaa83 ("Revert 95ebabde382c ("capabilities: Don't allow writing
ambiguous v3 file capabilities")").

Link: https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4 [1]
Link: https://github.com/containers/buildah/issues/3071 [2]
Signed-off-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Andrew G. Morgan <morgan@kernel.org>
Tested-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Tested-by: Giuseppe Scrivano <gscrivan@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-04-20 14:28:33 -07:00

189 lines
4.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_USER_NAMESPACE_H
#define _LINUX_USER_NAMESPACE_H
#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/rwsem.h>
#include <linux/sysctl.h>
#include <linux/err.h>
#define UID_GID_MAP_MAX_BASE_EXTENTS 5
#define UID_GID_MAP_MAX_EXTENTS 340
struct uid_gid_extent {
u32 first;
u32 lower_first;
u32 count;
};
struct uid_gid_map { /* 64 bytes -- 1 cache line */
u32 nr_extents;
union {
struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
struct {
struct uid_gid_extent *forward;
struct uid_gid_extent *reverse;
};
};
};
#define USERNS_SETGROUPS_ALLOWED 1UL
#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
struct ucounts;
enum ucount_type {
UCOUNT_USER_NAMESPACES,
UCOUNT_PID_NAMESPACES,
UCOUNT_UTS_NAMESPACES,
UCOUNT_IPC_NAMESPACES,
UCOUNT_NET_NAMESPACES,
UCOUNT_MNT_NAMESPACES,
UCOUNT_CGROUP_NAMESPACES,
UCOUNT_TIME_NAMESPACES,
#ifdef CONFIG_INOTIFY_USER
UCOUNT_INOTIFY_INSTANCES,
UCOUNT_INOTIFY_WATCHES,
#endif
UCOUNT_COUNTS,
};
struct user_namespace {
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
struct uid_gid_map projid_map;
struct user_namespace *parent;
int level;
kuid_t owner;
kgid_t group;
struct ns_common ns;
unsigned long flags;
/* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP
* in its effective capability set at the child ns creation time. */
bool parent_could_setfcap;
#ifdef CONFIG_KEYS
/* List of joinable keyrings in this namespace. Modification access of
* these pointers is controlled by keyring_sem. Once
* user_keyring_register is set, it won't be changed, so it can be
* accessed directly with READ_ONCE().
*/
struct list_head keyring_name_list;
struct key *user_keyring_register;
struct rw_semaphore keyring_sem;
#endif
/* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
struct key *persistent_keyring_register;
#endif
struct work_struct work;
#ifdef CONFIG_SYSCTL
struct ctl_table_set set;
struct ctl_table_header *sysctls;
#endif
struct ucounts *ucounts;
int ucount_max[UCOUNT_COUNTS];
} __randomize_layout;
struct ucounts {
struct hlist_node node;
struct user_namespace *ns;
kuid_t uid;
int count;
atomic_t ucount[UCOUNT_COUNTS];
};
extern struct user_namespace init_user_ns;
bool setup_userns_sysctls(struct user_namespace *ns);
void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
#ifdef CONFIG_USER_NS
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
if (ns)
refcount_inc(&ns->ns.count);
return ns;
}
extern int create_user_ns(struct cred *new);
extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
extern void __put_user_ns(struct user_namespace *ns);
static inline void put_user_ns(struct user_namespace *ns)
{
if (ns && refcount_dec_and_test(&ns->ns.count))
__put_user_ns(ns);
}
struct seq_operations;
extern const struct seq_operations proc_uid_seq_operations;
extern const struct seq_operations proc_gid_seq_operations;
extern const struct seq_operations proc_projid_seq_operations;
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
extern bool in_userns(const struct user_namespace *ancestor,
const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
struct ns_common *ns_get_owner(struct ns_common *ns);
#else
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
return &init_user_ns;
}
static inline int create_user_ns(struct cred *new)
{
return -EINVAL;
}
static inline int unshare_userns(unsigned long unshare_flags,
struct cred **new_cred)
{
if (unshare_flags & CLONE_NEWUSER)
return -EINVAL;
return 0;
}
static inline void put_user_ns(struct user_namespace *ns)
{
}
static inline bool userns_may_setgroups(const struct user_namespace *ns)
{
return true;
}
static inline bool in_userns(const struct user_namespace *ancestor,
const struct user_namespace *child)
{
return true;
}
static inline bool current_in_userns(const struct user_namespace *target_ns)
{
return true;
}
static inline struct ns_common *ns_get_owner(struct ns_common *ns)
{
return ERR_PTR(-EPERM);
}
#endif
#endif /* _LINUX_USER_H */