mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-07 14:32:23 +00:00
21ca59b365
Enable unprivileged sandboxes to create their own binfmt_misc mounts. This is based on Laurent's work in [1] but has been significantly reworked to fix various issues we identified in earlier versions. While binfmt_misc can currently only be mounted in the initial user namespace, binary types registered in this binfmt_misc instance are available to all sandboxes (either by having them installed in the sandbox or by registering the binary type with the F flag causing the interpreter to be opened right away). So binfmt_misc binary types are already delegated to sandboxes implicitly. However, while a sandbox has access to all registered binary types in binfmt_misc a sandbox cannot currently register its own binary types in binfmt_misc. This has prevented various use-cases some of which were already outlined in [1] but we have a range of issues associated with this (cf. [3]-[5] below which are just a small sample). Extend binfmt_misc to be mountable in non-initial user namespaces. Similar to other filesystems such as nfsd, mqueue, and sunrpc we use keyed superblock management. The key determines whether we need to create a new superblock or can reuse an already existing one. We use the user namespace of the mount as key. This means a new binfmt_misc superblock is created once per user namespace creation. Subsequent mounts of binfmt_misc in the same user namespace will mount the same binfmt_misc instance. We explicitly do not create a new binfmt_misc superblock on every binfmt_misc mount as the semantics for load_misc_binary() line up with the keying model. This also allows us to retrieve the relevant binfmt_misc instance based on the caller's user namespace which can be done in a simple (bounded to 32 levels) loop. Similar to the current binfmt_misc semantics allowing access to the binary types in the initial binfmt_misc instance we do allow sandboxes access to their parent's binfmt_misc mounts if they have not created a separate binfmt_misc instance.
Overall, this will unblock the use-cases mentioned below and in general will also allow to support and harden execution of another architecture's binaries in tight sandboxes. For instance, using the unshare binary it possible to start a chroot of another architecture and configure the binfmt_misc interpreter without being root to run the binaries in this chroot and without requiring the host to modify its binary type handlers. Henning had already posted a few experiments in the cover letter at [1]. But here's an additional example where an unprivileged container registers qemu-user-static binary handlers for various binary types in its separate binfmt_misc mount and is then seamlessly able to start containers with a different architecture without affecting the host: root [lxc monitor] /var/snap/lxd/common/lxd/containers f1 1000000 \_ /sbin/init 1000000 \_ /lib/systemd/systemd-journald 1000000 \_ /lib/systemd/systemd-udevd 1000100 \_ /lib/systemd/systemd-networkd 1000101 \_ /lib/systemd/systemd-resolved 1000000 \_ /usr/sbin/cron -f 1000103 \_ /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-activation --syslog-only 1000000 \_ /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers 1000104 \_ /usr/sbin/rsyslogd -n -iNONE 1000000 \_ /lib/systemd/systemd-logind 1000000 \_ /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220 1000107 \_ dnsmasq --conf-file=/dev/null -u lxc-dnsmasq --strict-order --bind-interfaces --pid-file=/run/lxc/dnsmasq.pid --liste 1000000 \_ [lxc monitor] /var/lib/lxc f1-s390x 1100000 \_ /usr/bin/qemu-s390x-static /sbin/init 1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-journald 1100000 \_ /usr/bin/qemu-s390x-static /usr/sbin/cron -f 1100103 \_ /usr/bin/qemu-s390x-static /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-ac 1100000 \_ /usr/bin/qemu-s390x-static /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers 1100104 \_ 
/usr/bin/qemu-s390x-static /usr/sbin/rsyslogd -n -iNONE 1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-logind 1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220 1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/0 115200,38400,9600 vt220 1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/1 115200,38400,9600 vt220 1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/2 115200,38400,9600 vt220 1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/3 115200,38400,9600 vt220 1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-udevd [1]: https://lore.kernel.org/all/20191216091220.465626-1-laurent@vivier.eu [2]: https://discuss.linuxcontainers.org/t/binfmt-misc-permission-denied [3]: https://discuss.linuxcontainers.org/t/lxd-binfmt-support-for-qemu-static-interpreters [4]: https://discuss.linuxcontainers.org/t/3-1-0-binfmt-support-service-in-unprivileged-guest-requires-write-access-on-hosts-proc-sys-fs-binfmt-misc [5]: https://discuss.linuxcontainers.org/t/qemu-user-static-not-working-4-11 Link: https://lore.kernel.org/r/20191216091220.465626-2-laurent@vivier.eu (origin) Link: https://lore.kernel.org/r/20211028103114.2849140-2-brauner@kernel.org (v1) Cc: Sargun Dhillon <sargun@sargun.me> Cc: Serge Hallyn <serge@hallyn.com> Cc: Jann Horn <jannh@google.com> Cc: Henning Schild <henning.schild@siemens.com> Cc: Andrei Vagin <avagin@gmail.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Laurent Vivier <laurent@vivier.eu> Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Laurent Vivier <laurent@vivier.eu> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Kees Cook <keescook@chromium.org> --- /* v2 */ - Serge Hallyn <serge@hallyn.com>: - Use GFP_KERNEL_ACCOUNT for userspace 
triggered allocations when a new binary type handler is registered. - Christian Brauner <christian.brauner@ubuntu.com>: - Switch authorship to me. I refused to do that earlier even though Laurent said I should do so because I think it's genuinely bad form. But by now I have changed so many things that it'd be unfair to blame Laurent for any potential bugs in here. - Add more comments that explain what's going on. - Rename functions while changing them to better reflect what they are doing to make the code easier to understand. - In the first version when a specific binary type handler was removed either through a write to the entry's file or all binary type handlers were removed by a write to the binfmt_misc mount's status file all cleanup work happened during inode eviction. That includes removal of the relevant entries from the entry list. While that works fine I disliked that model after thinking about it for a bit. Because it means that there was a window where someone has already removed one or all binary handlers but they could still be safely reached from load_misc_binary() when it has managed to take the read_lock() on the entries list while inode eviction was already happening. Again, that's perfectly benign but it's cleaner to remove the binary handler from the list immediately meaning that once the write to the entry's file or the binfmt_misc status file returns the binary type cannot be executed anymore. That gives stronger guarantees to the user.
266 lines
6.3 KiB
C
266 lines
6.3 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* The "user cache".
|
|
*
|
|
* (C) Copyright 1991-2000 Linus Torvalds
|
|
*
|
|
* We have a per-user structure to keep track of how many
|
|
* processes, files etc the user has claimed, in order to be
|
|
* able to have per-user limits for system resources.
|
|
*/
|
|
|
|
#include <linux/init.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/bitops.h>
|
|
#include <linux/key.h>
|
|
#include <linux/sched/user.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/export.h>
|
|
#include <linux/user_namespace.h>
|
|
#include <linux/binfmts.h>
|
|
#include <linux/proc_ns.h>
|
|
|
|
#if IS_ENABLED(CONFIG_BINFMT_MISC)
/*
 * The binfmt_misc instance referenced by init_user_ns. It starts out
 * enabled, with an empty entry list and an unlocked entries_lock.
 */
struct binfmt_misc init_binfmt_misc = {
	.entries	= LIST_HEAD_INIT(init_binfmt_misc.entries),
	.entries_lock	= __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock),
	.enabled	= true,
};
EXPORT_SYMBOL_GPL(init_binfmt_misc);
#endif
|
|
|
|
/*
|
|
* userns count is 1 for root user, 1 for init_uts_ns,
|
|
* and 1 for... ?
|
|
*/
|
|
struct user_namespace init_user_ns = {
|
|
.uid_map = {
|
|
.nr_extents = 1,
|
|
{
|
|
.extent[0] = {
|
|
.first = 0,
|
|
.lower_first = 0,
|
|
.count = 4294967295U,
|
|
},
|
|
},
|
|
},
|
|
.gid_map = {
|
|
.nr_extents = 1,
|
|
{
|
|
.extent[0] = {
|
|
.first = 0,
|
|
.lower_first = 0,
|
|
.count = 4294967295U,
|
|
},
|
|
},
|
|
},
|
|
.projid_map = {
|
|
.nr_extents = 1,
|
|
{
|
|
.extent[0] = {
|
|
.first = 0,
|
|
.lower_first = 0,
|
|
.count = 4294967295U,
|
|
},
|
|
},
|
|
},
|
|
.ns.count = REFCOUNT_INIT(3),
|
|
.owner = GLOBAL_ROOT_UID,
|
|
.group = GLOBAL_ROOT_GID,
|
|
.ns.inum = PROC_USER_INIT_INO,
|
|
#ifdef CONFIG_USER_NS
|
|
.ns.ops = &userns_operations,
|
|
#endif
|
|
.flags = USERNS_INIT_FLAGS,
|
|
#ifdef CONFIG_KEYS
|
|
.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
|
|
.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
|
|
#endif
|
|
#if IS_ENABLED(CONFIG_BINFMT_MISC)
|
|
.binfmt_misc = &init_binfmt_misc,
|
|
#endif
|
|
};
|
|
EXPORT_SYMBOL_GPL(init_user_ns);
|
|
|
|
/*
 * UID task count cache, to get fast user lookup in "alloc_uid"
 * when changing user ID's (ie setuid() and friends).
 *
 * UIDHASH_BITS   - log2 of the number of hash buckets (3 when
 *                  CONFIG_BASE_SMALL is non-zero, 7 otherwise).
 * UIDHASH_SZ     - number of buckets in uidhash_table.
 * UIDHASH_MASK   - mask that reduces a hash value to a bucket index.
 * __uidhashfn()  - bucket index for a raw numeric uid. The argument is
 *                  fully parenthesized so that compound expressions hash
 *                  correctly; note that it is still evaluated twice, so
 *                  it must not have side effects.
 * uidhashentry() - pointer to the bucket head for a kuid_t.
 */

#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 7)
#define UIDHASH_SZ	(1 << UIDHASH_BITS)
#define UIDHASH_MASK	(UIDHASH_SZ - 1)
#define __uidhashfn(uid)	((((uid) >> UIDHASH_BITS) + (uid)) & UIDHASH_MASK)
#define uidhashentry(uid)	(uidhash_table + __uidhashfn((__kuid_val(uid))))
|
|
|
|
static struct kmem_cache *uid_cachep;
|
|
static struct hlist_head uidhash_table[UIDHASH_SZ];
|
|
|
|
/*
 * Lock protecting the uid hash.
 *
 * It is mostly taken from process context, but occasionally also from
 * softirq/tasklet context, when task-structs get RCU-freed, so all
 * locking must be softirq-safe.
 *
 * free_uid() is additionally called with local interrupts disabled.
 * Running local_bh_enable() with local interrupts disabled is an error:
 * softirq callbacks would run, they can unconditionally enable
 * interrupts, and the caller of free_uid() didn't expect that.
 */
static DEFINE_SPINLOCK(uidhash_lock);
|
|
|
|
/* root_user.__count is 1, for init task cred */
|
|
struct user_struct root_user = {
|
|
.__count = REFCOUNT_INIT(1),
|
|
.uid = GLOBAL_ROOT_UID,
|
|
.ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
|
|
};
|
|
|
|
/*
|
|
* These routines must be called with the uidhash spinlock held!
|
|
*/
|
|
static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
|
|
{
|
|
hlist_add_head(&up->uidhash_node, hashent);
|
|
}
|
|
|
|
static void uid_hash_remove(struct user_struct *up)
|
|
{
|
|
hlist_del_init(&up->uidhash_node);
|
|
}
|
|
|
|
/*
 * Search the bucket @hashent for the entry whose uid equals @uid.
 *
 * On a match the entry's __count is incremented and the entry is
 * returned; NULL is returned when the bucket holds no such entry.
 * Must be called with the uidhash spinlock held.
 */
static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
{
	struct user_struct *cur;

	hlist_for_each_entry(cur, hashent, uidhash_node) {
		if (!uid_eq(cur->uid, uid))
			continue;

		/* Hand the caller its own reference on the match. */
		refcount_inc(&cur->__count);
		return cur;
	}

	return NULL;
}
|
|
|
|
/*
 * Set up the epoll_watches per-cpu counter of @up.
 *
 * Returns the result of percpu_counter_init() when CONFIG_EPOLL is
 * defined, and 0 without doing anything otherwise.
 */
static int user_epoll_alloc(struct user_struct *up)
{
	int err = 0;

#ifdef CONFIG_EPOLL
	err = percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL);
#endif
	return err;
}
|
|
|
|
/*
 * Tear down the epoll_watches per-cpu counter of @up.
 * Does nothing when CONFIG_EPOLL is not defined.
 */
static void user_epoll_free(struct user_struct *up)
{
#ifdef CONFIG_EPOLL
	percpu_counter_destroy(&up->epoll_watches);
#endif
}
|
|
|
|
/* IRQs are disabled and uidhash_lock is held upon function entry.
|
|
* IRQ state (as stored in flags) is restored and uidhash_lock released
|
|
* upon function exit.
|
|
*/
|
|
static void free_user(struct user_struct *up, unsigned long flags)
|
|
__releases(&uidhash_lock)
|
|
{
|
|
uid_hash_remove(up);
|
|
spin_unlock_irqrestore(&uidhash_lock, flags);
|
|
user_epoll_free(up);
|
|
kmem_cache_free(uid_cachep, up);
|
|
}
|
|
|
|
/*
|
|
* Locate the user_struct for the passed UID. If found, take a ref on it. The
|
|
* caller must undo that ref with free_uid().
|
|
*
|
|
* If the user_struct could not be found, return NULL.
|
|
*/
|
|
struct user_struct *find_user(kuid_t uid)
|
|
{
|
|
struct user_struct *ret;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&uidhash_lock, flags);
|
|
ret = uid_hash_find(uid, uidhashentry(uid));
|
|
spin_unlock_irqrestore(&uidhash_lock, flags);
|
|
return ret;
|
|
}
|
|
|
|
void free_uid(struct user_struct *up)
|
|
{
|
|
unsigned long flags;
|
|
|
|
if (!up)
|
|
return;
|
|
|
|
if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
|
|
free_user(up, flags);
|
|
}
|
|
EXPORT_SYMBOL_GPL(free_uid);
|
|
|
|
/*
 * Return the user_struct for @uid, creating and hashing a new one if
 * none exists yet.
 *
 * An existing entry is returned with the extra reference taken by
 * uid_hash_find(); a newly created entry is returned with its __count
 * set to 1. Returns NULL if allocating the structure or its epoll
 * counter fails.
 */
struct user_struct *alloc_uid(kuid_t uid)
{
	struct hlist_head *bucket = uidhashentry(uid);
	struct user_struct *found, *fresh;

	/* Fast path: the user is already hashed. */
	spin_lock_irq(&uidhash_lock);
	found = uid_hash_find(uid, bucket);
	spin_unlock_irq(&uidhash_lock);
	if (found)
		return found;

	/* Slow path: build a new entry with the lock dropped. */
	fresh = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
	if (!fresh)
		return NULL;

	fresh->uid = uid;
	refcount_set(&fresh->__count, 1);
	if (user_epoll_alloc(fresh)) {
		kmem_cache_free(uid_cachep, fresh);
		return NULL;
	}
	ratelimit_state_init(&fresh->ratelimit, HZ, 100);
	ratelimit_set_flags(&fresh->ratelimit, RATELIMIT_MSG_ON_RELEASE);

	/*
	 * Somebody else may have added the same user while the lock was
	 * dropped, so look again before inserting. If we lost that race,
	 * discard our copy and use the entry that is already hashed.
	 */
	spin_lock_irq(&uidhash_lock);
	found = uid_hash_find(uid, bucket);
	if (!found) {
		uid_hash_insert(fresh, bucket);
		found = fresh;
	} else {
		user_epoll_free(fresh);
		kmem_cache_free(uid_cachep, fresh);
	}
	spin_unlock_irq(&uidhash_lock);

	return found;
}
|
|
|
|
/*
 * Boot-time setup of the user cache: create the "uid_cache" slab cache,
 * initialize every hash bucket, allocate root_user's epoll counter
 * (panicking on failure) and hash root_user. Always returns 0.
 */
static int __init uid_cache_init(void)
{
	int i;

	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
				       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
				       NULL);

	for (i = 0; i < UIDHASH_SZ; i++)
		INIT_HLIST_HEAD(&uidhash_table[i]);

	if (user_epoll_alloc(&root_user))
		panic("root_user epoll percpu counter alloc failed");

	/* Insert the root user immediately (init already runs as root) */
	spin_lock_irq(&uidhash_lock);
	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
	spin_unlock_irq(&uidhash_lock);

	return 0;
}
subsys_initcall(uid_cache_init);
|