From 4847c0eb663ab431b56cd82c9c2627967f09f2ef Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 15 Aug 2022 22:44:01 -0400 Subject: [PATCH 1/7] lsm: clean up redundant NULL pointer check The implements of {ip,tcp,udp,dccp,sctp,ipv6}_hdr(skb) guarantee that they will never return NULL, and elsewhere users don't do the check as well, so remove the check here. Signed-off-by: Xiu Jianfeng [PM: subject line tweaks] Signed-off-by: Paul Moore --- security/lsm_audit.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 78a278f28e49..75cc3f8d2a42 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -44,9 +44,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb, struct iphdr *ih; ih = ip_hdr(skb); - if (ih == NULL) - return -EINVAL; - ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; @@ -59,8 +56,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb, switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr *th = tcp_hdr(skb); - if (th == NULL) - break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; @@ -68,8 +63,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb, } case IPPROTO_UDP: { struct udphdr *uh = udp_hdr(skb); - if (uh == NULL) - break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; @@ -77,8 +70,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb, } case IPPROTO_DCCP: { struct dccp_hdr *dh = dccp_hdr(skb); - if (dh == NULL) - break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; @@ -86,8 +77,7 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb, } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); - if (sh == NULL) - break; + ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; @@ -115,8 +105,6 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb, __be16 frag_off; ip6 = ipv6_hdr(skb); - if (ip6 == NULL) - return -EINVAL; ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; /* IPv6 can have several extension header before the Transport header From 7cd4c5c2101cb092db00f61f69d24380cf7a0ee8 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 15 Aug 2022 11:20:25 -0500 Subject: [PATCH 2/7] security, lsm: Introduce security_create_user_ns() User namespaces are an effective tool to allow programs to run with permission without requiring the need for a program to run as root. User namespaces may also be used as a sandboxing technique. However, attackers sometimes leverage user namespaces as an initial attack vector to perform some exploit. [1,2,3] While it is not the unprivileged user namespace functionality, which causes the kernel to be exploitable, users/administrators might want to more granularly limit or at least monitor how various processes use this functionality, while vulnerable kernel subsystems are being patched. Preventing user namespace already creation comes in a few of forms in order of granularity: 1. /proc/sys/user/max_user_namespaces sysctl 2. Distro specific patch(es) 3. CONFIG_USER_NS To block a task based on its attributes, the LSM hook cred_prepare is a decent candidate for use because it provides more granular control, and it is called before create_user_ns(): cred = prepare_creds() security_prepare_creds() call_int_hook(cred_prepare, ... if (cred) create_user_ns(cred) Since security_prepare_creds() is meant for LSMs to copy and prepare credentials, access control is an unintended use of the hook. [4] Further, security_prepare_creds() will always return a ENOMEM if the hook returns any non-zero error code. This hook also does not handle the clone3 case which requires us to access a user space pointer to know if we're in the CLONE_NEW_USER call path which may be subject to a TOCTTOU attack. Lastly, cred_prepare is called in many call paths, and a targeted hook further limits the frequency of calls which is a beneficial outcome. Therefore introduce a new function security_create_user_ns() with an accompanying userns_create LSM hook. With the new userns_create hook, users will have more control over the observability and access control over user namespace creation. Users should expect that normal operation of user namespaces will behave as usual, and only be impacted when controls are implemented by users or administrators. This hook takes the prepared creds for LSM authors to write policy against. On success, the new namespace is applied to credentials, otherwise an error is returned. Links: 1. https://nvd.nist.gov/vuln/detail/CVE-2022-0492 2. https://nvd.nist.gov/vuln/detail/CVE-2022-25636 3. https://nvd.nist.gov/vuln/detail/CVE-2022-34918 4. https://lore.kernel.org/all/1c4b1c0d-12f6-6e9e-a6a3-cdce7418110c@schaufler-ca.com/ Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: KP Singh Signed-off-by: Frederick Lawler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 4 ++++ include/linux/security.h | 6 ++++++ kernel/user_namespace.c | 5 +++++ security/security.c | 5 +++++ 5 files changed, 21 insertions(+) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 806448173033..aa7272e83626 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -224,6 +224,7 @@ LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p, struct inode *inode) +LSM_HOOK(int, 0, userns_create, const struct cred *cred) LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag) LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp, u32 *secid) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 84a0d7e02176..2e11a2a22ed1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -806,6 +806,10 @@ * security attributes, e.g. for /proc/pid inodes. * @p contains the task_struct for the task. * @inode contains the inode structure for the inode. + * @userns_create: + * Check permission prior to creating a new user namespace. + * @cred points to prepared creds. + * Return 0 if successful, otherwise < 0 error code. * * Security hooks for Netlink messaging. * diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..767802fe9bfa 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -437,6 +437,7 @@ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); void security_task_to_inode(struct task_struct *p, struct inode *inode); +int security_create_user_ns(const struct cred *cred); int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag); void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid); int security_msg_msg_alloc(struct msg_msg *msg); @@ -1194,6 +1195,11 @@ static inline int security_task_prctl(int option, unsigned long arg2, static inline void security_task_to_inode(struct task_struct *p, struct inode *inode) { } +static inline int security_create_user_ns(const struct cred *cred) +{ + return 0; +} + static inline int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5481ba44a8d6..3f464bbda0e9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -113,6 +114,10 @@ int create_user_ns(struct cred *new) !kgid_has_mapping(parent_ns, group)) goto fail_dec; + ret = security_create_user_ns(new); + if (ret < 0) + goto fail_dec; + ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) diff --git a/security/security.c b/security/security.c index 14d30fec8a00..1e60c4b570ec 100644 --- a/security/security.c +++ b/security/security.c @@ -1909,6 +1909,11 @@ void security_task_to_inode(struct task_struct *p, struct inode *inode) call_void_hook(task_to_inode, p, inode); } +int security_create_user_ns(const struct cred *cred) +{ + return call_int_hook(userns_create, 0, cred); +} + int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return call_int_hook(ipc_permission, 0, ipcp, flag); From 401e64b3a4af4c7a2f6a00337232a3cf0bb757ed Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 15 Aug 2022 11:20:26 -0500 Subject: [PATCH 3/7] bpf-lsm: Make bpf_lsm_userns_create() sleepable Users may want to audit calls to security_create_user_ns() and access user space memory. Also create_user_ns() runs without pagefault_disabled(). Therefore, make bpf_lsm_userns_create() sleepable for mandatory access control policies. Acked-by: Alexei Starovoitov Acked-by: Christian Brauner (Microsoft) Acked-by: KP Singh Signed-off-by: Frederick Lawler Signed-off-by: Paul Moore --- kernel/bpf/bpf_lsm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fa71d58b7ded..761998fda762 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -335,6 +335,7 @@ BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) +BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) bool bpf_lsm_is_sleepable_hook(u32 btf_id) From d5810139cca39cf2854728b465f8bada4a445302 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 15 Aug 2022 11:20:27 -0500 Subject: [PATCH 4/7] selftests/bpf: Add tests verifying bpf lsm userns_create hook The LSM hook userns_create was introduced to provide LSM's an opportunity to block or allow unprivileged user namespace creation. This test serves two purposes: it provides a test eBPF implementation, and tests the hook successfully blocks or allows user namespace creation. This tests 3 cases: 1. Unattached bpf program does not block unpriv user namespace creation. 2. Attached bpf program allows user namespace creation given CAP_SYS_ADMIN privileges. 3. Attached bpf program denies user namespace creation for a user without CAP_SYS_ADMIN. Acked-by: KP Singh Signed-off-by: Frederick Lawler Signed-off-by: Paul Moore --- .../selftests/bpf/prog_tests/deny_namespace.c | 102 ++++++++++++++++++ .../selftests/bpf/progs/test_deny_namespace.c | 33 ++++++ 2 files changed, 135 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/deny_namespace.c create mode 100644 tools/testing/selftests/bpf/progs/test_deny_namespace.c diff --git a/tools/testing/selftests/bpf/prog_tests/deny_namespace.c b/tools/testing/selftests/bpf/prog_tests/deny_namespace.c new file mode 100644 index 000000000000..1bc6241b755b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/deny_namespace.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include "test_deny_namespace.skel.h" +#include +#include "cap_helpers.h" +#include + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +/* negative return value -> some internal error + * positive return value -> userns creation failed + * 0 -> userns creation succeeded + */ +static int create_user_ns(void) +{ + pid_t pid; + + pid = fork(); + if (pid < 0) + return -1; + + if (pid == 0) { + if (unshare(CLONE_NEWUSER)) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + return wait_for_pid(pid); +} + +static void test_userns_create_bpf(void) +{ + __u32 cap_mask = 1ULL << CAP_SYS_ADMIN; + __u64 old_caps = 0; + + cap_enable_effective(cap_mask, &old_caps); + + ASSERT_OK(create_user_ns(), "priv new user ns"); + + cap_disable_effective(cap_mask, &old_caps); + + ASSERT_EQ(create_user_ns(), EPERM, "unpriv new user ns"); + + if (cap_mask & old_caps) + cap_enable_effective(cap_mask, NULL); +} + +static void test_unpriv_userns_create_no_bpf(void) +{ + __u32 cap_mask = 1ULL << CAP_SYS_ADMIN; + __u64 old_caps = 0; + + cap_disable_effective(cap_mask, &old_caps); + + ASSERT_OK(create_user_ns(), "no-bpf unpriv new user ns"); + + if (cap_mask & old_caps) + cap_enable_effective(cap_mask, NULL); +} + +void test_deny_namespace(void) +{ + struct test_deny_namespace *skel = NULL; + int err; + + if (test__start_subtest("unpriv_userns_create_no_bpf")) + test_unpriv_userns_create_no_bpf(); + + skel = test_deny_namespace__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel load")) + goto close_prog; + + err = test_deny_namespace__attach(skel); + if (!ASSERT_OK(err, "attach")) + goto close_prog; + + if (test__start_subtest("userns_create_bpf")) + test_userns_create_bpf(); + + test_deny_namespace__detach(skel); + +close_prog: + test_deny_namespace__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_deny_namespace.c b/tools/testing/selftests/bpf/progs/test_deny_namespace.c new file mode 100644 index 000000000000..09ad5a4ebd1f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_deny_namespace.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +struct kernel_cap_struct { + __u32 cap[_LINUX_CAPABILITY_U32S_3]; +} __attribute__((preserve_access_index)); + +struct cred { + struct kernel_cap_struct cap_effective; +} __attribute__((preserve_access_index)); + +char _license[] SEC("license") = "GPL"; + +SEC("lsm.s/userns_create") +int BPF_PROG(test_userns_create, const struct cred *cred, int ret) +{ + struct kernel_cap_struct caps = cred->cap_effective; + int cap_index = CAP_TO_INDEX(CAP_SYS_ADMIN); + __u32 cap_mask = CAP_TO_MASK(CAP_SYS_ADMIN); + + if (ret) + return 0; + + ret = -EPERM; + if (caps.cap[cap_index] & cap_mask) + return 0; + + return -EPERM; +} From ed5d44d42c95e8a13bb54e614d2269c8740667f9 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Mon, 15 Aug 2022 11:20:28 -0500 Subject: [PATCH 5/7] selinux: Implement userns_create hook Unprivileged user namespace creation is an intended feature to enable sandboxing, however this feature is often used to as an initial step to perform a privilege escalation attack. This patch implements a new user_namespace { create } access control permission to restrict which domains allow or deny user namespace creation. This is necessary for system administrators to quickly protect their systems while waiting for vulnerability patches to be applied. This permission can be used in the following way: allow domA_t domA_t : user_namespace { create }; Signed-off-by: Frederick Lawler Signed-off-by: Paul Moore --- security/selinux/hooks.c | 9 +++++++++ security/selinux/include/classmap.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 79573504783b..b9f1078450b3 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4221,6 +4221,14 @@ static void selinux_task_to_inode(struct task_struct *p, spin_unlock(&isec->lock); } +static int selinux_userns_create(const struct cred *cred) +{ + u32 sid = current_sid(); + + return avc_has_perm(&selinux_state, sid, sid, SECCLASS_USER_NAMESPACE, + USER_NAMESPACE__CREATE, NULL); +} + /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv4(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) @@ -7111,6 +7119,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(task_movememory, selinux_task_movememory), LSM_HOOK_INIT(task_kill, selinux_task_kill), LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode), + LSM_HOOK_INIT(userns_create, selinux_userns_create), LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission), LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid), diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index ff757ae5f253..0bff55bb9cde 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -254,6 +254,8 @@ const struct security_class_mapping secclass_map[] = { { COMMON_FILE_PERMS, NULL } }, { "io_uring", { "override_creds", "sqpoll", NULL } }, + { "user_namespace", + { "create", NULL } }, { NULL } }; From abec3d015fdfb7c63105c7e1c956188bf381aa55 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 8 Jul 2022 11:34:51 +0200 Subject: [PATCH 6/7] userfaultfd: open userfaultfds with O_RDONLY Since userfaultfd doesn't implement a write operation, it is more appropriate to open it read-only. When userfaultfds are opened read-write like it is now, and such fd is passed from one process to another, SELinux will check both read and write permissions for the target process, even though it can't actually do any write operation on the fd later. Inspired by the following bug report, which has hit the SELinux scenario described above: https://bugzilla.redhat.com/show_bug.cgi?id=1974559 Reported-by: Robert O'Callahan Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") Signed-off-by: Ondrej Mosnacek Acked-by: Peter Xu Acked-by: Christian Brauner (Microsoft) Signed-off-by: Paul Moore --- fs/userfaultfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1c44bf75f916..e6ffe7bc59e3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -991,7 +991,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new, int fd; fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); + O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -2090,7 +2090,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) mmgrab(ctx->mm); fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); From 1e7d8bcbe37d3c63babe628443f13f77970dd06b Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 8 Sep 2022 17:02:22 -0500 Subject: [PATCH 7/7] lockdown: ratelimit denial messages User space can flood the log with lockdown denial messages: [ 662.555584] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7 [ 662.563237] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7 [ 662.571134] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7 [ 662.578668] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7 [ 662.586021] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7 [ 662.593398] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7 Ratelimiting these shouldn't meaningfully degrade the quality of the information logged. Signed-off-by: Nathan Lynch Signed-off-by: Paul Moore --- security/lockdown/lockdown.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 87cbdc64d272..a79b985e917e 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -63,7 +63,7 @@ static int lockdown_is_locked_down(enum lockdown_reason what) if (kernel_locked_down >= what) { if (lockdown_reasons[what]) - pr_notice("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", + pr_notice_ratelimited("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", current->comm, lockdown_reasons[what]); return -EPERM; }