mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-15 13:15:57 +00:00
6a21cc50f0
This patch introduces a means for syscalls matched in seccomp to notify some other task that a particular filter has been triggered. The motivation for this is primarily for use with containers. For example, if a container does an init_module(), we obviously don't want to load this untrusted code, which may be compiled for the wrong version of the kernel anyway. Instead, we could parse the module image, figure out which module the container is trying to load and load it on the host. As another example, containers cannot mount() in general since various filesystems assume a trusted image. However, if an orchestrator knows that e.g. a particular block device has not been exposed to a container for writing, it want to allow the container to mount that block device (that is, handle the mount for it). This patch adds functionality that is already possible via at least two other means that I know about, both of which involve ptrace(): first, one could ptrace attach, and then iterate through syscalls via PTRACE_SYSCALL. Unfortunately this is slow, so a faster version would be to install a filter that does SECCOMP_RET_TRACE, which triggers a PTRACE_EVENT_SECCOMP. Since ptrace allows only one tracer, if the container runtime is that tracer, users inside the container (or outside) trying to debug it will not be able to use ptrace, which is annoying. It also means that older distributions based on Upstart cannot boot inside containers using ptrace, since upstart itself uses ptrace to monitor services while starting. The actual implementation of this is fairly small, although getting the synchronization right was/is slightly complex. Finally, it's worth noting that the classic seccomp TOCTOU of reading memory data from the task still applies here, but can be avoided with careful design of the userspace handler: if the userspace handler reads all of the task memory that is necessary before applying its security policy, the tracee's subsequent memory edits will not be read by the tracer. Signed-off-by: Tycho Andersen <tycho@tycho.ws> CC: Kees Cook <keescook@chromium.org> CC: Andy Lutomirski <luto@amacapital.net> CC: Oleg Nesterov <oleg@redhat.com> CC: Eric W. Biederman <ebiederm@xmission.com> CC: "Serge E. Hallyn" <serge@hallyn.com> Acked-by: Serge Hallyn <serge@hallyn.com> CC: Christian Brauner <christian@brauner.io> CC: Tyler Hicks <tyhicks@canonical.com> CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> Signed-off-by: Kees Cook <keescook@chromium.org>
116 lines
2.9 KiB
C
116 lines
2.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_SECCOMP_H
|
|
#define _LINUX_SECCOMP_H
|
|
|
|
#include <uapi/linux/seccomp.h>
|
|
|
|
#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \
|
|
SECCOMP_FILTER_FLAG_LOG | \
|
|
SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
|
|
SECCOMP_FILTER_FLAG_NEW_LISTENER)
|
|
|
|
#ifdef CONFIG_SECCOMP
|
|
|
|
#include <linux/thread_info.h>
|
|
#include <asm/seccomp.h>
|
|
|
|
struct seccomp_filter;
|
|
/**
|
|
* struct seccomp - the state of a seccomp'ed process
|
|
*
|
|
* @mode: indicates one of the valid values above for controlled
|
|
* system calls available to a process.
|
|
* @filter: must always point to a valid seccomp-filter or NULL as it is
|
|
* accessed without locking during system call entry.
|
|
*
|
|
* @filter must only be accessed from the context of current as there
|
|
* is no read locking.
|
|
*/
|
|
struct seccomp {
|
|
int mode;
|
|
struct seccomp_filter *filter;
|
|
};
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
|
|
extern int __secure_computing(const struct seccomp_data *sd);
|
|
static inline int secure_computing(const struct seccomp_data *sd)
|
|
{
|
|
if (unlikely(test_thread_flag(TIF_SECCOMP)))
|
|
return __secure_computing(sd);
|
|
return 0;
|
|
}
|
|
#else
|
|
extern void secure_computing_strict(int this_syscall);
|
|
#endif
|
|
|
|
extern long prctl_get_seccomp(void);
|
|
extern long prctl_set_seccomp(unsigned long, void __user *);
|
|
|
|
static inline int seccomp_mode(struct seccomp *s)
|
|
{
|
|
return s->mode;
|
|
}
|
|
|
|
#else /* CONFIG_SECCOMP */
|
|
|
|
#include <linux/errno.h>
|
|
|
|
struct seccomp { };
|
|
struct seccomp_filter { };
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
|
|
static inline int secure_computing(struct seccomp_data *sd) { return 0; }
|
|
#else
|
|
static inline void secure_computing_strict(int this_syscall) { return; }
|
|
#endif
|
|
|
|
static inline long prctl_get_seccomp(void)
|
|
{
|
|
return -EINVAL;
|
|
}
|
|
|
|
static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
|
|
{
|
|
return -EINVAL;
|
|
}
|
|
|
|
static inline int seccomp_mode(struct seccomp *s)
|
|
{
|
|
return SECCOMP_MODE_DISABLED;
|
|
}
|
|
#endif /* CONFIG_SECCOMP */
|
|
|
|
#ifdef CONFIG_SECCOMP_FILTER
|
|
extern void put_seccomp_filter(struct task_struct *tsk);
|
|
extern void get_seccomp_filter(struct task_struct *tsk);
|
|
#else /* CONFIG_SECCOMP_FILTER */
|
|
static inline void put_seccomp_filter(struct task_struct *tsk)
|
|
{
|
|
return;
|
|
}
|
|
static inline void get_seccomp_filter(struct task_struct *tsk)
|
|
{
|
|
return;
|
|
}
|
|
#endif /* CONFIG_SECCOMP_FILTER */
|
|
|
|
#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
|
|
extern long seccomp_get_filter(struct task_struct *task,
|
|
unsigned long filter_off, void __user *data);
|
|
extern long seccomp_get_metadata(struct task_struct *task,
|
|
unsigned long filter_off, void __user *data);
|
|
#else
|
|
static inline long seccomp_get_filter(struct task_struct *task,
|
|
unsigned long n, void __user *data)
|
|
{
|
|
return -EINVAL;
|
|
}
|
|
static inline long seccomp_get_metadata(struct task_struct *task,
|
|
unsigned long filter_off,
|
|
void __user *data)
|
|
{
|
|
return -EINVAL;
|
|
}
|
|
#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
|
|
#endif /* _LINUX_SECCOMP_H */
|