Merge branch 'vfs-6.14.pidfs' into vfs.all

This commit is contained in:
Christian Brauner 2025-01-10 16:17:42 +01:00
commit e206d842e9
No known key found for this signature in database
GPG Key ID: 91C61BC06578DCA2
16 changed files with 1110 additions and 183 deletions

View File

@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root)
return 0;
}
enum handle_to_path_flags {
HANDLE_CHECK_PERMS = (1 << 0),
HANDLE_CHECK_SUBTREE = (1 << 1),
};
struct handle_to_path_ctx {
struct path root;
enum handle_to_path_flags flags;
unsigned int fh_flags;
};
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
{
struct handle_to_path_ctx *ctx = context;
@ -261,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
{
int handle_dwords;
struct vfsmount *mnt = ctx->root.mnt;
struct dentry *dentry;
/* change the handle size to multiple of sizeof(u32) */
handle_dwords = handle->handle_bytes >> 2;
path->dentry = exportfs_decode_fh_raw(mnt,
(struct fid *)handle->f_handle,
handle_dwords, handle->handle_type,
ctx->fh_flags,
vfs_dentry_acceptable, ctx);
if (IS_ERR_OR_NULL(path->dentry)) {
if (path->dentry == ERR_PTR(-ENOMEM))
dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle,
handle_dwords, handle->handle_type,
ctx->fh_flags, vfs_dentry_acceptable,
ctx);
if (IS_ERR_OR_NULL(dentry)) {
if (dentry == ERR_PTR(-ENOMEM))
return -ENOMEM;
return -ESTALE;
}
path->dentry = dentry;
path->mnt = mntget(mnt);
return 0;
}
/*
* Allow relaxed permissions of file handles if the caller has the
* ability to mount the filesystem or create a bind-mount of the
* provided @mountdirfd.
*
* In both cases the caller may be able to get an unobstructed way to
* the encoded file handle. If the caller is only able to create a
* bind-mount we need to verify that there are no locked mounts on top
* of it that could prevent us from getting to the encoded file.
*
* In principle, locked mounts can prevent the caller from mounting the
* filesystem but that only applies to procfs and sysfs neither of which
* support decoding file handles.
*/
static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
unsigned int o_flags)
static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
unsigned int o_flags)
{
struct path *root = &ctx->root;
if (capable(CAP_DAC_READ_SEARCH))
return 0;
/*
* Restrict to O_DIRECTORY to provide a deterministic API that avoids a
* confusing api in the face of disconnected non-dir dentries.
* Allow relaxed permissions of file handles if the caller has
* the ability to mount the filesystem or create a bind-mount of
* the provided @mountdirfd.
*
* In both cases the caller may be able to get an unobstructed
* way to the encoded file handle. If the caller is only able to
* create a bind-mount we need to verify that there are no
* locked mounts on top of it that could prevent us from getting
* to the encoded file.
*
* In principle, locked mounts can prevent the caller from
* mounting the filesystem but that only applies to procfs and
* sysfs neither of which support decoding file handles.
*
* Restrict to O_DIRECTORY to provide a deterministic API that
* avoids a confusing api in the face of disconnected non-dir
* dentries.
*
* There's only one dentry for each directory inode (VFS rule)...
*/
if (!(o_flags & O_DIRECTORY))
return false;
return -EPERM;
if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
ctx->flags = HANDLE_CHECK_PERMS;
@ -314,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
!has_locked_children(real_mount(root->mnt), root->dentry))
ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
else
return false;
return -EPERM;
/* Are we able to override DAC permissions? */
if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
return false;
return -EPERM;
ctx->fh_flags = EXPORT_FH_DIR_ONLY;
return true;
return 0;
}
static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
@ -331,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
struct file_handle f_handle;
struct file_handle *handle = NULL;
struct handle_to_path_ctx ctx = {};
const struct export_operations *eops;
retval = get_path_from_fd(mountdirfd, &ctx.root);
if (retval)
goto out_err;
if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) {
retval = -EPERM;
eops = ctx.root.mnt->mnt_sb->s_export_op;
if (eops && eops->permission)
retval = eops->permission(&ctx, o_flags);
else
retval = may_decode_fh(&ctx, o_flags);
if (retval)
goto out_path;
}
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
retval = -EFAULT;
@ -398,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
int open_flag)
{
long retval = 0;
struct path path;
struct path path __free(path_put) = {};
struct file *file;
int fd;
const struct export_operations *eops;
retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
if (retval)
return retval;
fd = get_unused_fd_flags(open_flag);
if (fd < 0) {
path_put(&path);
/*
 * Reserve the descriptor with the caller's flags, exactly as the
 * get_unused_fd_flags(open_flag) call being replaced did. Hard-coding
 * O_CLOEXEC here would mark every returned fd close-on-exec even when
 * the caller did not pass O_CLOEXEC.
 */
CLASS(get_unused_fd, fd)(open_flag);
if (fd < 0)
return fd;
}
file = file_open_root(&path, "", open_flag, 0);
if (IS_ERR(file)) {
put_unused_fd(fd);
retval = PTR_ERR(file);
} else {
retval = fd;
fd_install(fd, file);
}
path_put(&path);
return retval;
/* Let the filesystem open the decoded path itself if it provides ->open(). */
eops = path.mnt->mnt_sb->s_export_op;
if (eops->open)
file = eops->open(&path, open_flag);
else
file = file_open_root(&path, "", open_flag, 0);
if (IS_ERR(file))
return PTR_ERR(file);
fd_install(fd, file);
return take_fd(fd);
}
/**

View File

@ -673,6 +673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = ctx->magic;
s->s_op = ctx->ops ?: &simple_super_operations;
s->s_export_op = ctx->eops;
s->s_xattr = ctx->xattr;
s->s_time_gran = 1;
root = new_inode(s);

View File

@ -32,6 +32,7 @@
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>
#include <linux/nospec.h>
#include "pnode.h"
@ -2736,8 +2737,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
if (IS_MNT_UNBINDABLE(old))
return mnt;
if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
return mnt;
if (!check_mnt(old)) {
const struct dentry_operations *d_op = old_path->dentry->d_op;
if (d_op != &ns_dentry_operations &&
d_op != &pidfs_dentry_operations)
return mnt;
}
if (!recurse && has_locked_children(old, old_path->dentry))
return mnt;

View File

@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/exportfs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/cgroup.h>
@ -23,6 +24,97 @@
#include "internal.h"
#include "mount.h"
static struct rb_root pidfs_ino_tree = RB_ROOT;
/*
 * Split the 64 bit pidfs identifier into an inode number and an inode
 * generation number.
 *
 * When BITS_PER_LONG is 32 the inode number is the lower 32 bits and the
 * generation number is the upper 32 bits. Otherwise the identifier is used
 * as the inode number unchanged and the generation number is always 0.
 */
static inline unsigned long pidfs_ino(u64 ino)
{
#if BITS_PER_LONG == 32
	return lower_32_bits(ino);
#else
	return ino;
#endif
}

static inline u32 pidfs_gen(u64 ino)
{
#if BITS_PER_LONG == 32
	return upper_32_bits(ino);
#else
	return 0;
#endif
}
/*
 * Ordering callback used when inserting into pidfs_ino_tree: compare two
 * struct pid nodes by their 64 bit pidfs identifier.
 *
 * Returns -1, 0 or 1 when @a sorts before, equal to, or after @b.
 */
static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
{
	const u64 lhs = rb_entry(a, struct pid, pidfs_node)->ino;
	const u64 rhs = rb_entry(b, struct pid, pidfs_node)->ino;

	if (lhs == rhs)
		return 0;
	return lhs < rhs ? -1 : 1;
}
/*
 * pidfs_add_pid - assign a pidfs identifier to @pid and index it
 *
 * Hands out the next value of a function-local 64 bit counter as @pid->ino,
 * clears @pid->stashed and inserts @pid into pidfs_ino_tree inside a
 * pidmap_lock_seq write section so that lockless lookups can detect the
 * concurrent tree update.
 */
void pidfs_add_pid(struct pid *pid)
{
static u64 pidfs_ino_nr = 2;
/*
 * On 64 bit nothing special happens. The 64 bit number assigned
 * to struct pid is the inode number.
 *
 * On 32 bit the 64 bit number assigned to struct pid is split
 * into two 32 bit numbers. The lower 32 bits are used as the
 * inode number and the upper 32 bits are used as the inode
 * generation number.
 *
 * On 32 bit pidfs_ino() will return the lower 32 bits. When
 * pidfs_ino() returns zero a wraparound happened. When a
 * wraparound happens the 64 bit number will be incremented by 2
 * so inode numbering starts at 2 again.
 *
 * On 64 bit comparing two pidfds is as simple as comparing
 * inode numbers.
 *
 * When a wraparound happens on 32 bit multiple pidfds with the
 * same inode number are likely to exist (this isn't a problem
 * since before pidfs pidfds used the anonymous inode, meaning
 * all pidfds had the same inode number). Userspace can
 * reconstruct the 64 bit identifier by retrieving both the
 * inode number and the inode generation number to compare, or
 * use file handles.
 */
if (pidfs_ino(pidfs_ino_nr) == 0)
pidfs_ino_nr += 2;
pid->ino = pidfs_ino_nr;
pid->stashed = NULL;
pidfs_ino_nr++;
/* Publish the node; readers retry their lookup if they raced with this. */
write_seqcount_begin(&pidmap_lock_seq);
rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
write_seqcount_end(&pidmap_lock_seq);
}
/*
 * pidfs_remove_pid - drop @pid from pidfs_ino_tree
 *
 * The erase is wrapped in a pidmap_lock_seq write section, mirroring the
 * insertion in pidfs_add_pid().
 */
void pidfs_remove_pid(struct pid *pid)
{
write_seqcount_begin(&pidmap_lock_seq);
rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
write_seqcount_end(&pidmap_lock_seq);
}
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
@ -190,6 +282,27 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
return 0;
}
/*
 * Return true if @cmd is one of the ioctl commands pidfs knows about:
 * FS_IOC_GETVERSION, PIDFD_GET_INFO or one of the PIDFD_GET_*_NAMESPACE
 * commands. Every other command is rejected.
 */
static bool pidfs_ioctl_valid(unsigned int cmd)
{
	bool known;

	switch (cmd) {
	case FS_IOC_GETVERSION:
	case PIDFD_GET_INFO:
	case PIDFD_GET_CGROUP_NAMESPACE:
	case PIDFD_GET_IPC_NAMESPACE:
	case PIDFD_GET_MNT_NAMESPACE:
	case PIDFD_GET_NET_NAMESPACE:
	case PIDFD_GET_PID_NAMESPACE:
	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
	case PIDFD_GET_TIME_NAMESPACE:
	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
	case PIDFD_GET_USER_NAMESPACE:
	case PIDFD_GET_UTS_NAMESPACE:
		known = true;
		break;
	default:
		known = false;
		break;
	}

	return known;
}
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
@ -198,6 +311,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
if (!pidfs_ioctl_valid(cmd))
return -ENOIOCTLCMD;
if (cmd == FS_IOC_GETVERSION) {
if (!arg)
return -EINVAL;
__u32 __user *argp = (__u32 __user *)arg;
return put_user(file_inode(file)->i_generation, argp);
}
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
@ -318,40 +442,6 @@ struct pid *pidfd_pid(const struct file *file)
static struct vfsmount *pidfs_mnt __ro_after_init;
#if BITS_PER_LONG == 32
/*
* Provide a fallback mechanism for 32-bit systems so processes remain
* reliably comparable by inode number even on those systems.
*/
static DEFINE_IDA(pidfd_inum_ida);
static int pidfs_inum(struct pid *pid, unsigned long *ino)
{
int ret;
ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
UINT_MAX, GFP_ATOMIC);
if (ret < 0)
return -ENOSPC;
*ino = ret;
return 0;
}
static inline void pidfs_free_inum(unsigned long ino)
{
if (ino > 0)
ida_free(&pidfd_inum_ida, ino);
}
#else
static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
{
*ino = pid->ino;
return 0;
}
#define pidfs_free_inum(ino) ((void)(ino))
#endif
/*
* The vfs falls back to simple_setattr() if i_op->setattr() isn't
* implemented. Let's reject it completely until we have a clean
@ -403,7 +493,6 @@ static void pidfs_evict_inode(struct inode *inode)
clear_inode(inode);
put_pid(pid);
pidfs_free_inum(inode->i_ino);
}
static const struct super_operations pidfs_sops = {
@ -421,25 +510,149 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
static const struct dentry_operations pidfs_dentry_operations = {
const struct dentry_operations pidfs_dentry_operations = {
.d_delete = always_delete_dentry,
.d_dname = pidfs_dname,
.d_prune = stashed_dentry_prune,
};
/*
 * Encode a pidfs inode as a file handle.
 *
 * The handle is the 64 bit pidfs identifier of the struct pid stored in
 * @inode->i_private and therefore needs two 32 bit words. *@max_len is
 * always set to 2 on return. If the caller's buffer is smaller than that,
 * nothing is written and FILEID_INVALID is returned; otherwise the
 * identifier is stored in @fh and FILEID_KERNFS is returned. @parent is
 * not used.
 */
static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
			   struct inode *parent)
{
	const struct pid *pid = inode->i_private;
	const int avail = *max_len;

	*max_len = 2;
	if (avail < 2)
		return FILEID_INVALID;

	*(u64 *)fh = pid->ino;
	return FILEID_KERNFS;
}
static int pidfs_ino_find(const void *key, const struct rb_node *node)
{
const u64 pid_ino = *(u64 *)key;
const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
if (pid_ino < pid->ino)
return -1;
if (pid_ino > pid->ino)
return 1;
return 0;
}
/*
 * Find a struct pid based on the 64 bit pidfs identifier @ino.
 *
 * The tree is searched under RCU without taking a lock. A miss is only
 * trusted if pidmap_lock_seq did not change during the search; otherwise
 * the search is repeated. A hit ends the loop immediately.
 *
 * Returns NULL if no struct pid with that identifier is in the tree or if
 * pid_vnr() reports 0 for it. Otherwise returns the struct pid with a
 * reference taken via get_pid().
 */
static struct pid *pidfs_ino_get_pid(u64 ino)
{
struct pid *pid;
struct rb_node *node;
unsigned int seq;
guard(rcu)();
do {
seq = read_seqcount_begin(&pidmap_lock_seq);
node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
if (node)
break;
} while (read_seqcount_retry(&pidmap_lock_seq, seq));
if (!node)
return NULL;
pid = rb_entry(node, struct pid, pidfs_node);
/* Within our pid namespace hierarchy? */
if (pid_vnr(pid) == 0)
return NULL;
return get_pid(pid);
}
/*
 * Decode a pidfs file handle into a dentry.
 *
 * Only FILEID_KERNFS handles of at least two 32 bit words are accepted;
 * anything else yields NULL, as does an identifier for which
 * pidfs_ino_get_pid() finds no struct pid. On success the dentry obtained
 * from path_from_stashed() is returned and the mount reference that came
 * with the path is dropped again. A path_from_stashed() failure is
 * returned as an ERR_PTR().
 */
static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
					 struct fid *fid, int fh_len,
					 int fh_type)
{
	struct path path;
	struct pid *pid;
	int err;

	if (fh_len < 2 || fh_type != FILEID_KERNFS)
		return NULL;

	pid = pidfs_ino_get_pid(*(u64 *)fid);
	if (!pid)
		return NULL;

	err = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
	if (err < 0)
		return ERR_PTR(err);

	/* Only the dentry is handed back to the caller. */
	mntput(path.mnt);
	return path.dentry;
}
/*
 * Make sure that we reject any nonsensical flags that users pass via
 * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
 * PIDFD_NONBLOCK as O_NONBLOCK.
 */
#define VALID_FILE_HANDLE_OPEN_FLAGS \
(O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
/*
 * Custom ->permission() hook for decoding pidfs file handles.
 *
 * Returns -EINVAL if @oflags contains anything outside
 * VALID_FILE_HANDLE_OPEN_FLAGS and O_LARGEFILE, 0 otherwise. @ctx is not
 * inspected.
 */
static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
unsigned int oflags)
{
if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
return -EINVAL;
/*
 * pidfs_ino_get_pid() will verify that the struct pid is part
 * of the caller's pid namespace hierarchy. No further
 * permission checks are needed.
 */
return 0;
}
/*
 * Custom ->open() hook used when a pidfs file handle is opened.
 *
 * O_LARGEFILE is stripped because open_by_handle_at() forces it, and
 * O_RDWR is raised because pidfds always are O_RDWR.
 */
static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
{
	const unsigned int flags = (oflags & ~O_LARGEFILE) | O_RDWR;

	return dentry_open(path, flags, current_cred());
}
/*
 * Export operations for pidfs: file handles carry the 64 bit pidfs
 * identifier, and both the permission check and the open step of
 * open_by_handle_at() are overridden with pidfs specific versions.
 */
static const struct export_operations pidfs_export_operations = {
.encode_fh = pidfs_encode_fh,
.fh_to_dentry = pidfs_fh_to_dentry,
.open = pidfs_export_open,
.permission = pidfs_export_permission,
};
static int pidfs_init_inode(struct inode *inode, void *data)
{
const struct pid *pid = data;
inode->i_private = data;
inode->i_flags |= S_PRIVATE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
/*
* Inode numbering for pidfs start at RESERVED_PIDS + 1. This
* avoids collisions with the root inode which is 1 for pseudo
* filesystems.
*/
return pidfs_inum(data, &inode->i_ino);
inode->i_ino = pidfs_ino(pid->ino);
inode->i_generation = pidfs_gen(pid->ino);
return 0;
}
static void pidfs_put_data(void *data)
@ -462,6 +675,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
return -ENOMEM;
ctx->ops = &pidfs_sops;
ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
fc->s_fs_info = (void *)&pidfs_stashed_ops;
return 0;

View File

@ -3,6 +3,7 @@
#define LINUX_EXPORTFS_H 1
#include <linux/types.h>
#include <linux/path.h>
struct dentry;
struct iattr;
@ -156,6 +157,17 @@ struct fid {
};
};
/* Checks to apply while decoding a file handle; stored in handle_to_path_ctx. */
enum handle_to_path_flags {
HANDLE_CHECK_PERMS = (1 << 0),
HANDLE_CHECK_SUBTREE = (1 << 1),
};
/*
 * Context handed to the ->permission() export operation and used while a
 * file handle is turned into a path.
 */
struct handle_to_path_ctx {
/* Path the decode is performed relative to. */
struct path root;
/* Combination of HANDLE_CHECK_* values. */
enum handle_to_path_flags flags;
/* EXPORT_FH_* flags for the decode. */
unsigned int fh_flags;
};
#define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */
#define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */
#define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */
@ -225,6 +237,12 @@ struct fid {
* is also a directory. In the event that it cannot be found, or storage
* space cannot be allocated, a %ERR_PTR should be returned.
*
* permission:
* Allow filesystems to specify a custom permission function.
*
* open:
* Allow filesystems to specify a custom open function.
*
* commit_metadata:
* @commit_metadata should commit metadata changes to stable storage.
*
@ -251,6 +269,8 @@ struct export_operations {
bool write, u32 *device_generation);
int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
int nr_iomaps, struct iattr *iattr);
int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags);
struct file * (*open)(struct path *path, unsigned int oflags);
#define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */
#define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */
#define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */

View File

@ -59,6 +59,7 @@ struct pid
spinlock_t lock;
struct dentry *stashed;
u64 ino;
struct rb_node pidfs_node;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
@ -68,6 +69,7 @@ struct pid
struct upid numbers[];
};
extern seqcount_spinlock_t pidmap_lock_seq;
extern struct pid init_struct_pid;
struct file;

View File

@ -4,5 +4,8 @@
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
void __init pidfs_init(void);
void pidfs_add_pid(struct pid *pid);
void pidfs_remove_pid(struct pid *pid);
extern const struct dentry_operations pidfs_dentry_operations;
#endif /* _LINUX_PID_FS_H */

View File

@ -5,6 +5,7 @@
struct pseudo_fs_context {
const struct super_operations *ops;
const struct export_operations *eops;
const struct xattr_handler * const *xattr;
const struct dentry_operations *dops;
unsigned long magic;

View File

@ -43,6 +43,7 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
#include <linux/seqlock.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
@ -64,11 +65,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
* Pseudo filesystems start inode numbering after one. We use Reserved
* PIDs as a natural offset.
*/
static u64 pidfs_ino = RESERVED_PIDS;
/*
* PID-map pages start out as NULL, they get allocated upon
@ -108,6 +104,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
void put_pid(struct pid *pid)
{
@ -158,6 +155,7 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr);
}
pidfs_remove_pid(pid);
spin_unlock_irqrestore(&pidmap_lock, flags);
call_rcu(&pid->rcu, delayed_put_pid);
@ -273,22 +271,24 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
pid->stashed = NULL;
pid->ino = ++pidfs_ino;
pidfs_add_pid(pid);
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
return pid;
out_unlock:
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
put_pid_ns(ns);
out_free:

View File

@ -6,3 +6,5 @@ pidfd_wait
pidfd_fdinfo_test
pidfd_getfd_test
pidfd_setns_test
pidfd_file_handle_test
pidfd_bind_mount

View File

@ -2,7 +2,8 @@
CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \
pidfd_file_handle_test pidfd_bind_mount
include ../lib.mk

View File

@ -17,6 +17,7 @@
#include <sys/wait.h>
#include "../kselftest.h"
#include "../clone3/clone3_selftests.h"
#ifndef P_PIDFD
#define P_PIDFD 3
@ -68,6 +69,11 @@
#define PIDFD_SKIP 3
#define PIDFD_XFAIL 4
/* Raw waitid() system call; the trailing rusage argument is always NULL. */
static inline int sys_waitid(int which, pid_t pid, siginfo_t *info, int options)
{
	long ret = syscall(__NR_waitid, which, pid, info, options, NULL);

	return ret;
}
static inline int wait_for_pid(pid_t pid)
{
int status, ret;
@ -114,4 +120,37 @@ static inline int sys_memfd_create(const char *name, unsigned int flags)
return syscall(__NR_memfd_create, name, flags);
}
static inline pid_t create_child(int *pidfd, unsigned flags)
{
struct __clone_args args = {
.flags = CLONE_PIDFD | flags,
.exit_signal = SIGCHLD,
.pidfd = ptr_to_u64(pidfd),
};
return sys_clone3(&args, sizeof(struct __clone_args));
}
/*
 * read() that transparently restarts when interrupted by a signal.
 * Returns the result of the first read() that does not fail with EINTR.
 */
static inline ssize_t read_nointr(int fd, void *buf, size_t count)
{
	for (;;) {
		ssize_t nread = read(fd, buf, count);

		if (nread >= 0 || errno != EINTR)
			return nread;
	}
}
/*
 * write() that transparently restarts when interrupted by a signal.
 * Returns the result of the first write() that does not fail with EINTR.
 */
static inline ssize_t write_nointr(int fd, const void *buf, size_t count)
{
	for (;;) {
		ssize_t nwritten = write(fd, buf, count);

		if (nwritten >= 0 || errno != EINTR)
			return nwritten;
	}
}
#endif /* __PIDFD_H */

View File

@ -0,0 +1,188 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
#define _GNU_SOURCE
#include <fcntl.h>
#include <limits.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <linux/fs.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <unistd.h>
#include "pidfd.h"
#include "../kselftest_harness.h"
/* Fallback syscall number for open_tree() when the headers do not provide one. */
#if !defined(__NR_open_tree)
# if defined(__alpha__)
#  define __NR_open_tree 538
# elif defined(_MIPS_SIM)
#  if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
#   define __NR_open_tree 4428
#  elif _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
#   define __NR_open_tree 6428
#  elif _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
#   define __NR_open_tree 5428
#  endif
# elif defined(__ia64__)
#  define __NR_open_tree (428 + 1024)
# else
#  define __NR_open_tree 428
# endif
#endif
/*
 * Fallback syscall number for move_mount() when the headers do not provide
 * one. On every ABI listed here move_mount() is open_tree() + 1; the ia64
 * entry previously repeated the open_tree() number (428 + 1024).
 */
#ifndef __NR_move_mount
#if defined __alpha__
#define __NR_move_mount 539
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_move_mount 4429
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_move_mount 6429
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_move_mount 5429
#endif
#elif defined __ia64__
#define __NR_move_mount (429 + 1024)
#else
#define __NR_move_mount 429
#endif
#endif
/*
 * Fallback definitions of the move_mount() empty-path flags for headers
 * that lack them. Each flag is guarded by its own name so that a header
 * providing only one of them still gets the other one defined here.
 */
#ifndef MOVE_MOUNT_F_EMPTY_PATH
#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
#endif
#ifndef MOVE_MOUNT_T_EMPTY_PATH
#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
#endif
/* Raw move_mount() system call, independent of any libc wrapper. */
static inline int sys_move_mount(int from_dfd, const char *from_pathname,
				 int to_dfd, const char *to_pathname,
				 unsigned int flags)
{
	long ret = syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
			   to_pathname, flags);

	return ret;
}
/*
 * Fallback definitions of the open_tree() flags used by these tests, for
 * headers that do not provide them.
 */
#ifndef OPEN_TREE_CLONE
#define OPEN_TREE_CLONE 1
#endif
#ifndef OPEN_TREE_CLOEXEC
#define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif
#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
#endif
/* Raw open_tree() system call, independent of any libc wrapper. */
static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
{
	long ret = syscall(__NR_open_tree, dfd, filename, flags);

	return ret;
}
/* Per-test state shared between setup, the test body and teardown. */
FIXTURE(pidfd_bind_mount) {
/* Path of the temporary file created by mkstemp(); also the mount target. */
char template[PATH_MAX];
/* File descriptor of that temporary file. */
int fd_tmp;
/* pidfd referring to the test process itself. */
int pidfd;
/* fstat() result for the original pidfd, taken in setup. */
struct stat st1;
/* fstat() result filled in by the individual test for comparison. */
struct stat st2;
/* FS_IOC_GETVERSION result for the original pidfd, taken in setup. */
__u32 gen1;
/* FS_IOC_GETVERSION result filled in by the individual test. */
__u32 gen2;
/* Set once a test mounted something on @template; teardown then unmounts. */
bool must_unmount;
};
/*
 * Move into a private mount namespace, create the temporary file used as
 * the mount target and open a pidfd for the test process, recording its
 * stat information and inode generation for later comparison.
 */
FIXTURE_SETUP(pidfd_bind_mount)
{
	int len;

	self->must_unmount = false;
	self->fd_tmp = -EBADF;

	ASSERT_EQ(unshare(CLONE_NEWNS), 0);

	len = snprintf(self->template, PATH_MAX, "%s", P_tmpdir "/pidfd_bind_mount_XXXXXX");
	ASSERT_LE(len, PATH_MAX);

	self->fd_tmp = mkstemp(self->template);
	ASSERT_GE(self->fd_tmp, 0);

	self->pidfd = sys_pidfd_open(getpid(), 0);
	ASSERT_GE(self->pidfd, 0);

	ASSERT_GE(fstat(self->pidfd, &self->st1), 0);
	ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen1), 0);
}
/*
 * Undo setup: close the temporary file, unmount whatever a test attached
 * to it, remove it, and release the pidfd that setup opened.
 */
FIXTURE_TEARDOWN(pidfd_bind_mount)
{
	ASSERT_EQ(close(self->fd_tmp), 0);
	if (self->must_unmount)
		ASSERT_EQ(umount2(self->template, 0), 0);
	ASSERT_EQ(unlink(self->template), 0);
	/* Setup opened this pidfd; nothing else closes it. */
	EXPECT_EQ(close(self->pidfd), 0);
}
/*
 * Test that a detached mount can be created for a pidfd and then
 * attached to the filesystem hierarchy.
 */
TEST_F(pidfd_bind_mount, bind_mount)
{
	int fd_tree;

	fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
	ASSERT_GE(fd_tree, 0);

	/*
	 * Go through the raw syscall wrapper defined in this file, like
	 * sys_open_tree() above, so the test does not depend on the libc
	 * providing a move_mount() function.
	 */
	ASSERT_EQ(sys_move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
	/* From here on teardown has to unmount the temporary file. */
	self->must_unmount = true;

	ASSERT_EQ(close(fd_tree), 0);
}
/* Test that a pidfd can be reopened through procfs. */
TEST_F(pidfd_bind_mount, reopen)
{
	int pidfd;
	char proc_path[PATH_MAX];

	snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", self->pidfd);
	pidfd = open(proc_path, O_RDONLY | O_NOCTTY | O_CLOEXEC);
	ASSERT_GE(pidfd, 0);

	/*
	 * Query the reopened descriptor, not self->pidfd: comparing the
	 * original pidfd against itself would pass no matter what the
	 * reopen produced.
	 */
	ASSERT_GE(fstat(pidfd, &self->st2), 0);
	ASSERT_EQ(ioctl(pidfd, FS_IOC_GETVERSION, &self->gen2), 0);

	ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
	ASSERT_TRUE(self->gen1 == self->gen2);

	ASSERT_EQ(close(pidfd), 0);
}
/*
 * Test that a detached mount can be created for a pidfd and then
 * attached to the filesystem hierarchy and reopened.
 */
TEST_F(pidfd_bind_mount, bind_mount_reopen)
{
	int fd_tree, fd_pidfd_mnt;

	fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
	ASSERT_GE(fd_tree, 0);

	/* Use the raw syscall wrapper from this file rather than a libc move_mount(). */
	ASSERT_EQ(sys_move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
	/* From here on teardown has to unmount the temporary file. */
	self->must_unmount = true;

	fd_pidfd_mnt = openat(-EBADF, self->template, O_RDONLY | O_NOCTTY | O_CLOEXEC);
	ASSERT_GE(fd_pidfd_mnt, 0);

	/*
	 * Both checks look at the descriptor obtained by opening the mount
	 * point, since that reopened pidfd is what this test is about.
	 */
	ASSERT_GE(fstat(fd_pidfd_mnt, &self->st2), 0);
	ASSERT_EQ(ioctl(fd_pidfd_mnt, FS_IOC_GETVERSION, &self->gen2), 0);

	ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
	ASSERT_TRUE(self->gen1 == self->gen2);

	ASSERT_EQ(close(fd_tree), 0);
	ASSERT_EQ(close(fd_pidfd_mnt), 0);
}
TEST_HARNESS_MAIN

View File

@ -0,0 +1,503 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <linux/types.h>
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/kcmp.h>
#include <sys/stat.h>
#include "pidfd.h"
#include "../kselftest_harness.h"
/* Per-test state: the test process itself plus three helper children. */
FIXTURE(file_handle)
{
/* PID of the test process and a pidfd referring to it. */
pid_t pid;
int pidfd;
/* Child created with CLONE_NEWUSER. */
pid_t child_pid1;
int child_pidfd1;
/* Child created with CLONE_NEWUSER | CLONE_NEWPID. */
pid_t child_pid2;
int child_pidfd2;
/* Second child created with CLONE_NEWUSER | CLONE_NEWPID. */
pid_t child_pid3;
int child_pidfd3;
};
FIXTURE_SETUP(file_handle)
{
int ret;
int ipc_sockets[2];
char c;
self->pid = getpid();
self->pidfd = sys_pidfd_open(self->pid, 0);
ASSERT_GE(self->pidfd, 0);
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
EXPECT_EQ(ret, 0);
self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER);
EXPECT_GE(self->child_pid1, 0);
if (self->child_pid1 == 0) {
close(ipc_sockets[0]);
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
_exit(EXIT_FAILURE);
close(ipc_sockets[1]);
pause();
_exit(EXIT_SUCCESS);
}
close(ipc_sockets[1]);
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
close(ipc_sockets[0]);
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
EXPECT_EQ(ret, 0);
self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID);
EXPECT_GE(self->child_pid2, 0);
if (self->child_pid2 == 0) {
close(ipc_sockets[0]);
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
_exit(EXIT_FAILURE);
close(ipc_sockets[1]);
pause();
_exit(EXIT_SUCCESS);
}
close(ipc_sockets[1]);
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
close(ipc_sockets[0]);
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
EXPECT_EQ(ret, 0);
self->child_pid3 = create_child(&self->child_pidfd3, CLONE_NEWUSER | CLONE_NEWPID);
EXPECT_GE(self->child_pid3, 0);
if (self->child_pid3 == 0) {
close(ipc_sockets[0]);
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
_exit(EXIT_FAILURE);
close(ipc_sockets[1]);
pause();
_exit(EXIT_SUCCESS);
}
close(ipc_sockets[1]);
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
close(ipc_sockets[0]);
}
/*
 * Kill and reap the helper children and close all pidfds.
 *
 * Children 1 and 2 are always signalled and reaped. Child 3 is handled
 * only while its pidfd is still non-negative, because some tests kill and
 * reap it themselves and then set child_pidfd3 to -EBADF.
 */
FIXTURE_TEARDOWN(file_handle)
{
EXPECT_EQ(close(self->pidfd), 0);
EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0), 0);
if (self->child_pidfd1 >= 0)
EXPECT_EQ(0, close(self->child_pidfd1));
EXPECT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0), 0);
if (self->child_pidfd2 >= 0)
EXPECT_EQ(0, close(self->child_pidfd2));
EXPECT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
/* A test may already have killed and reaped child 3. */
if (self->child_pidfd3 >= 0) {
EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd3, SIGKILL, NULL, 0), 0);
EXPECT_EQ(0, close(self->child_pidfd3));
EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
}
}
/*
* Test that we can decode a pidfs file handle in the same pid
* namespace.
*/
TEST_F(file_handle, file_handle_same_pidns)
{
int mnt_id;
struct file_handle *fh;
int pidfd = -EBADF;
struct stat st1, st2;
fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
ASSERT_NE(fh, NULL);
memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
fh->handle_bytes = MAX_HANDLE_SZ;
ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0);
pidfd = open_by_handle_at(self->pidfd, fh, 0);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
free(fh);
}
/*
* Test that we can decode a pidfs file handle from a child pid
* namespace.
*/
TEST_F(file_handle, file_handle_child_pidns)
{
int mnt_id;
struct file_handle *fh;
int pidfd = -EBADF;
struct stat st1, st2;
fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
ASSERT_NE(fh, NULL);
memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
fh->handle_bytes = MAX_HANDLE_SZ;
ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
pidfd = open_by_handle_at(self->pidfd, fh, 0);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
free(fh);
}
/*
 * Test that decoding fails when the pidfs file handle refers to a process
 * outside the caller's pid namespace hierarchy: the handle is encoded for
 * the test process itself, then the test joins the user and pid namespace
 * of child 2 and tries to decode the handle from a freshly forked child.
 */
TEST_F(file_handle, file_handle_foreign_pidns)
{
int mnt_id;
struct file_handle *fh;
pid_t pid;
fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
ASSERT_NE(fh, NULL);
memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
fh->handle_bytes = MAX_HANDLE_SZ;
ASSERT_EQ(name_to_handle_at(self->pidfd, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
ASSERT_EQ(setns(self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID), 0);
/* The decode attempt is made from a child forked after the setns() call. */
pid = fork();
ASSERT_GE(pid, 0);
if (pid == 0) {
int pidfd = open_by_handle_at(self->pidfd, fh, 0);
if (pidfd >= 0) {
TH_LOG("Managed to open pidfd outside of the caller's pid namespace hierarchy");
_exit(1);
}
_exit(0);
}
ASSERT_EQ(wait_for_pid(pid), 0);
free(fh);
}
/*
 * Test that we can decode a pidfs file handle of a process that has
 * exited but not been reaped.
 */
TEST_F(file_handle, pid_has_exited)
{
	int mnt_id, pidfd, child_pidfd3;
	struct file_handle *fh;
	struct stat st1, st2;

	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);
	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
	fh->handle_bytes = MAX_HANDLE_SZ;

	ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);

	ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);

	pidfd = open_by_handle_at(self->pidfd, fh, 0);
	ASSERT_GE(pidfd, 0);

	ASSERT_EQ(fstat(pidfd, &st2), 0);
	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);

	ASSERT_EQ(close(pidfd), 0);

	/* Take over child 3 so that teardown does not touch it again. */
	child_pidfd3 = self->child_pidfd3;
	self->child_pidfd3 = -EBADF;
	EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
	EXPECT_EQ(close(child_pidfd3), 0);
	/* WNOWAIT: wait for the exit but leave the child unreaped. */
	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED | WNOWAIT), 0);

	pidfd = open_by_handle_at(self->pidfd, fh, 0);
	ASSERT_GE(pidfd, 0);
	/* This second descriptor was previously left open. */
	EXPECT_EQ(close(pidfd), 0);

	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);

	/* The handle buffer was previously never released. */
	free(fh);
}
/*
* Test that we fail to decode a pidfs file handle of a process that has
* already been reaped.
*/
TEST_F(file_handle, pid_has_been_reaped)
{
	int mnt_id, pidfd, child_pidfd3;
	struct file_handle *fh;
	struct stat st1, st2;

	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);
	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
	fh->handle_bytes = MAX_HANDLE_SZ;

	ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);

	ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);

	/* Before the child is killed the handle must decode to the same inode. */
	pidfd = open_by_handle_at(self->pidfd, fh, 0);
	ASSERT_GE(pidfd, 0);

	ASSERT_EQ(fstat(pidfd, &st2), 0);
	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);

	ASSERT_EQ(close(pidfd), 0);

	/*
	 * Take over the fixture's pidfd and invalidate the fixture's copy
	 * before closing it here (presumably so fixture teardown does not
	 * touch the closed descriptor).
	 */
	child_pidfd3 = self->child_pidfd3;
	self->child_pidfd3 = -EBADF;
	EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
	EXPECT_EQ(close(child_pidfd3), 0);

	/* No WNOWAIT here: this wait reaps the child. */
	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);

	/* Once the process has been reaped the handle must no longer decode. */
	pidfd = open_by_handle_at(self->pidfd, fh, 0);
	ASSERT_LT(pidfd, 0);

	/* Release the handle buffer, as the sibling tests do. */
	free(fh);
}
/*
* Test valid flags to open a pidfd file handle. Note, that
* PIDFD_NONBLOCK is defined as O_NONBLOCK and O_NONBLOCK is an alias to
* O_NDELAY. Also note that PIDFD_THREAD is an alias for O_EXCL.
*/
TEST_F(file_handle, open_by_handle_at_valid_flags)
{
	int mnt_id;
	struct file_handle *fh;
	int pidfd = -EBADF;
	struct stat st1, st2;

	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);
	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
	fh->handle_bytes = MAX_HANDLE_SZ;

	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);

	ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);

	/* All accepted flags are passed together in a single open call. */
	pidfd = open_by_handle_at(self->pidfd, fh,
				  O_RDONLY |
				  O_WRONLY |
				  O_RDWR |
				  O_NONBLOCK |
				  O_NDELAY |
				  O_CLOEXEC |
				  O_EXCL);
	ASSERT_GE(pidfd, 0);

	/* The decoded pidfd must refer to the same inode as child_pidfd2. */
	ASSERT_EQ(fstat(pidfd, &st2), 0);
	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);

	ASSERT_EQ(close(pidfd), 0);

	/* Release the handle buffer, as the sibling tests do. */
	free(fh);
}
/*
* Test that invalid flags passed to open a pidfd file handle are
* rejected.
*/
TEST_F(file_handle, open_by_handle_at_invalid_flags)
{
	int mnt_id;
	struct file_handle *fh;
	int pidfd = -EBADF;
	/* Each flag is tried on its own and must make the open fail. */
	static const struct invalid_pidfs_file_handle_flags {
		int oflag;
		const char *oflag_name;
	} invalid_pidfs_file_handle_flags[] = {
		{ FASYNC,	"FASYNC"	},
		{ O_CREAT,	"O_CREAT"	},
		{ O_NOCTTY,	"O_NOCTTY"	},
		{ O_TRUNC,	"O_TRUNC"	},
		{ O_APPEND,	"O_APPEND"	},
		{ O_SYNC,	"O_SYNC"	},
		{ O_DSYNC,	"O_DSYNC"	},
		{ O_DIRECT,	"O_DIRECT"	},
		{ O_DIRECTORY,	"O_DIRECTORY"	},
		{ O_NOFOLLOW,	"O_NOFOLLOW"	},
		{ O_NOATIME,	"O_NOATIME"	},
		{ O_PATH,	"O_PATH"	},
		{ O_TMPFILE,	"O_TMPFILE"	},
		/*
		 * O_LARGEFILE is added implicitly by
		 * open_by_handle_at() so pidfs simply masks it off.
		 */
	};

	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);
	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
	fh->handle_bytes = MAX_HANDLE_SZ;

	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);

	/* size_t index: ARRAY_SIZE() yields an unsigned value. */
	for (size_t i = 0; i < ARRAY_SIZE(invalid_pidfs_file_handle_flags); i++) {
		pidfd = open_by_handle_at(self->pidfd, fh, invalid_pidfs_file_handle_flags[i].oflag);
		ASSERT_LT(pidfd, 0) {
			TH_LOG("open_by_handle_at() succeeded with invalid flags: %s", invalid_pidfs_file_handle_flags[i].oflag_name);
		}
	}

	/* Release the handle buffer, as the sibling tests do. */
	free(fh);
}
/* Test that lookup fails. */
TEST_F(file_handle, lookup_must_fail)
{
	int mnt_id;
	struct file_handle *fh;

	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);
	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
	fh->handle_bytes = MAX_HANDLE_SZ;

	/* A non-empty path relative to a pidfd must fail with ENOTDIR ... */
	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, AT_EMPTY_PATH), 0);
	ASSERT_EQ(errno, ENOTDIR);

	/* ... whether or not AT_EMPTY_PATH is set. */
	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, 0), 0);
	ASSERT_EQ(errno, ENOTDIR);

	/* Release the handle buffer, as the sibling tests do. */
	free(fh);
}
/* Fallback for builds whose included headers do not define AT_HANDLE_CONNECTABLE. */
#ifndef AT_HANDLE_CONNECTABLE
#define AT_HANDLE_CONNECTABLE 0x002
#endif
/*
* Test that AT_HANDLE_CONNECTABLE is rejected. Connectable file handles
* don't make sense for pidfs. Note that currently AT_HANDLE_CONNECTABLE
* is rejected because it is incompatible with AT_EMPTY_PATH which is
* required with pidfds as we don't support lookup.
*/
TEST_F(file_handle, invalid_name_to_handle_at_flags)
{
	int mnt_id;
	struct file_handle *fh;

	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);
	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
	fh->handle_bytes = MAX_HANDLE_SZ;

	/* Encoding with AT_HANDLE_CONNECTABLE must fail for a pidfd. */
	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_CONNECTABLE), 0);

	/* Release the handle buffer, as the sibling tests do. */
	free(fh);
}
/* Fallback for builds whose included headers do not define AT_HANDLE_FID. */
#ifndef AT_HANDLE_FID
#define AT_HANDLE_FID 0x200
#endif
/*
* Test that a request with AT_HANDLE_FID always leads to a decodable
* file handle as pidfs always provides export operations.
*/
TEST_F(file_handle, valid_name_to_handle_at_flags)
{
	int mnt_id, pidfd;
	struct file_handle *fh;
	struct stat st1, st2;

	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
	ASSERT_NE(fh, NULL);
	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
	fh->handle_bytes = MAX_HANDLE_SZ;

	/* Encode with AT_HANDLE_FID; the resulting handle must still be decodable. */
	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_FID), 0);

	ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);

	pidfd = open_by_handle_at(self->pidfd, fh, 0);
	ASSERT_GE(pidfd, 0);

	/* The decoded pidfd must refer to the same inode as child_pidfd2. */
	ASSERT_EQ(fstat(pidfd, &st2), 0);
	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);

	ASSERT_EQ(close(pidfd), 0);

	/* Release the handle buffer, as the sibling tests do. */
	free(fh);
}
/* NOTE(review): presumably expands to the kselftest harness main() that runs the tests above - confirm against kselftest_harness.h. */
TEST_HARNESS_MAIN

View File

@ -19,7 +19,6 @@
#include <linux/ioctl.h>
#include "pidfd.h"
#include "../clone3/clone3_selftests.h"
#include "../kselftest_harness.h"
#ifndef PIDFS_IOCTL_MAGIC
@ -118,22 +117,6 @@ FIXTURE(current_nsset)
int child_pidfd_derived_nsfds2[PIDFD_NS_MAX];
};
/*
 * Thin wrapper around the raw waitid system call. The third and fifth
 * syscall arguments are passed as NULL, so no siginfo or rusage data is
 * returned to the caller; only the syscall() return value is propagated.
 */
static int sys_waitid(int which, pid_t pid, int options)
{
return syscall(__NR_waitid, which, pid, NULL, options, NULL);
}
/*
 * Create a child process via clone3().
 *
 * @pidfd: location handed to clone3() through the .pidfd argument;
 *         CLONE_PIDFD is always set in addition to @flags.
 * @flags: extra clone flags OR-ed into the request.
 *
 * The child's exit signal is SIGCHLD. Returns whatever sys_clone3()
 * returns.
 */
pid_t create_child(int *pidfd, unsigned flags)
{
	struct __clone_args args = {
		.flags		= CLONE_PIDFD | flags,
		.exit_signal	= SIGCHLD,
		.pidfd		= ptr_to_u64(pidfd),
	};

	/*
	 * Report the size of the object actually passed in rather than the
	 * size of the unrelated struct clone_args type.
	 */
	return sys_clone3(&args, sizeof(args));
}
static bool switch_timens(void)
{
int fd, ret;
@ -150,28 +133,6 @@ static bool switch_timens(void)
return ret == 0;
}
/*
 * read() that transparently restarts when the call fails with EINTR.
 * Any other outcome (success, end of file, or a different error) is
 * returned to the caller unchanged.
 */
static ssize_t read_nointr(int fd, void *buf, size_t count)
{
	for (;;) {
		ssize_t nread = read(fd, buf, count);

		if (nread >= 0 || errno != EINTR)
			return nread;
	}
}
/*
 * write() that transparently restarts when the call fails with EINTR.
 * Any other outcome (including a short write or a different error) is
 * returned to the caller unchanged.
 */
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
	for (;;) {
		ssize_t nwritten = write(fd, buf, count);

		if (nwritten >= 0 || errno != EINTR)
			return nwritten;
	}
}
FIXTURE_SETUP(current_nsset)
{
int i, proc_fd, ret;
@ -229,7 +190,7 @@ FIXTURE_SETUP(current_nsset)
_exit(EXIT_SUCCESS);
}
ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED | WNOWAIT), 0);
self->pidfd = sys_pidfd_open(self->pid, 0);
EXPECT_GE(self->pidfd, 0) {
@ -432,9 +393,9 @@ FIXTURE_TEARDOWN(current_nsset)
EXPECT_EQ(0, close(self->child_pidfd1));
if (self->child_pidfd2 >= 0)
EXPECT_EQ(0, close(self->child_pidfd2));
ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED), 0);
ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
}
static int preserve_ns(const int pid, const char *ns)

View File

@ -26,22 +26,11 @@
#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__)
#endif
/*
 * Thin wrapper around the raw clone3 system call. The size argument is
 * always sizeof(struct clone_args); the syscall() return value is handed
 * back as a pid_t.
 */
static pid_t sys_clone3(struct clone_args *args)
{
return syscall(__NR_clone3, args, sizeof(struct clone_args));
}
/*
 * Thin wrapper around the raw waitid system call that forwards all five
 * arguments, including the siginfo (@info) and rusage (@ru) pointers,
 * and returns the syscall() result.
 */
static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options,
struct rusage *ru)
{
return syscall(__NR_waitid, which, pid, info, options, ru);
}
TEST(wait_simple)
{
int pidfd = -1;
pid_t parent_tid = -1;
struct clone_args args = {
struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.pidfd = ptr_to_u64(&pidfd),
.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
@ -55,7 +44,7 @@ TEST(wait_simple)
pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
ASSERT_GE(pidfd, 0);
pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_NE(pid, 0);
EXPECT_EQ(close(pidfd), 0);
pidfd = -1;
@ -63,18 +52,18 @@ TEST(wait_simple)
pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC);
ASSERT_GE(pidfd, 0);
pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_NE(pid, 0);
EXPECT_EQ(close(pidfd), 0);
pidfd = -1;
pid = sys_clone3(&args);
pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0)
exit(EXIT_SUCCESS);
pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_GE(pid, 0);
ASSERT_EQ(WIFEXITED(info.si_status), true);
ASSERT_EQ(WEXITSTATUS(info.si_status), 0);
@ -89,7 +78,7 @@ TEST(wait_states)
{
int pidfd = -1;
pid_t parent_tid = -1;
struct clone_args args = {
struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.pidfd = ptr_to_u64(&pidfd),
.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
@ -102,7 +91,7 @@ TEST(wait_states)
};
ASSERT_EQ(pipe(pfd), 0);
pid = sys_clone3(&args);
pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0) {
@ -117,28 +106,28 @@ TEST(wait_states)
}
close(pfd[0]);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED), 0);
ASSERT_EQ(write(pfd[1], "C", 1), 1);
close(pfd[1]);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_CONTINUED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_KILLED);
ASSERT_EQ(info.si_pid, parent_tid);
@ -151,7 +140,7 @@ TEST(wait_nonblock)
int pidfd;
unsigned int flags = 0;
pid_t parent_tid = -1;
struct clone_args args = {
struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.flags = CLONE_PARENT_SETTID,
.exit_signal = SIGCHLD,
@ -173,12 +162,12 @@ TEST(wait_nonblock)
SKIP(return, "Skipping PIDFD_NONBLOCK test");
}
ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_LT(ret, 0);
ASSERT_EQ(errno, ECHILD);
EXPECT_EQ(close(pidfd), 0);
pid = sys_clone3(&args);
pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0) {
@ -201,7 +190,7 @@ TEST(wait_nonblock)
* Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when
* child processes exist but none have exited.
*/
ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_LT(ret, 0);
ASSERT_EQ(errno, EAGAIN);
@ -210,19 +199,19 @@ TEST(wait_nonblock)
* WNOHANG raised explicitly when child processes exist but none have
* exited.
*/
ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL);
ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG);
ASSERT_EQ(ret, 0);
ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_EXITED);
ASSERT_EQ(info.si_pid, parent_tid);