Merge branch 'kernel-6.14.pid' into vfs.all

This commit is contained in:
Christian Brauner 2025-01-10 16:18:07 +01:00
commit 03692e7281
No known key found for this signature in database
GPG Key ID: 91C61BC06578DCA2
11 changed files with 521 additions and 36 deletions

View File

@ -108,9 +108,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type);
extern int pid_max;
extern int pid_max_min, pid_max_max;
/*
* look up a PID in the hash table. Must be called with the tasklist_lock
* or rcu_read_lock() held.

View File

@ -30,6 +30,7 @@ struct pid_namespace {
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
int pid_max;
struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT
struct fs_pin *bacct;
@ -38,9 +39,14 @@ struct pid_namespace {
struct ucounts *ucounts;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
struct work_struct work;
#ifdef CONFIG_SYSCTL
struct ctl_table_set set;
struct ctl_table_header *sysctls;
#if defined(CONFIG_MEMFD_CREATE)
int memfd_noexec_scope;
#endif
#endif
} __randomize_layout;
extern struct pid_namespace init_pid_ns;
@ -117,6 +123,8 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);
int register_pidns_sysctls(struct pid_namespace *pidns);
void unregister_pidns_sysctls(struct pid_namespace *pidns);
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{

View File

@ -61,10 +61,8 @@ struct pid init_struct_pid = {
}, }
};
int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
static int pid_max_min = RESERVED_PIDS + 1;
static int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
@ -83,6 +81,7 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
.pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
@ -191,6 +190,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
for (i = ns->level; i >= 0; i--) {
int tid = 0;
int pid_max = READ_ONCE(tmp->pid_max);
if (set_tid_size) {
tid = set_tid[ns->level - i];
@ -644,17 +644,118 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd;
}
#ifdef CONFIG_SYSCTL
static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
{
return &task_active_pid_ns(current)->set;
}
static int set_is_seen(struct ctl_table_set *set)
{
return &task_active_pid_ns(current)->set == set;
}
static int pid_table_root_permissions(struct ctl_table_header *head,
const struct ctl_table *table)
{
struct pid_namespace *pidns =
container_of(head->set, struct pid_namespace, set);
int mode = table->mode;
if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
mode = (mode & S_IRWXU) >> 6;
else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
mode = (mode & S_IRWXG) >> 3;
else
mode = mode & S_IROTH;
return (mode << 6) | (mode << 3) | mode;
}
static void pid_table_root_set_ownership(struct ctl_table_header *head,
kuid_t *uid, kgid_t *gid)
{
struct pid_namespace *pidns =
container_of(head->set, struct pid_namespace, set);
kuid_t ns_root_uid;
kgid_t ns_root_gid;
ns_root_uid = make_kuid(pidns->user_ns, 0);
if (uid_valid(ns_root_uid))
*uid = ns_root_uid;
ns_root_gid = make_kgid(pidns->user_ns, 0);
if (gid_valid(ns_root_gid))
*gid = ns_root_gid;
}
static struct ctl_table_root pid_table_root = {
.lookup = pid_table_root_lookup,
.permissions = pid_table_root_permissions,
.set_ownership = pid_table_root_set_ownership,
};
static struct ctl_table pid_table[] = {
{
.procname = "pid_max",
.data = &init_pid_ns.pid_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
};
#endif
int register_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
struct ctl_table *tbl;
setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
if (!tbl)
return -ENOMEM;
tbl->data = &pidns->pid_max;
pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
ARRAY_SIZE(pid_table));
if (!pidns->sysctls) {
kfree(tbl);
retire_sysctl_set(&pidns->set);
return -ENOMEM;
}
#endif
return 0;
}
void unregister_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
const struct ctl_table *tbl;
tbl = pidns->sysctls->ctl_table_arg;
unregister_sysctl_table(pidns->sysctls);
retire_sysctl_set(&pidns->set);
kfree(tbl);
#endif
}
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
idr_init(&init_pid_ns.idr);
@ -665,6 +766,16 @@ void __init pid_idr_init(void)
NULL);
}
static __init int pid_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
/* "kernel" directory will have already been initialized. */
BUG_ON(register_pidns_sysctls(&init_pid_ns));
#endif
return 0;
}
subsys_initcall(pid_namespace_sysctl_init);
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
struct file *file;

View File

@ -70,6 +70,8 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
static void destroy_pid_namespace_work(struct work_struct *work);
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
@ -105,17 +107,27 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
goto out_free_idr;
ns->ns.ops = &pidns_operations;
ns->pid_max = parent_pid_ns->pid_max;
err = register_pidns_sysctls(ns);
if (err)
goto out_free_inum;
refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->work, destroy_pid_namespace_work);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
return ns;
out_free_inum:
ns_free_inum(&ns->ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@ -137,12 +149,28 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
unregister_pidns_sysctls(ns);
ns_free_inum(&ns->ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
static void destroy_pid_namespace_work(struct work_struct *work)
{
struct pid_namespace *ns =
container_of(work, struct pid_namespace, work);
do {
struct pid_namespace *parent;
parent = ns->parent;
destroy_pid_namespace(ns);
ns = parent;
} while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
}
struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
@ -155,15 +183,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
void put_pid_ns(struct pid_namespace *ns)
{
struct pid_namespace *parent;
while (ns != &init_pid_ns) {
parent = ns->parent;
if (!refcount_dec_and_test(&ns->ns.count))
break;
destroy_pid_namespace(ns);
ns = parent;
}
if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@ -274,6 +295,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next;
tmp.extra2 = &pid_ns->pid_max;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (!ret && write)
idr_set_cursor(&pid_ns->idr, next + 1);
@ -281,7 +303,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
return ret;
}
extern int pid_max;
static struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
@ -289,7 +310,7 @@ static struct ctl_table pid_ns_ctl_table[] = {
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &pid_max,
.extra2 = &init_pid_ns.pid_max,
},
};
#endif /* CONFIG_CHECKPOINT_RESTORE */

View File

@ -1803,15 +1803,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
{
.procname = "pid_max",
.data = &pid_max,
.maxlen = sizeof (int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
{
.procname = "panic_on_oops",
.data = &panic_on_oops,

View File

@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
WARN_ON_ONCE(pid_max > (1 << 30));
WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)

View File

@ -717,8 +717,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */
extern int pid_max;
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,

View File

@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
if (tgid_map)
return 0;
tgid_map_max = pid_max;
tgid_map_max = init_pid_ns.pid_max;
map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
GFP_KERNEL);
if (!map)

View File

@ -1 +1,2 @@
pid_max
regression_enomem

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -g $(KHDR_INCLUDES)
TEST_GEN_PROGS = regression_enomem
TEST_GEN_PROGS = regression_enomem pid_max
LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h

View File

@ -0,0 +1,358 @@
/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/wait.h>
#include "../kselftest_harness.h"
#include "../pidfd/pidfd.h"
#define __STACK_SIZE (8 * 1024 * 1024)
static pid_t do_clone(int (*fn)(void *), void *arg, int flags)
{
char *stack;
pid_t ret;
stack = malloc(__STACK_SIZE);
if (!stack)
return -ENOMEM;
#ifdef __ia64__
ret = __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg);
#else
ret = clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg);
#endif
free(stack);
return ret;
}
static int pid_max_cb(void *data)
{
int fd, ret;
pid_t pid;
ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
if (ret) {
fprintf(stderr, "%m - Failed to make rootfs private mount\n");
return -1;
}
umount2("/proc", MNT_DETACH);
ret = mount("proc", "/proc", "proc", 0, NULL);
if (ret) {
fprintf(stderr, "%m - Failed to mount proc\n");
return -1;
}
fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
if (fd < 0) {
fprintf(stderr, "%m - Failed to open pid_max\n");
return -1;
}
ret = write(fd, "500", sizeof("500") - 1);
if (ret < 0) {
fprintf(stderr, "%m - Failed to write pid_max\n");
return -1;
}
for (int i = 0; i < 501; i++) {
pid = fork();
if (pid == 0)
exit(EXIT_SUCCESS);
wait_for_pid(pid);
if (pid > 500) {
fprintf(stderr, "Managed to create pid number beyond limit\n");
return -1;
}
}
return 0;
}
static int pid_max_nested_inner(void *data)
{
int fret = -1;
pid_t pids[2];
int fd, i, ret;
ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
if (ret) {
fprintf(stderr, "%m - Failed to make rootfs private mount\n");
return fret;
}
umount2("/proc", MNT_DETACH);
ret = mount("proc", "/proc", "proc", 0, NULL);
if (ret) {
fprintf(stderr, "%m - Failed to mount proc\n");
return fret;
}
fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
if (fd < 0) {
fprintf(stderr, "%m - Failed to open pid_max\n");
return fret;
}
ret = write(fd, "500", sizeof("500") - 1);
close(fd);
if (ret < 0) {
fprintf(stderr, "%m - Failed to write pid_max\n");
return fret;
}
pids[0] = fork();
if (pids[0] < 0) {
fprintf(stderr, "Failed to create first new process\n");
return fret;
}
if (pids[0] == 0)
exit(EXIT_SUCCESS);
pids[1] = fork();
wait_for_pid(pids[0]);
if (pids[1] >= 0) {
if (pids[1] == 0)
exit(EXIT_SUCCESS);
wait_for_pid(pids[1]);
fprintf(stderr, "Managed to create process even though ancestor pid namespace had a limit\n");
return fret;
}
/* Now make sure that we wrap pids at 400. */
for (i = 0; i < 510; i++) {
pid_t pid;
pid = fork();
if (pid < 0)
return fret;
if (pid == 0)
exit(EXIT_SUCCESS);
wait_for_pid(pid);
if (pid >= 500) {
fprintf(stderr, "Managed to create process with pid %d beyond configured limit\n", pid);
return fret;
}
}
return 0;
}
static int pid_max_nested_outer(void *data)
{
int fret = -1, nr_procs = 400;
pid_t pids[1000];
int fd, i, ret;
pid_t pid;
ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
if (ret) {
fprintf(stderr, "%m - Failed to make rootfs private mount\n");
return fret;
}
umount2("/proc", MNT_DETACH);
ret = mount("proc", "/proc", "proc", 0, NULL);
if (ret) {
fprintf(stderr, "%m - Failed to mount proc\n");
return fret;
}
fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
if (fd < 0) {
fprintf(stderr, "%m - Failed to open pid_max\n");
return fret;
}
ret = write(fd, "400", sizeof("400") - 1);
close(fd);
if (ret < 0) {
fprintf(stderr, "%m - Failed to write pid_max\n");
return fret;
}
/*
* Create 397 processes. This leaves room for do_clone() (398) and
* one more 399. So creating another process needs to fail.
*/
for (nr_procs = 0; nr_procs < 396; nr_procs++) {
pid = fork();
if (pid < 0)
goto reap;
if (pid == 0)
exit(EXIT_SUCCESS);
pids[nr_procs] = pid;
}
pid = do_clone(pid_max_nested_inner, NULL, CLONE_NEWPID | CLONE_NEWNS);
if (pid < 0) {
fprintf(stderr, "%m - Failed to clone nested pidns\n");
goto reap;
}
if (wait_for_pid(pid)) {
fprintf(stderr, "%m - Nested pid_max failed\n");
goto reap;
}
fret = 0;
reap:
for (int i = 0; i < nr_procs; i++)
wait_for_pid(pids[i]);
return fret;
}
static int pid_max_nested_limit_inner(void *data)
{
int fret = -1, nr_procs = 400;
int fd, ret;
pid_t pid;
pid_t pids[1000];
ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
if (ret) {
fprintf(stderr, "%m - Failed to make rootfs private mount\n");
return fret;
}
umount2("/proc", MNT_DETACH);
ret = mount("proc", "/proc", "proc", 0, NULL);
if (ret) {
fprintf(stderr, "%m - Failed to mount proc\n");
return fret;
}
fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
if (fd < 0) {
fprintf(stderr, "%m - Failed to open pid_max\n");
return fret;
}
ret = write(fd, "500", sizeof("500") - 1);
close(fd);
if (ret < 0) {
fprintf(stderr, "%m - Failed to write pid_max\n");
return fret;
}
for (nr_procs = 0; nr_procs < 500; nr_procs++) {
pid = fork();
if (pid < 0)
break;
if (pid == 0)
exit(EXIT_SUCCESS);
pids[nr_procs] = pid;
}
if (nr_procs >= 400) {
fprintf(stderr, "Managed to create processes beyond the configured outer limit\n");
goto reap;
}
fret = 0;
reap:
for (int i = 0; i < nr_procs; i++)
wait_for_pid(pids[i]);
return fret;
}
static int pid_max_nested_limit_outer(void *data)
{
int fd, ret;
pid_t pid;
ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
if (ret) {
fprintf(stderr, "%m - Failed to make rootfs private mount\n");
return -1;
}
umount2("/proc", MNT_DETACH);
ret = mount("proc", "/proc", "proc", 0, NULL);
if (ret) {
fprintf(stderr, "%m - Failed to mount proc\n");
return -1;
}
fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
if (fd < 0) {
fprintf(stderr, "%m - Failed to open pid_max\n");
return -1;
}
ret = write(fd, "400", sizeof("400") - 1);
close(fd);
if (ret < 0) {
fprintf(stderr, "%m - Failed to write pid_max\n");
return -1;
}
pid = do_clone(pid_max_nested_limit_inner, NULL, CLONE_NEWPID | CLONE_NEWNS);
if (pid < 0) {
fprintf(stderr, "%m - Failed to clone nested pidns\n");
return -1;
}
if (wait_for_pid(pid)) {
fprintf(stderr, "%m - Nested pid_max failed\n");
return -1;
}
return 0;
}
TEST(pid_max_simple)
{
pid_t pid;
pid = do_clone(pid_max_cb, NULL, CLONE_NEWPID | CLONE_NEWNS);
ASSERT_GT(pid, 0);
ASSERT_EQ(0, wait_for_pid(pid));
}
TEST(pid_max_nested_limit)
{
pid_t pid;
pid = do_clone(pid_max_nested_limit_outer, NULL, CLONE_NEWPID | CLONE_NEWNS);
ASSERT_GT(pid, 0);
ASSERT_EQ(0, wait_for_pid(pid));
}
TEST(pid_max_nested)
{
pid_t pid;
pid = do_clone(pid_max_nested_outer, NULL, CLONE_NEWPID | CLONE_NEWNS);
ASSERT_GT(pid, 0);
ASSERT_EQ(0, wait_for_pid(pid));
}
TEST_HARNESS_MAIN