2019-05-19 13:08:55 +01:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Generic pidhash and scalable, time-bounded PID allocator
|
|
|
|
*
|
2012-12-06 10:39:54 +01:00
|
|
|
* (C) 2002-2003 Nadia Yvette Chambers, IBM
|
|
|
|
* (C) 2004 Nadia Yvette Chambers, Oracle
|
2005-04-16 15:20:36 -07:00
|
|
|
* (C) 2002-2004 Ingo Molnar, Red Hat
|
|
|
|
*
|
|
|
|
* pid-structures are backing objects for tasks sharing a given ID to chain
|
|
|
|
* against. There is very little to them aside from hashing them and
|
|
|
|
* parking tasks using given ID's on a list.
|
|
|
|
*
|
|
|
|
* The hash is always changed with the tasklist_lock write-acquired,
|
|
|
|
* and the hash is only accessed with the tasklist_lock at least
|
|
|
|
* read-acquired, so there's no additional SMP locking needed here.
|
|
|
|
*
|
|
|
|
* We have a list of bitmap pages, which bitmaps represent the PID space.
|
|
|
|
* Allocating and freeing PIDs is completely lockless. The worst-case
|
|
|
|
* allocation scenario when all but one out of 1 million PIDs possible are
|
|
|
|
* allocated already: the scanning of 32 list entries and at most PAGE_SIZE
|
|
|
|
* bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
|
2007-10-18 23:40:10 -07:00
|
|
|
*
|
|
|
|
* Pid namespaces:
|
|
|
|
* (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
|
|
|
|
* (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
|
|
|
|
* Many thanks to Oleg Nesterov for comments and help
|
|
|
|
*
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/mm.h>
|
2011-05-23 14:51:41 -04:00
|
|
|
#include <linux/export.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/init.h>
|
2008-05-12 21:21:05 +02:00
|
|
|
#include <linux/rculist.h>
|
2018-10-30 15:09:49 -07:00
|
|
|
#include <linux/memblock.h>
|
2006-12-08 02:37:58 -08:00
|
|
|
#include <linux/pid_namespace.h>
|
2007-05-10 22:23:00 -07:00
|
|
|
#include <linux/init_task.h>
|
2007-10-18 23:40:13 -07:00
|
|
|
#include <linux/syscalls.h>
|
2013-04-12 01:50:06 +01:00
|
|
|
#include <linux/proc_ns.h>
|
2019-07-16 16:30:06 -07:00
|
|
|
#include <linux/refcount.h>
|
2019-05-24 12:43:51 +02:00
|
|
|
#include <linux/anon_inodes.h>
|
|
|
|
#include <linux/sched/signal.h>
|
2017-02-08 18:51:36 +01:00
|
|
|
#include <linux/sched/task.h>
|
2017-11-17 15:30:30 -08:00
|
|
|
#include <linux/idr.h>
|
2024-02-12 16:32:38 +01:00
|
|
|
#include <linux/pidfs.h>
|
2020-06-09 16:21:38 -07:00
|
|
|
#include <net/sock.h>
|
2020-09-02 12:21:27 +02:00
|
|
|
#include <uapi/linux/pidfd.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2018-01-02 15:12:01 +00:00
|
|
|
struct pid init_struct_pid = {
|
2019-07-16 16:30:06 -07:00
|
|
|
.count = REFCOUNT_INIT(1),
|
2018-01-02 15:12:01 +00:00
|
|
|
.tasks = {
|
|
|
|
{ .first = NULL },
|
|
|
|
{ .first = NULL },
|
|
|
|
{ .first = NULL },
|
|
|
|
},
|
|
|
|
.level = 0,
|
|
|
|
.numbers = { {
|
|
|
|
.nr = 0,
|
|
|
|
.ns = &init_pid_ns,
|
|
|
|
}, }
|
|
|
|
};
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
int pid_max = PID_MAX_DEFAULT;
|
|
|
|
|
|
|
|
int pid_max_min = RESERVED_PIDS + 1;
|
|
|
|
int pid_max_max = PID_MAX_LIMIT;
|
2024-02-12 16:32:38 +01:00
|
|
|
/*
|
|
|
|
* Pseudo filesystems start inode numbering after one. We use Reserved
|
|
|
|
* PIDs as a natural offset.
|
|
|
|
*/
|
|
|
|
static u64 pidfs_ino = RESERVED_PIDS;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* PID-map pages start out as NULL, they get allocated upon
|
|
|
|
* first use and are never deallocated. This way a low pid_max
|
|
|
|
* value does not cause lots of bitmaps to be allocated, but
|
|
|
|
* the scheme scales to up to 4 million PIDs, runtime.
|
|
|
|
*/
|
2006-12-08 02:37:58 -08:00
|
|
|
struct pid_namespace init_pid_ns = {
|
2020-08-03 13:16:32 +03:00
|
|
|
.ns.count = REFCOUNT_INIT(2),
|
2018-04-10 16:36:52 -07:00
|
|
|
.idr = IDR_INIT(init_pid_ns.idr),
|
2017-11-17 15:30:34 -08:00
|
|
|
.pid_allocated = PIDNS_ADDING,
|
2007-10-18 23:40:04 -07:00
|
|
|
.level = 0,
|
|
|
|
.child_reaper = &init_task,
|
2012-08-02 04:25:10 -07:00
|
|
|
.user_ns = &init_user_ns,
|
2014-10-31 22:56:04 -04:00
|
|
|
.ns.inum = PROC_PID_INIT_INO,
|
2014-11-01 02:32:53 -04:00
|
|
|
#ifdef CONFIG_PID_NS
|
|
|
|
.ns.ops = &pidns_operations,
|
|
|
|
#endif
|
memfd: replace ratcheting feature from vm.memfd_noexec with hierarchy
This sysctl has the very unusual behaviour of not allowing any user (even
CAP_SYS_ADMIN) to reduce the restriction setting, meaning that if you were
to set this sysctl to a more restrictive option in the host pidns you
would need to reboot your machine in order to reset it.
The justification given in [1] is that this is a security feature and thus
it should not be possible to disable. Aside from the fact that we have
plenty of security-related sysctls that can be disabled after being
enabled (fs.protected_symlinks for instance), the protection provided by
the sysctl is to stop users from being able to create a binary and then
execute it. A user with CAP_SYS_ADMIN can trivially do this without
memfd_create(2):
% cat mount-memfd.c
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/mount.h>
#define SHELLCODE "#!/bin/echo this file was executed from this totally private tmpfs:"
int main(void)
{
int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
assert(fsfd >= 0);
assert(!fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 2));
int dfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
assert(dfd >= 0);
int execfd = openat(dfd, "exe", O_CREAT | O_RDWR | O_CLOEXEC, 0782);
assert(execfd >= 0);
assert(write(execfd, SHELLCODE, strlen(SHELLCODE)) == strlen(SHELLCODE));
assert(!close(execfd));
char *execpath = NULL;
char *argv[] = { "bad-exe", NULL }, *envp[] = { NULL };
execfd = openat(dfd, "exe", O_PATH | O_CLOEXEC);
assert(execfd >= 0);
assert(asprintf(&execpath, "/proc/self/fd/%d", execfd) > 0);
assert(!execve(execpath, argv, envp));
}
% ./mount-memfd
this file was executed from this totally private tmpfs: /proc/self/fd/5
%
Given that it is possible for CAP_SYS_ADMIN users to create executable
binaries without memfd_create(2) and without touching the host filesystem
(not to mention the many other things a CAP_SYS_ADMIN process would be
able to do that would be equivalent or worse), it seems strange to cause a
fair amount of headache to admins when there doesn't appear to be an
actual security benefit to blocking this. There appear to be concerns
about confused-deputy-esque attacks[2] but a confused deputy that can
write to arbitrary sysctls is a bigger security issue than executable
memfds.
/* New API */
The primary requirement from the original author appears to be more based
on the need to be able to restrict an entire system in a hierarchical
manner[3], such that child namespaces cannot re-enable executable memfds.
So, implement that behaviour explicitly -- the vm.memfd_noexec scope is
evaluated up the pidns tree to &init_pid_ns and you have the most
restrictive value applied to you. The new lower limit you can set
vm.memfd_noexec is whatever limit applies to your parent.
Note that a pidns will inherit a copy of the parent pidns's effective
vm.memfd_noexec setting at unshare() time. This matches the existing
behaviour, and it also ensures that a pidns will never have its
vm.memfd_noexec setting *lowered* behind its back (but it will be raised
if the parent raises theirs).
/* Backwards Compatibility */
As the previous version of the sysctl didn't allow you to lower the
setting at all, there are no backwards compatibility issues with this
aspect of the change.
However it should be noted that now that the setting is completely
hierarchical. Previously, a cloned pidns would just copy the current
pidns setting, meaning that if the parent's vm.memfd_noexec was changed it
wouldn't propoagate to existing pid namespaces. Now, the restriction
applies recursively. This is a uAPI change, however:
* The sysctl is very new, having been merged in 6.3.
* Several aspects of the sysctl were broken up until this patchset and
the other patchset by Jeff Xu last month.
And thus it seems incredibly unlikely that any real users would run into
this issue. In the worst case, if this causes userspace isues we could
make it so that modifying the setting follows the hierarchical rules but
the restriction checking uses the cached copy.
[1]: https://lore.kernel.org/CABi2SkWnAgHK1i6iqSqPMYuNEhtHBkO8jUuCvmG3RmUB5TKHJw@mail.gmail.com/
[2]: https://lore.kernel.org/CALmYWFs_dNCzw_pW1yRAo4bGCPEtykroEQaowNULp7svwMLjOg@mail.gmail.com/
[3]: https://lore.kernel.org/CALmYWFuahdUF7cT4cm7_TGLqPanuHXJ-hVSfZt7vpTnc18DPrw@mail.gmail.com/
Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-4-7ff9e3e10ba6@cyphar.com
Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Cc: Dominique Martinet <asmadeus@codewreck.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Verkamp <dverkamp@chromium.org>
Cc: Jeff Xu <jeffxu@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-14 18:41:00 +10:00
|
|
|
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
|
|
|
|
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
|
|
|
|
#endif
|
2006-10-02 02:17:24 -07:00
|
|
|
};
|
2007-10-18 23:40:06 -07:00
|
|
|
EXPORT_SYMBOL_GPL(init_pid_ns);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine with 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an independent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an independent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
/*
|
|
|
|
* Note: disable interrupts while the pidmap_lock is held as an
|
|
|
|
* interrupt might come in and do read_lock(&tasklist_lock).
|
|
|
|
*
|
|
|
|
* If we don't disable interrupts there is a nasty deadlock between
|
|
|
|
* detach_pid()->free_pid() and another cpu that does
|
|
|
|
* spin_lock(&pidmap_lock) followed by an interrupt routine that does
|
|
|
|
* read_lock(&tasklist_lock);
|
|
|
|
*
|
|
|
|
* After we clean up the tasklist_lock and know there are no
|
|
|
|
* irq handlers that take it we can leave the interrupts enabled.
|
|
|
|
* For now it is easier to be safe than to prove it can't happen.
|
|
|
|
*/
|
2006-10-02 02:17:24 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
|
|
|
|
|
2008-02-08 04:19:53 -08:00
|
|
|
void put_pid(struct pid *pid)
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
{
|
2007-10-18 23:39:48 -07:00
|
|
|
struct pid_namespace *ns;
|
|
|
|
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
if (!pid)
|
|
|
|
return;
|
2007-10-18 23:39:48 -07:00
|
|
|
|
2007-10-18 23:40:05 -07:00
|
|
|
ns = pid->numbers[pid->level].ns;
|
2019-07-16 16:30:06 -07:00
|
|
|
if (refcount_dec_and_test(&pid->count)) {
|
2007-10-18 23:39:48 -07:00
|
|
|
kmem_cache_free(ns->pid_cachep, pid);
|
2007-10-18 23:40:09 -07:00
|
|
|
put_pid_ns(ns);
|
2007-10-18 23:40:05 -07:00
|
|
|
}
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
}
|
2006-10-02 02:17:11 -07:00
|
|
|
EXPORT_SYMBOL_GPL(put_pid);
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine with 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an independent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an independent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
|
|
|
|
static void delayed_put_pid(struct rcu_head *rhp)
|
|
|
|
{
|
|
|
|
struct pid *pid = container_of(rhp, struct pid, rcu);
|
|
|
|
put_pid(pid);
|
|
|
|
}
|
|
|
|
|
2008-02-08 04:19:53 -08:00
|
|
|
void free_pid(struct pid *pid)
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
{
|
|
|
|
/* We can be called with write_lock_irq(&tasklist_lock) held */
|
2007-10-18 23:40:05 -07:00
|
|
|
int i;
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&pidmap_lock, flags);
|
2012-08-01 10:33:47 -07:00
|
|
|
for (i = 0; i <= pid->level; i++) {
|
|
|
|
struct upid *upid = pid->numbers + i;
|
2012-08-01 15:03:42 -07:00
|
|
|
struct pid_namespace *ns = upid->ns;
|
2017-11-17 15:30:34 -08:00
|
|
|
switch (--ns->pid_allocated) {
|
2013-08-29 13:56:50 -07:00
|
|
|
case 2:
|
2012-08-01 15:03:42 -07:00
|
|
|
case 1:
|
|
|
|
/* When all that is left in the pid namespace
|
|
|
|
* is the reaper wake up the reaper. The reaper
|
|
|
|
* may be sleeping in zap_pid_ns_processes().
|
|
|
|
*/
|
|
|
|
wake_up_process(ns->child_reaper);
|
|
|
|
break;
|
2017-11-17 15:30:34 -08:00
|
|
|
case PIDNS_ADDING:
|
2013-09-30 13:45:27 -07:00
|
|
|
/* Handle a fork failure of the first process */
|
|
|
|
WARN_ON(ns->child_reaper);
|
2017-11-17 15:30:34 -08:00
|
|
|
ns->pid_allocated = 0;
|
2012-08-01 15:03:42 -07:00
|
|
|
break;
|
2010-07-12 18:50:25 -07:00
|
|
|
}
|
2017-11-17 15:30:30 -08:00
|
|
|
|
|
|
|
idr_remove(&ns->idr, upid->nr);
|
2012-08-01 10:33:47 -07:00
|
|
|
}
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
spin_unlock_irqrestore(&pidmap_lock, flags);
|
|
|
|
|
|
|
|
call_rcu(&pid->rcu, delayed_put_pid);
|
|
|
|
}
|
|
|
|
|
2019-11-15 13:36:20 +01:00
|
|
|
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
|
|
|
size_t set_tid_size)
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine with 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
{
|
|
|
|
struct pid *pid;
|
|
|
|
enum pid_type type;
|
2007-10-18 23:40:05 -07:00
|
|
|
int i, nr;
|
|
|
|
struct pid_namespace *tmp;
|
2007-10-18 23:40:06 -07:00
|
|
|
struct upid *upid;
|
2015-04-16 12:47:38 -07:00
|
|
|
int retval = -ENOMEM;
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
|
2019-11-15 13:36:20 +01:00
|
|
|
/*
|
|
|
|
* set_tid_size contains the size of the set_tid array. Starting at
|
|
|
|
* the most nested currently active PID namespace it tells alloc_pid()
|
|
|
|
* which PID to set for a process in that most nested PID namespace
|
|
|
|
* up to set_tid_size PID namespaces. It does not have to set the PID
|
|
|
|
* for a process in all nested PID namespaces but set_tid_size must
|
|
|
|
* never be greater than the current ns->level + 1.
|
|
|
|
*/
|
|
|
|
if (set_tid_size > ns->level + 1)
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2007-10-18 23:39:48 -07:00
|
|
|
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
if (!pid)
|
2015-04-16 12:47:38 -07:00
|
|
|
return ERR_PTR(retval);
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
|
2007-10-18 23:40:05 -07:00
|
|
|
tmp = ns;
|
2012-08-01 10:33:47 -07:00
|
|
|
pid->level = ns->level;
|
2017-11-17 15:30:30 -08:00
|
|
|
|
2007-10-18 23:40:05 -07:00
|
|
|
for (i = ns->level; i >= 0; i--) {
|
2019-11-15 13:36:20 +01:00
|
|
|
int tid = 0;
|
|
|
|
|
|
|
|
if (set_tid_size) {
|
|
|
|
tid = set_tid[ns->level - i];
|
|
|
|
|
|
|
|
retval = -EINVAL;
|
|
|
|
if (tid < 1 || tid >= pid_max)
|
|
|
|
goto out_free;
|
|
|
|
/*
|
|
|
|
* Also fail if a PID != 1 is requested and
|
|
|
|
* no PID 1 exists.
|
|
|
|
*/
|
|
|
|
if (tid != 1 && !tmp->child_reaper)
|
|
|
|
goto out_free;
|
|
|
|
retval = -EPERM;
|
2020-07-19 12:04:12 +02:00
|
|
|
if (!checkpoint_restore_ns_capable(tmp->user_ns))
|
2019-11-15 13:36:20 +01:00
|
|
|
goto out_free;
|
|
|
|
set_tid_size--;
|
|
|
|
}
|
2017-11-17 15:30:30 -08:00
|
|
|
|
|
|
|
idr_preload(GFP_KERNEL);
|
|
|
|
spin_lock_irq(&pidmap_lock);
|
|
|
|
|
2019-11-15 13:36:20 +01:00
|
|
|
if (tid) {
|
|
|
|
nr = idr_alloc(&tmp->idr, NULL, tid,
|
|
|
|
tid + 1, GFP_ATOMIC);
|
|
|
|
/*
|
|
|
|
* If ENOSPC is returned it means that the PID is
|
|
|
|
* alreay in use. Return EEXIST in that case.
|
|
|
|
*/
|
|
|
|
if (nr == -ENOSPC)
|
|
|
|
nr = -EEXIST;
|
|
|
|
} else {
|
|
|
|
int pid_min = 1;
|
|
|
|
/*
|
|
|
|
* init really needs pid 1, but after reaching the
|
|
|
|
* maximum wrap back to RESERVED_PIDS
|
|
|
|
*/
|
|
|
|
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
|
|
|
|
pid_min = RESERVED_PIDS;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Store a null pointer so find_pid_ns does not find
|
|
|
|
* a partially initialized PID (see below).
|
|
|
|
*/
|
|
|
|
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
|
|
|
|
pid_max, GFP_ATOMIC);
|
|
|
|
}
|
2017-11-17 15:30:30 -08:00
|
|
|
spin_unlock_irq(&pidmap_lock);
|
|
|
|
idr_preload_end();
|
|
|
|
|
remove lots of IS_ERR_VALUE abuses
Most users of IS_ERR_VALUE() in the kernel are wrong, as they
pass an 'int' into a function that takes an 'unsigned long'
argument. This happens to work because the type is sign-extended
on 64-bit architectures before it gets converted into an
unsigned type.
However, anything that passes an 'unsigned short' or 'unsigned int'
argument into IS_ERR_VALUE() is guaranteed to be broken, as are
8-bit integers and types that are wider than 'unsigned long'.
Andrzej Hajda has already fixed a lot of the worst abusers that
were causing actual bugs, but it would be nice to prevent any
users that are not passing 'unsigned long' arguments.
This patch changes all users of IS_ERR_VALUE() that I could find
on 32-bit ARM randconfig builds and x86 allmodconfig. For the
moment, this doesn't change the definition of IS_ERR_VALUE()
because there are probably still architecture specific users
elsewhere.
Almost all the warnings I got are for files that are better off
using 'if (err)' or 'if (err < 0)'.
The only legitimate user I could find that we get a warning for
is the (32-bit only) freescale fman driver, so I did not remove
the IS_ERR_VALUE() there but changed the type to 'unsigned long'.
For 9pfs, I just worked around one user whose calling conventions
are so obscure that I did not dare change the behavior.
I was using this definition for testing:
#define IS_ERR_VALUE(x) ((unsigned long*)NULL == (typeof (x)*)NULL && \
unlikely((unsigned long long)(x) >= (unsigned long long)(typeof(x))-MAX_ERRNO))
which ends up making all 16-bit or wider types work correctly with
the most plausible interpretation of what IS_ERR_VALUE() was supposed
to return according to its users, but also causes a compile-time
warning for any users that do not pass an 'unsigned long' argument.
I suggested this approach earlier this year, but back then we ended
up deciding to just fix the users that are obviously broken. After
the initial warning that caused me to get involved in the discussion
(fs/gfs2/dir.c) showed up again in the mainline kernel, Linus
asked me to send the whole thing again.
[ Updated the 9p parts as per Al Viro - Linus ]
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Andrzej Hajda <a.hajda@samsung.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lkml.org/lkml/2016/1/7/363
Link: https://lkml.org/lkml/2016/5/27/486
Acked-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org> # For nvmem part
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-05-27 23:23:25 +02:00
|
|
|
if (nr < 0) {
|
2018-09-20 12:22:25 -07:00
|
|
|
retval = (nr == -ENOSPC) ? -EAGAIN : nr;
|
2007-10-18 23:40:05 -07:00
|
|
|
goto out_free;
|
2015-04-16 12:47:38 -07:00
|
|
|
}
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
|
2007-10-18 23:40:05 -07:00
|
|
|
pid->numbers[i].nr = nr;
|
|
|
|
pid->numbers[i].ns = tmp;
|
|
|
|
tmp = tmp->parent;
|
|
|
|
}
|
|
|
|
|
2020-03-08 14:29:17 +01:00
|
|
|
/*
|
|
|
|
* ENOMEM is not the most obvious choice especially for the case
|
|
|
|
* where the child subreaper has already exited and the pid
|
|
|
|
* namespace denies the creation of any new processes. But ENOMEM
|
|
|
|
* is what we have exposed to userspace for a long time and it is
|
|
|
|
* documented behavior for pid namespaces. So we can't easily
|
|
|
|
* change it even if there were an error code better suited.
|
|
|
|
*/
|
2020-03-06 11:23:14 -06:00
|
|
|
retval = -ENOMEM;
|
|
|
|
|
2007-10-18 23:40:09 -07:00
|
|
|
get_pid_ns(ns);
|
2019-07-16 16:30:06 -07:00
|
|
|
refcount_set(&pid->count, 1);
|
2020-04-07 09:43:04 -05:00
|
|
|
spin_lock_init(&pid->lock);
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
for (type = 0; type < PIDTYPE_MAX; ++type)
|
|
|
|
INIT_HLIST_HEAD(&pid->tasks[type]);
|
|
|
|
|
2019-04-30 12:21:53 -04:00
|
|
|
init_waitqueue_head(&pid->wait_pidfd);
|
2020-02-19 18:22:26 -06:00
|
|
|
INIT_HLIST_HEAD(&pid->inodes);
|
2019-04-30 12:21:53 -04:00
|
|
|
|
2009-12-15 16:47:40 -08:00
|
|
|
upid = pid->numbers + ns->level;
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
spin_lock_irq(&pidmap_lock);
|
2017-11-17 15:30:34 -08:00
|
|
|
if (!(ns->pid_allocated & PIDNS_ADDING))
|
2010-07-12 18:50:25 -07:00
|
|
|
goto out_unlock;
|
2024-02-19 16:30:57 +01:00
|
|
|
pid->stashed = NULL;
|
2024-02-12 16:32:38 +01:00
|
|
|
pid->ino = ++pidfs_ino;
|
2012-08-01 10:33:47 -07:00
|
|
|
for ( ; upid >= pid->numbers; --upid) {
|
2017-11-17 15:30:30 -08:00
|
|
|
/* Make the PID visible to find_pid_ns. */
|
|
|
|
idr_replace(&upid->ns->idr, pid, upid->nr);
|
2017-11-17 15:30:34 -08:00
|
|
|
upid->ns->pid_allocated++;
|
2012-08-01 10:33:47 -07:00
|
|
|
}
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
spin_unlock_irq(&pidmap_lock);
|
|
|
|
|
|
|
|
return pid;
|
|
|
|
|
2010-07-12 18:50:25 -07:00
|
|
|
out_unlock:
|
2013-02-12 13:46:23 -08:00
|
|
|
spin_unlock_irq(&pidmap_lock);
|
2014-12-10 15:55:25 -08:00
|
|
|
put_pid_ns(ns);
|
|
|
|
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
out_free:
|
2017-11-17 15:30:30 -08:00
|
|
|
spin_lock_irq(&pidmap_lock);
|
2018-12-28 07:22:26 -08:00
|
|
|
while (++i <= ns->level) {
|
|
|
|
upid = pid->numbers + i;
|
|
|
|
idr_remove(&upid->ns->idr, upid->nr);
|
|
|
|
}
|
2017-11-17 15:30:30 -08:00
|
|
|
|
2017-12-22 12:37:43 -06:00
|
|
|
/* On failure to allocate the first pid, reset the state */
|
|
|
|
if (ns->pid_allocated == PIDNS_ADDING)
|
|
|
|
idr_set_cursor(&ns->idr, 0);
|
|
|
|
|
2017-11-17 15:30:30 -08:00
|
|
|
spin_unlock_irq(&pidmap_lock);
|
2007-10-18 23:40:05 -07:00
|
|
|
|
2007-10-18 23:39:48 -07:00
|
|
|
kmem_cache_free(ns->pid_cachep, pid);
|
2015-04-16 12:47:38 -07:00
|
|
|
return ERR_PTR(retval);
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
}
|
|
|
|
|
2012-12-21 20:27:12 -08:00
|
|
|
void disable_pid_allocation(struct pid_namespace *ns)
|
|
|
|
{
|
|
|
|
spin_lock_irq(&pidmap_lock);
|
2017-11-17 15:30:34 -08:00
|
|
|
ns->pid_allocated &= ~PIDNS_ADDING;
|
2012-12-21 20:27:12 -08:00
|
|
|
spin_unlock_irq(&pidmap_lock);
|
|
|
|
}
|
|
|
|
|
2008-02-08 04:19:53 -08:00
|
|
|
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2017-11-17 15:30:34 -08:00
|
|
|
return idr_find(&ns->idr, nr);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2007-10-18 23:40:06 -07:00
|
|
|
EXPORT_SYMBOL_GPL(find_pid_ns);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-18 23:40:19 -07:00
|
|
|
struct pid *find_vpid(int nr)
|
|
|
|
{
|
2010-03-02 14:51:53 -08:00
|
|
|
return find_pid_ns(nr, task_active_pid_ns(current));
|
2007-10-18 23:40:19 -07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(find_vpid);
|
|
|
|
|
2017-09-26 13:06:43 -05:00
|
|
|
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
|
|
|
|
{
|
|
|
|
return (type == PIDTYPE_PID) ?
|
|
|
|
&task->thread_pid :
|
|
|
|
&task->signal->pids[type];
|
|
|
|
}
|
|
|
|
|
2007-05-10 22:22:58 -07:00
|
|
|
/*
|
|
|
|
* attach_pid() must be called with the tasklist_lock write-held.
|
|
|
|
*/
|
2013-07-03 15:08:31 -07:00
|
|
|
void attach_pid(struct task_struct *task, enum pid_type type)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2017-09-26 13:06:43 -05:00
|
|
|
struct pid *pid = *task_pid_ptr(task, type);
|
|
|
|
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2008-04-30 00:54:26 -07:00
|
|
|
static void __change_pid(struct task_struct *task, enum pid_type type,
|
|
|
|
struct pid *new)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2017-09-26 13:06:43 -05:00
|
|
|
struct pid **pid_ptr = task_pid_ptr(task, type);
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
struct pid *pid;
|
|
|
|
int tmp;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2017-09-26 13:06:43 -05:00
|
|
|
pid = *pid_ptr;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2017-09-26 13:06:43 -05:00
|
|
|
hlist_del_rcu(&task->pid_links[type]);
|
|
|
|
*pid_ptr = new;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2024-02-02 14:12:26 +01:00
|
|
|
if (type == PIDTYPE_PID) {
|
|
|
|
WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
|
|
|
|
wake_up_all(&pid->wait_pidfd);
|
|
|
|
}
|
|
|
|
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
|
2019-10-17 12:18:30 +02:00
|
|
|
if (pid_has_task(pid, tmp))
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
return;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
free_pid(pid);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2008-04-30 00:54:26 -07:00
|
|
|
void detach_pid(struct task_struct *task, enum pid_type type)
|
|
|
|
{
|
|
|
|
__change_pid(task, type, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
void change_pid(struct task_struct *task, enum pid_type type,
|
|
|
|
struct pid *pid)
|
|
|
|
{
|
|
|
|
__change_pid(task, type, pid);
|
2013-07-03 15:08:31 -07:00
|
|
|
attach_pid(task, type);
|
2008-04-30 00:54:26 -07:00
|
|
|
}
|
|
|
|
|
2020-04-19 06:35:02 -05:00
|
|
|
void exchange_tids(struct task_struct *left, struct task_struct *right)
|
|
|
|
{
|
|
|
|
struct pid *pid1 = left->thread_pid;
|
|
|
|
struct pid *pid2 = right->thread_pid;
|
|
|
|
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
|
|
|
|
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
|
|
|
|
|
|
|
|
/* Swap the single entry tid lists */
|
|
|
|
hlists_swap_heads_rcu(head1, head2);
|
|
|
|
|
|
|
|
/* Swap the per task_struct pid */
|
|
|
|
rcu_assign_pointer(left->thread_pid, pid2);
|
|
|
|
rcu_assign_pointer(right->thread_pid, pid1);
|
|
|
|
|
|
|
|
/* Swap the cached value */
|
|
|
|
WRITE_ONCE(left->pid, pid_nr(pid2));
|
|
|
|
WRITE_ONCE(right->pid, pid_nr(pid1));
|
|
|
|
}
|
|
|
|
|
2006-09-27 01:51:06 -07:00
|
|
|
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
|
2008-02-08 04:19:53 -08:00
|
|
|
void transfer_pid(struct task_struct *old, struct task_struct *new,
|
2006-09-27 01:51:06 -07:00
|
|
|
enum pid_type type)
|
|
|
|
{
|
2024-02-02 14:12:55 +01:00
|
|
|
WARN_ON_ONCE(type == PIDTYPE_PID);
|
2017-09-26 13:06:43 -05:00
|
|
|
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
|
2006-09-27 01:51:06 -07:00
|
|
|
}
|
|
|
|
|
2008-02-08 04:19:53 -08:00
|
|
|
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
struct task_struct *result = NULL;
|
|
|
|
if (pid) {
|
|
|
|
struct hlist_node *first;
|
2010-02-25 16:55:13 +01:00
|
|
|
first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
|
2010-03-03 07:46:56 -08:00
|
|
|
lockdep_tasklist_lock_is_held());
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
if (first)
|
2017-09-26 13:06:43 -05:00
|
|
|
result = hlist_entry(first, struct task_struct, pid_links[(type)]);
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
2008-02-07 00:13:21 -08:00
|
|
|
EXPORT_SYMBOL(pid_task);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine with 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an independent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an independent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
/*
|
2010-03-05 13:42:56 -08:00
|
|
|
* Must be called under rcu_read_lock().
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
*/
|
2009-06-17 16:27:51 -07:00
|
|
|
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
{
|
2015-06-18 15:50:02 -07:00
|
|
|
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
|
|
|
|
"find_task_by_pid_ns() needs rcu_read_lock() protection");
|
2009-06-17 16:27:51 -07:00
|
|
|
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-18 23:40:16 -07:00
|
|
|
/*
 * Resolve @vnr to a task in the current task's active pid namespace.
 *
 * Thin wrapper around find_task_by_pid_ns(), so the same requirement
 * applies: the caller must hold rcu_read_lock().
 */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
	struct pid_namespace *active_ns = task_active_pid_ns(current);

	return find_task_by_pid_ns(vnr, active_ns);
}
|
|
|
|
|
2018-02-06 15:40:17 -08:00
|
|
|
/*
 * Look up the task for @nr in the caller's active pid namespace and pin it.
 *
 * The lookup runs inside its own RCU read-side section; when a task is
 * found, get_task_struct() is called on it before the section ends.
 * Returns the task, or NULL if the lookup found nothing.
 * NOTE(review): the caller presumably drops the reference with
 * put_task_struct() - confirm against callers.
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_vpid(nr);
	if (tsk != NULL)
		get_task_struct(tsk);
	rcu_read_unlock();

	return tsk;
}
|
|
|
|
|
2006-10-02 02:18:59 -07:00
|
|
|
/*
 * Fetch @task's struct pid of the given @type via get_pid().
 *
 * The slot is located and dereferenced inside an RCU read-side section and
 * the pointer is handed to get_pid(), whose result is returned.
 * NOTE(review): get_pid() presumably takes a reference that the caller must
 * release with put_pid() - confirm.
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
	struct pid **slot;
	struct pid *pid;

	rcu_read_lock();
	slot = task_pid_ptr(task, type);
	pid = get_pid(rcu_dereference(*slot));
	rcu_read_unlock();

	return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);
|
2006-10-02 02:18:59 -07:00
|
|
|
|
2008-02-08 04:19:53 -08:00
|
|
|
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
{
|
|
|
|
struct task_struct *result;
|
|
|
|
rcu_read_lock();
|
|
|
|
result = pid_task(pid, type);
|
|
|
|
if (result)
|
|
|
|
get_task_struct(result);
|
|
|
|
rcu_read_unlock();
|
|
|
|
return result;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2011-02-01 09:51:46 -05:00
|
|
|
EXPORT_SYMBOL_GPL(get_pid_task);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine with 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an independent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an independent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 02:31:42 -08:00
|
|
|
/*
 * find_get_pid - resolve a pid number and take a reference on the result
 * @nr: pid number, looked up through find_vpid()
 *
 * The lookup and get_pid() are done under rcu_read_lock() so the struct pid
 * cannot go away between being found and being pinned.
 *
 * Returns whatever get_pid() hands back for the looked-up struct pid.
 */
struct pid *find_get_pid(pid_t nr)
{
	struct pid *found;

	rcu_read_lock();
	found = find_vpid(nr);
	found = get_pid(found);
	rcu_read_unlock();

	return found;
}
|
2008-07-25 01:48:31 -07:00
|
|
|
EXPORT_SYMBOL_GPL(find_get_pid);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2007-10-18 23:40:06 -07:00
|
|
|
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
|
|
|
|
{
|
|
|
|
struct upid *upid;
|
|
|
|
pid_t nr = 0;
|
|
|
|
|
|
|
|
if (pid && ns->level <= pid->level) {
|
|
|
|
upid = &pid->numbers[ns->level];
|
|
|
|
if (upid->ns == ns)
|
|
|
|
nr = upid->nr;
|
|
|
|
}
|
|
|
|
return nr;
|
|
|
|
}
|
2012-05-24 10:37:59 -06:00
|
|
|
EXPORT_SYMBOL_GPL(pid_nr_ns);
|
2007-10-18 23:40:06 -07:00
|
|
|
|
2008-02-08 04:19:15 -08:00
|
|
|
pid_t pid_vnr(struct pid *pid)
|
|
|
|
{
|
2010-03-02 14:51:53 -08:00
|
|
|
return pid_nr_ns(pid, task_active_pid_ns(current));
|
2008-02-08 04:19:15 -08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(pid_vnr);
|
|
|
|
|
pids: refactor vnr/nr_ns helpers to make them safe
IMHO, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 16:58:38 -07:00
|
|
|
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
|
|
|
|
struct pid_namespace *ns)
|
2007-10-18 23:40:19 -07:00
|
|
|
{
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 16:58:38 -07:00
|
|
|
pid_t nr = 0;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
if (!ns)
|
2010-03-02 14:51:53 -08:00
|
|
|
ns = task_active_pid_ns(current);
|
2020-04-21 12:19:04 +02:00
|
|
|
nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 16:58:38 -07:00
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return nr;
|
2007-10-18 23:40:19 -07:00
|
|
|
}
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-02 16:58:38 -07:00
|
|
|
EXPORT_SYMBOL(__task_pid_nr_ns);
|
2007-10-18 23:40:19 -07:00
|
|
|
|
2009-01-07 18:08:49 -08:00
|
|
|
/*
 * task_active_pid_ns - pid namespace that @tsk's own pid belongs to
 *
 * Fetches the task's struct pid with task_pid() and returns the namespace
 * ns_of_pid() reports for it.
 */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);

	return ns_of_pid(pid);
}
|
|
|
|
EXPORT_SYMBOL_GPL(task_active_pid_ns);
|
|
|
|
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in one system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 512 pids. There
are only 64 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of its thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 02:17:04 -07:00
|
|
|
/*
|
2008-10-16 19:02:37 +02:00
|
|
|
* Used by proc to find the first pid that is greater than or equal to nr.
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 02:17:04 -07:00
|
|
|
*
|
2008-07-25 01:48:36 -07:00
|
|
|
* If there is a pid at nr this function is exactly the same as find_pid_ns.
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 02:17:04 -07:00
|
|
|
*/
|
2007-10-18 23:40:06 -07:00
|
|
|
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 02:17:04 -07:00
|
|
|
{
|
2017-11-17 15:30:30 -08:00
|
|
|
return idr_get_next(&ns->idr, &nr);
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 02:17:04 -07:00
|
|
|
}
|
gfs2: Add glockfd debugfs file
When a process has a gfs2 file open, the file is keeping a reference on the
underlying gfs2 inode, and the inode is keeping the inode's iopen glock held in
shared mode. In other words, the process depends on the iopen glock of each
open gfs2 file. Expose those dependencies in a new "glockfd" debugfs file.
The new debugfs file contains one line for each gfs2 file descriptor,
specifying the tgid, file descriptor number, and glock name, e.g.,
1601 6 5/816d
This list is compiled by iterating all tasks on the system using find_ge_pid(),
and all file descriptors of each task using task_lookup_next_fd_rcu(). To make
that work from gfs2, export those two functions.
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2022-06-08 16:22:55 +02:00
|
|
|
EXPORT_SYMBOL_GPL(find_ge_pid);
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 02:17:04 -07:00
|
|
|
|
2020-10-17 16:14:54 -07:00
|
|
|
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
|
|
|
|
{
|
|
|
|
struct fd f;
|
|
|
|
struct pid *pid;
|
|
|
|
|
|
|
|
f = fdget(fd);
|
|
|
|
if (!f.file)
|
|
|
|
return ERR_PTR(-EBADF);
|
|
|
|
|
|
|
|
pid = pidfd_pid(f.file);
|
|
|
|
if (!IS_ERR(pid)) {
|
|
|
|
get_pid(pid);
|
|
|
|
*flags = f.file->f_flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
fdput(f);
|
|
|
|
return pid;
|
|
|
|
}
|
|
|
|
|
2021-10-11 15:32:44 +02:00
|
|
|
/**
|
|
|
|
* pidfd_get_task() - Get the task associated with a pidfd
|
|
|
|
*
|
|
|
|
* @pidfd: pidfd for which to get the task
|
|
|
|
* @flags: flags associated with this pidfd
|
|
|
|
*
|
|
|
|
* Return the task associated with @pidfd. The function takes a reference on
|
|
|
|
* the returned task. The caller is responsible for releasing that reference.
|
|
|
|
*
|
|
|
|
* Return: On success, the task_struct associated with the pidfd.
|
|
|
|
* On error, a negative errno number will be returned.
|
|
|
|
*/
|
|
|
|
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
|
|
|
|
{
|
|
|
|
unsigned int f_flags;
|
|
|
|
struct pid *pid;
|
|
|
|
struct task_struct *task;
|
|
|
|
|
|
|
|
pid = pidfd_get_pid(pidfd, &f_flags);
|
|
|
|
if (IS_ERR(pid))
|
|
|
|
return ERR_CAST(pid);
|
|
|
|
|
|
|
|
task = get_pid_task(pid, PIDTYPE_TGID);
|
|
|
|
put_pid(pid);
|
|
|
|
if (!task)
|
|
|
|
return ERR_PTR(-ESRCH);
|
|
|
|
|
|
|
|
*flags = f_flags;
|
|
|
|
return task;
|
|
|
|
}
|
|
|
|
|
2019-05-24 12:43:51 +02:00
|
|
|
/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 * The descriptor number and its file come from pidfd_prepare(); once that
 * succeeds the file is published with fd_install().
 *
 * Note, that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
	struct file *file;
	int fd = pidfd_prepare(pid, flags, &file);

	/* A negative value is an errno from pidfd_prepare(); pass it through. */
	if (fd >= 0)
		fd_install(fd, file);

	return fd;
}
|
|
|
|
|
|
|
|
/**
|
2023-09-11 23:08:22 -07:00
|
|
|
* sys_pidfd_open() - Open new pid file descriptor.
|
2019-05-24 12:43:51 +02:00
|
|
|
*
|
|
|
|
* @pid: pid for which to retrieve a pidfd
|
|
|
|
* @flags: flags to pass
|
|
|
|
*
|
|
|
|
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
|
2024-01-31 14:26:02 +01:00
|
|
|
* the task identified by @pid. Without PIDFD_THREAD flag the target task
|
|
|
|
* must be a thread-group leader.
|
2019-05-24 12:43:51 +02:00
|
|
|
*
|
|
|
|
* Return: On success, a cloexec pidfd is returned.
|
|
|
|
* On error, a negative errno number will be returned.
|
|
|
|
*/
|
|
|
|
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
|
|
|
|
{
|
2019-10-17 12:18:32 +02:00
|
|
|
int fd;
|
2019-05-24 12:43:51 +02:00
|
|
|
struct pid *p;
|
|
|
|
|
2024-01-31 14:26:02 +01:00
|
|
|
if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
|
2019-05-24 12:43:51 +02:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (pid <= 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
p = find_get_pid(pid);
|
|
|
|
if (!p)
|
|
|
|
return -ESRCH;
|
|
|
|
|
2021-08-08 15:25:05 +10:00
|
|
|
fd = pidfd_create(p, flags);
|
2019-05-24 12:43:51 +02:00
|
|
|
|
|
|
|
put_pid(p);
|
|
|
|
return fd;
|
|
|
|
}
|
|
|
|
|
2017-11-17 15:30:30 -08:00
|
|
|
/*
 * pid_idr_init() - init-time setup for pid allocation.
 *
 * Scales pid_max and pid_max_min by the number of possible CPUs, logs the
 * resulting values, initialises the idr of init_pid_ns and creates the
 * "pid" slab cache used by init_pid_ns.
 */
void __init pid_idr_init(void)
{
	int scaled_default;
	int scaled_min;

	/* Verify no one has done anything silly: */
	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

	/*
	 * Bump default and minimum pid_max based on number of cpus. The
	 * default is raised to the per-cpu scaled value but capped at
	 * pid_max_max; the minimum is only ever raised.
	 */
	scaled_default = PIDS_PER_CPU_DEFAULT * num_possible_cpus();
	pid_max = min(pid_max_max, max_t(int, pid_max, scaled_default));

	scaled_min = PIDS_PER_CPU_MIN * num_possible_cpus();
	pid_max_min = max_t(int, pid_max_min, scaled_min);

	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

	idr_init(&init_pid_ns.idr);

	/* Objects are sized as a struct pid with one entry in ->numbers. */
	init_pid_ns.pid_cachep =
		kmem_cache_create("pid",
				  struct_size_t(struct pid, numbers, 1),
				  __alignof__(struct pid),
				  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
				  NULL);
}
|
2020-01-07 09:59:25 -08:00
|
|
|
|
|
|
|
static struct file *__pidfd_fget(struct task_struct *task, int fd)
|
|
|
|
{
|
|
|
|
struct file *file;
|
|
|
|
int ret;
|
|
|
|
|
2020-12-03 14:12:00 -06:00
|
|
|
ret = down_read_killable(&task->signal->exec_update_lock);
|
2020-01-07 09:59:25 -08:00
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
|
|
|
if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
|
|
|
|
file = fget_task(task, fd);
|
|
|
|
else
|
|
|
|
file = ERR_PTR(-EPERM);
|
|
|
|
|
2020-12-03 14:12:00 -06:00
|
|
|
up_read(&task->signal->exec_update_lock);
|
2020-01-07 09:59:25 -08:00
|
|
|
|
2024-02-07 10:19:29 +01:00
|
|
|
if (!file) {
|
|
|
|
/*
|
|
|
|
* It is possible that the target thread is exiting; it can be
|
|
|
|
* either:
|
|
|
|
* 1. before exit_signals(), which gives a real fd
|
|
|
|
* 2. before exit_files() takes the task_lock() gives a real fd
|
|
|
|
* 3. after exit_files() releases task_lock(), ->files is NULL;
|
|
|
|
* this has PF_EXITING, since it was set in exit_signals(),
|
|
|
|
* __pidfd_fget() returns EBADF.
|
|
|
|
* In case 3 we get EBADF, but that really means ESRCH, since
|
|
|
|
* the task is currently exiting and has freed its files
|
|
|
|
* struct, so we fix it up.
|
|
|
|
*/
|
|
|
|
if (task->flags & PF_EXITING)
|
|
|
|
file = ERR_PTR(-ESRCH);
|
|
|
|
else
|
|
|
|
file = ERR_PTR(-EBADF);
|
|
|
|
}
|
|
|
|
|
|
|
|
return file;
|
2020-01-07 09:59:25 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int pidfd_getfd(struct pid *pid, int fd)
|
|
|
|
{
|
|
|
|
struct task_struct *task;
|
|
|
|
struct file *file;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
task = get_pid_task(pid, PIDTYPE_PID);
|
|
|
|
if (!task)
|
|
|
|
return -ESRCH;
|
|
|
|
|
|
|
|
file = __pidfd_fget(task, fd);
|
|
|
|
put_task_struct(task);
|
|
|
|
if (IS_ERR(file))
|
|
|
|
return PTR_ERR(file);
|
|
|
|
|
2023-11-30 13:49:11 +01:00
|
|
|
ret = receive_fd(file, NULL, O_CLOEXEC);
|
2020-06-09 16:21:38 -07:00
|
|
|
fput(file);
|
2020-01-07 09:59:25 -08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_pidfd_getfd() - Get a file descriptor from another process
|
|
|
|
*
|
|
|
|
* @pidfd: the pidfd file descriptor of the process
|
|
|
|
* @fd: the file descriptor number to get
|
|
|
|
* @flags: flags on how to get the fd (reserved)
|
|
|
|
*
|
|
|
|
* This syscall gets a copy of a file descriptor from another process
|
|
|
|
* based on the pidfd, and file descriptor number. It requires that
|
|
|
|
* the calling process has the ability to ptrace the process represented
|
|
|
|
* by the pidfd. The process which is having its file descriptor copied
|
|
|
|
* is otherwise unaffected.
|
|
|
|
*
|
|
|
|
* Return: On success, a cloexec file descriptor is returned.
|
|
|
|
* On error, a negative errno number will be returned.
|
|
|
|
*/
|
|
|
|
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
|
|
|
|
unsigned int, flags)
|
|
|
|
{
|
|
|
|
struct pid *pid;
|
|
|
|
struct fd f;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* flags is currently unused - make sure it's unset */
|
|
|
|
if (flags)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
f = fdget(pidfd);
|
|
|
|
if (!f.file)
|
|
|
|
return -EBADF;
|
|
|
|
|
|
|
|
pid = pidfd_pid(f.file);
|
|
|
|
if (IS_ERR(pid))
|
|
|
|
ret = PTR_ERR(pid);
|
|
|
|
else
|
|
|
|
ret = pidfd_getfd(pid, fd);
|
|
|
|
|
|
|
|
fdput(f);
|
|
|
|
return ret;
|
|
|
|
}
|