linux-stable/include/linux/fdtable.h
Al Viro 3812169643 close_range(): fix the logics in descriptor table trimming
commit 678379e1d4 upstream.

Cloning a descriptor table picks the size that would cover all currently
opened files.  That's fine for clone() and unshare(), but for close_range()
there's an additional twist - we clone before we close, and it would be
a shame to have
	close_range(3, ~0U, CLOSE_RANGE_UNSHARE)
leave us with a huge descriptor table when we are not going to keep
anything past stderr, just because some large file descriptor used to
be open before our call has taken it out.

Unfortunately, it had been dealt with in an inherently racy way -
sane_fdtable_size() gets a "don't copy anything past that" argument
(passed via unshare_fd() and dup_fd()), close_range() decides how much
should be trimmed and passes that to unshare_fd().

The problem is, a range that used to extend to the end of descriptor
table back when close_range() had looked at it might very well have stuff
grown after it by the time dup_fd() has allocated a new files_struct
and started to figure out the capacity of fdtable to be attached to that.

That leads to interesting pathological cases; at the very least it's a
QoI issue, since unshare(CLONE_FILES) is atomic in a sense that it takes
a snapshot of descriptor table one might have observed at some point.
Since CLOSE_RANGE_UNSHARE close_range() is supposed to be a combination
of unshare(CLONE_FILES) with plain close_range(), ending up with a
weird state that would never occur with unshare(2) is confusing, to put
it mildly.

It's not hard to get rid of - all it takes is passing both ends of the
range down to sane_fdtable_size().  There we are under ->files_lock,
so the race is trivially avoided.

So we do the following:
	* switch close_files() from calling unshare_fd() to calling
dup_fd().
	* undo the calling convention change done to unshare_fd() in
60997c3d45 "close_range: add CLOSE_RANGE_UNSHARE"
	* introduce struct fd_range, pass a pointer to that to dup_fd()
and sane_fdtable_size() instead of "trim everything past that point"
they are currently getting.  NULL means "we are not going to be punching
any holes"; NR_OPEN_MAX is gone.
	* make sane_fdtable_size() use find_last_bit() instead of
open-coding it; it's easier to follow that way.
	* while we are at it, have dup_fd() report errors by returning
ERR_PTR(), no need to use a separate int *errorp argument.

Fixes: 60997c3d45 "close_range: add CLOSE_RANGE_UNSHARE"
Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2024-10-17 15:22:03 +02:00

135 lines
3.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* descriptor table internals; you almost certainly want file.h instead.
*/
#ifndef __LINUX_FDTABLE_H
#define __LINUX_FDTABLE_H
#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/nospec.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/atomic.h>
/*
* The default fd array needs to be at least BITS_PER_LONG,
* as this is the granularity returned by copy_fdset().
*/
#define NR_OPEN_DEFAULT BITS_PER_LONG
struct fdtable {
unsigned int max_fds;
struct file __rcu **fd; /* current fd array */
unsigned long *close_on_exec;
unsigned long *open_fds;
unsigned long *full_fds_bits;
struct rcu_head rcu;
};
static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)
{
return test_bit(fd, fdt->close_on_exec);
}
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
return test_bit(fd, fdt->open_fds);
}
/*
* Open file table structure
*/
struct files_struct {
/*
* read mostly part
*/
atomic_t count;
bool resize_in_progress;
wait_queue_head_t resize_wait;
struct fdtable __rcu *fdt;
struct fdtable fdtab;
/*
* written part on a separate cache line in SMP
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
unsigned int next_fd;
unsigned long close_on_exec_init[1];
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};
struct file_operations;
struct vfsmount;
struct dentry;
#define rcu_dereference_check_fdtable(files, fdtfd) \
rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))
#define files_fdtable(files) \
rcu_dereference_check_fdtable((files), (files)->fdt)
/*
* The caller must ensure that fd table isn't shared or hold rcu or file lock
*/
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
if (fd < fdt->max_fds) {
fd = array_index_nospec(fd, fdt->max_fds);
return rcu_dereference_raw(fdt->fd[fd]);
}
return NULL;
}
static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
{
RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
"suspicious rcu_dereference_check() usage");
return files_lookup_fd_raw(files, fd);
}
static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
{
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
"suspicious rcu_dereference_check() usage");
return files_lookup_fd_raw(files, fd);
}
static inline struct file *lookup_fd_rcu(unsigned int fd)
{
return files_lookup_fd_rcu(current->files, fd);
}
struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
struct task_struct;
void put_files_struct(struct files_struct *fs);
int unshare_files(void);
struct fd_range {
unsigned int from, to;
};
struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
int (*)(const void *, struct file *, unsigned),
const void *);
extern int close_fd(unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern struct file *close_fd_get_file(unsigned int fd);
extern struct kmem_cache *files_cachep;
#endif /* __LINUX_FDTABLE_H */