Merge branch 'vfs-6.14.misc' into vfs.all

This commit is contained in:
Christian Brauner 2025-01-10 16:17:37 +01:00
commit c9e970fded
No known key found for this signature in database
GPG Key ID: 91C61BC06578DCA2
26 changed files with 373 additions and 115 deletions

View File

@ -12,21 +12,10 @@ returns a list of extents.
Request Basics Request Basics
-------------- --------------
A fiemap request is encoded within struct fiemap:: A fiemap request is encoded within struct fiemap:
struct fiemap {
__u64 fm_start; /* logical offset (inclusive) at
* which to start mapping (in) */
__u64 fm_length; /* logical length of mapping which
* userspace cares about (in) */
__u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
__u32 fm_mapped_extents; /* number of extents that were
* mapped (out) */
__u32 fm_extent_count; /* size of fm_extents array (in) */
__u32 fm_reserved;
struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
};
.. kernel-doc:: include/uapi/linux/fiemap.h
:identifiers: fiemap
fm_start, and fm_length specify the logical range within the file fm_start, and fm_length specify the logical range within the file
which the process would like mappings for. Extents returned mirror which the process would like mappings for. Extents returned mirror
@ -60,6 +49,8 @@ FIEMAP_FLAG_XATTR
If this flag is set, the extents returned will describe the inodes If this flag is set, the extents returned will describe the inodes
extended attribute lookup tree, instead of its data tree. extended attribute lookup tree, instead of its data tree.
FIEMAP_FLAG_CACHE
This flag requests caching of the extents.
Extent Mapping Extent Mapping
-------------- --------------
@ -77,18 +68,10 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST
flag set (see the next section on extent flags). flag set (see the next section on extent flags).
Each extent is described by a single fiemap_extent structure as Each extent is described by a single fiemap_extent structure as
returned in fm_extents:: returned in fm_extents:
struct fiemap_extent { .. kernel-doc:: include/uapi/linux/fiemap.h
__u64 fe_logical; /* logical offset in bytes for the start of :identifiers: fiemap_extent
* the extent */
__u64 fe_physical; /* physical offset in bytes for the start
* of the extent */
__u64 fe_length; /* length in bytes for the extent */
__u64 fe_reserved64[2];
__u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
__u32 fe_reserved[3];
};
All offsets and lengths are in bytes and mirror those on disk. It is valid All offsets and lengths are in bytes and mirror those on disk. It is valid
for an extents logical offset to start before the request or its logical for an extents logical offset to start before the request or its logical
@ -175,6 +158,8 @@ FIEMAP_EXTENT_MERGED
userspace would be highly inefficient, the kernel will try to merge most userspace would be highly inefficient, the kernel will try to merge most
adjacent blocks into 'extents'. adjacent blocks into 'extents'.
FIEMAP_EXTENT_SHARED
This flag is set to indicate that the space of this extent is shared with other files.
VFS -> File System Implementation VFS -> File System Implementation
--------------------------------- ---------------------------------
@ -191,14 +176,10 @@ each discovered extent::
u64 len); u64 len);
->fiemap is passed struct fiemap_extent_info which describes the ->fiemap is passed struct fiemap_extent_info which describes the
fiemap request:: fiemap request:
struct fiemap_extent_info { .. kernel-doc:: include/linux/fiemap.h
unsigned int fi_flags; /* Flags as passed from user */ :identifiers: fiemap_extent_info
unsigned int fi_extents_mapped; /* Number of mapped extents */
unsigned int fi_extents_max; /* Size of fiemap_extent array */
struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
};
It is intended that the file system should not need to access any of this It is intended that the file system should not need to access any of this
structure directly. Filesystem handlers should be tolerant to signals and return structure directly. Filesystem handlers should be tolerant to signals and return

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config VBOXGUEST config VBOXGUEST
tristate "Virtual Box Guest integration support" tristate "Virtual Box Guest integration support"
depends on X86 && PCI && INPUT depends on (ARM64 || X86) && PCI && INPUT
help help
This is a driver for the Virtual Box Guest PCI device used in This is a driver for the Virtual Box Guest PCI device used in
Virtual Box virtual machines. Enabling this driver will add Virtual Box virtual machines. Enabling this driver will add

View File

@ -5006,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (IS_ENCRYPTED(inode)) { if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations; inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) { } else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_op = &ext4_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size, nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1); sizeof(ei->i_data) - 1);
inode_set_cached_link(inode, (char *)ei->i_data,
inode->i_size);
} else { } else {
inode->i_op = &ext4_symlink_inode_operations; inode->i_op = &ext4_symlink_inode_operations;
} }

View File

@ -3418,7 +3418,6 @@ retry:
inode->i_op = &ext4_symlink_inode_operations; inode->i_op = &ext4_symlink_inode_operations;
} else { } else {
inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_op = &ext4_fast_symlink_inode_operations;
inode->i_link = (char *)&EXT4_I(inode)->i_data;
} }
} }
@ -3434,6 +3433,9 @@ retry:
disk_link.len); disk_link.len);
inode->i_size = disk_link.len - 1; inode->i_size = disk_link.len - 1;
EXT4_I(inode)->i_disksize = inode->i_size; EXT4_I(inode)->i_disksize = inode->i_size;
if (!IS_ENCRYPTED(inode))
inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data,
inode->i_size);
} }
err = ext4_add_nondir(handle, dentry, &inode); err = ext4_add_nondir(handle, dentry, &inode);
if (handle) if (handle)

View File

@ -279,10 +279,6 @@ repeat:
if (nr < fdt->max_fds) if (nr < fdt->max_fds)
return 0; return 0;
/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;
if (unlikely(files->resize_in_progress)) { if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
wait_event(files->resize_wait, !files->resize_in_progress); wait_event(files->resize_wait, !files->resize_in_progress);
@ -290,6 +286,10 @@ repeat:
goto repeat; goto repeat;
} }
/* Can we expand? */
if (unlikely(nr >= sysctl_nr_open))
return -EMFILE;
/* All good, so we try */ /* All good, so we try */
files->resize_in_progress = true; files->resize_in_progress = true;
error = expand_fdtable(files, nr); error = expand_fdtable(files, nr);
@ -1231,17 +1231,9 @@ __releases(&files->file_lock)
/* /*
* We need to detect attempts to do dup2() over allocated but still * We need to detect attempts to do dup2() over allocated but still
* not finished descriptor. NB: OpenBSD avoids that at the price of * not finished descriptor.
* extra work in their equivalent of fget() - they insert struct *
* file immediately after grabbing descriptor, mark it larval if * POSIX is silent on the issue, we return -EBUSY.
* more work (e.g. actual opening) is needed and make sure that
* fget() treats larval files as absent. Potentially interesting,
* but while extra work in fget() is trivial, locking implications
* and amount of surgery on open()-related paths in VFS are not.
* FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
* deadlocks in rather amusing ways, AFAICS. All of that is out of
* scope of POSIX or SUS, since neither considers shared descriptor
* tables and this condition does not arise without those.
*/ */
fdt = files_fdtable(files); fdt = files_fdtable(files);
fd = array_index_nospec(fd, fdt->max_fds); fd = array_index_nospec(fd, fdt->max_fds);

View File

@ -128,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = {
.data = &sysctl_nr_open, .data = &sysctl_nr_open,
.maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_douintvec_minmax,
.extra1 = &sysctl_nr_open_min, .extra1 = &sysctl_nr_open_min,
.extra2 = &sysctl_nr_open_max, .extra2 = &sysctl_nr_open_max,
}, },
@ -478,6 +478,8 @@ static void ____fput(struct callback_head *work)
__fput(container_of(work, struct file, f_task_work)); __fput(container_of(work, struct file, f_task_work));
} }
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
/* /*
* If kernel thread really needs to have the final fput() it has done * If kernel thread really needs to have the final fput() it has done
* to complete, call this. The only user right now is the boot - we * to complete, call this. The only user right now is the boot - we
@ -491,11 +493,10 @@ static void ____fput(struct callback_head *work)
void flush_delayed_fput(void) void flush_delayed_fput(void)
{ {
delayed_fput(NULL); delayed_fput(NULL);
flush_delayed_work(&delayed_fput_work);
} }
EXPORT_SYMBOL_GPL(flush_delayed_fput); EXPORT_SYMBOL_GPL(flush_delayed_fput);
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file) void fput(struct file *file)
{ {
if (file_ref_put(&file->f_ref)) { if (file_ref_put(&file->f_ref)) {

View File

@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc)
if (log) { if (log) {
if (refcount_dec_and_test(&log->usage)) { if (refcount_dec_and_test(&log->usage)) {
fc->log.log = NULL; fc->log.log = NULL;
for (i = 0; i <= 7; i++) for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++)
if (log->need_free & (1 << i)) if (log->need_free & (1 << i))
kfree(log->buffer[i]); kfree(log->buffer[i]);
kfree(log); kfree(log);

View File

@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
getname(newname), 0); getname(newname), 0);
} }
int readlink_copy(char __user *buffer, int buflen, const char *link) int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
{ {
int len = PTR_ERR(link); int copylen;
if (IS_ERR(link))
goto out;
len = strlen(link); copylen = linklen;
if (len > (unsigned) buflen) if (unlikely(copylen > (unsigned) buflen))
len = buflen; copylen = buflen;
if (copy_to_user(buffer, link, len)) if (copy_to_user(buffer, link, copylen))
len = -EFAULT; copylen = -EFAULT;
out: return copylen;
return len;
} }
/** /**
@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
const char *link; const char *link;
int res; int res;
if (inode->i_opflags & IOP_CACHED_LINK)
return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);
if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
if (unlikely(inode->i_op->readlink)) if (unlikely(inode->i_op->readlink))
return inode->i_op->readlink(dentry, buffer, buflen); return inode->i_op->readlink(dentry, buffer, buflen);
@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
if (IS_ERR(link)) if (IS_ERR(link))
return PTR_ERR(link); return PTR_ERR(link);
} }
res = readlink_copy(buffer, buflen, link); res = readlink_copy(buffer, buflen, link, strlen(link));
do_delayed_call(&done); do_delayed_call(&done);
return res; return res;
} }
@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link);
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{ {
const char *link;
int res;
DEFINE_DELAYED_CALL(done); DEFINE_DELAYED_CALL(done);
int res = readlink_copy(buffer, buflen, link = page_get_link(dentry, d_inode(dentry), &done);
page_get_link(dentry, d_inode(dentry), res = PTR_ERR(link);
&done)); if (!IS_ERR(link))
res = readlink_copy(buffer, buflen, link, strlen(link));
do_delayed_call(&done); do_delayed_call(&done);
return res; return res;
} }

View File

@ -253,7 +253,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
size_t total_len = iov_iter_count(to); size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp; struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data; struct pipe_inode_info *pipe = filp->private_data;
bool was_full, wake_next_reader = false; bool wake_writer = false, wake_next_reader = false;
ssize_t ret; ssize_t ret;
/* Null read succeeds. */ /* Null read succeeds. */
@ -264,14 +264,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
mutex_lock(&pipe->mutex); mutex_lock(&pipe->mutex);
/* /*
* We only wake up writers if the pipe was full when we started * We only wake up writers if the pipe was full when we started reading
* reading in order to avoid unnecessary wakeups. * and it is no longer full after reading to avoid unnecessary wakeups.
* *
* But when we do wake up writers, we do so using a sync wakeup * But when we do wake up writers, we do so using a sync wakeup
* (WF_SYNC), because we want them to get going and generate more * (WF_SYNC), because we want them to get going and generate more
* data for us. * data for us.
*/ */
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
for (;;) { for (;;) {
/* Read ->head with a barrier vs post_one_notification() */ /* Read ->head with a barrier vs post_one_notification() */
unsigned int head = smp_load_acquire(&pipe->head); unsigned int head = smp_load_acquire(&pipe->head);
@ -340,8 +339,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
buf->len = 0; buf->len = 0;
} }
if (!buf->len) if (!buf->len) {
wake_writer |= pipe_full(head, tail, pipe->max_usage);
tail = pipe_update_tail(pipe, buf, tail); tail = pipe_update_tail(pipe, buf, tail);
}
total_len -= chars; total_len -= chars;
if (!total_len) if (!total_len)
break; /* common path: read succeeded */ break; /* common path: read succeeded */
@ -377,7 +378,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
* _very_ unlikely case that the pipe was full, but we got * _very_ unlikely case that the pipe was full, but we got
* no data. * no data.
*/ */
if (unlikely(was_full)) if (unlikely(wake_writer))
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
@ -390,15 +391,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS; return -ERESTARTSYS;
mutex_lock(&pipe->mutex); wake_writer = false;
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
wake_next_reader = true; wake_next_reader = true;
mutex_lock(&pipe->mutex);
} }
if (pipe_empty(pipe->head, pipe->tail)) if (pipe_empty(pipe->head, pipe->tail))
wake_next_reader = false; wake_next_reader = false;
mutex_unlock(&pipe->mutex); mutex_unlock(&pipe->mutex);
if (was_full) if (wake_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (wake_next_reader) if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);

View File

@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list)
continue; continue;
} else if (child->mnt.mnt_flags & MNT_UMOUNT) { } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
/* /*
* We have come accross an partially unmounted * We have come across a partially unmounted
* mount in list that has not been visited yet. * mount in a list that has not been visited
* Remember it has been visited and continue * yet. Remember it has been visited and
* about our merry way. * continue about our merry way.
*/ */
list_add_tail(&child->mnt_umounting, &visited); list_add_tail(&child->mnt_umounting, &visited);
continue; continue;

View File

@ -500,7 +500,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
* a program is not able to use ptrace(2) in that case. It is * a program is not able to use ptrace(2) in that case. It is
* safe because the task has stopped executing permanently. * safe because the task has stopped executing permanently.
*/ */
if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) {
if (try_get_task_stack(task)) { if (try_get_task_stack(task)) {
eip = KSTK_EIP(task); eip = KSTK_EIP(task);
esp = KSTK_ESP(task); esp = KSTK_ESP(task);

View File

@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops); res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0) if (res >= 0)
res = readlink_copy(buffer, buflen, name); res = readlink_copy(buffer, buflen, name, strlen(name));
} }
put_task_struct(task); put_task_struct(task);
return res; return res;

View File

@ -1,6 +1,6 @@
config VBOXSF_FS config VBOXSF_FS
tristate "VirtualBox guest shared folder (vboxsf) support" tristate "VirtualBox guest shared folder (vboxsf) support"
depends on X86 && VBOXGUEST depends on (ARM64 || X86) && VBOXGUEST
select NLS select NLS
help help
VirtualBox hosts can share folders with guests, this driver VirtualBox hosts can share folders with guests, this driver

View File

@ -5,12 +5,18 @@
#include <uapi/linux/fiemap.h> #include <uapi/linux/fiemap.h>
#include <linux/fs.h> #include <linux/fs.h>
/**
* struct fiemap_extent_info - fiemap request to a filesystem
* @fi_flags: Flags as passed from user
* @fi_extents_mapped: Number of mapped extents
* @fi_extents_max: Size of fiemap_extent array
* @fi_extents_start: Start of fiemap_extent array
*/
struct fiemap_extent_info { struct fiemap_extent_info {
unsigned int fi_flags; /* Flags as passed from user */ unsigned int fi_flags;
unsigned int fi_extents_mapped; /* Number of mapped extents */ unsigned int fi_extents_mapped;
unsigned int fi_extents_max; /* Size of fiemap_extent array */ unsigned int fi_extents_max;
struct fiemap_extent __user *fi_extents_start; /* Start of struct fiemap_extent __user *fi_extents_start;
fiemap_extent array */
}; };
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,

View File

@ -322,6 +322,7 @@ struct readahead_control;
#define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_NOWAIT (__force int) RWF_NOWAIT
#define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_APPEND (__force int) RWF_APPEND
#define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_ATOMIC (__force int) RWF_ATOMIC
#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE
/* non-RWF related bits - start at 16 */ /* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD (1 << 16) #define IOCB_EVENTFD (1 << 16)
@ -357,6 +358,7 @@ struct readahead_control;
{ IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_NOWAIT, "NOWAIT" }, \
{ IOCB_APPEND, "APPEND" }, \ { IOCB_APPEND, "APPEND" }, \
{ IOCB_ATOMIC, "ATOMIC" }, \ { IOCB_ATOMIC, "ATOMIC" }, \
{ IOCB_DONTCACHE, "DONTCACHE" }, \
{ IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_EVENTFD, "EVENTFD"}, \
{ IOCB_DIRECT, "DIRECT" }, \ { IOCB_DIRECT, "DIRECT" }, \
{ IOCB_WRITE, "WRITE" }, \ { IOCB_WRITE, "WRITE" }, \
@ -626,6 +628,7 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_XATTR 0x0008 #define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010 #define IOP_DEFAULT_READLINK 0x0010
#define IOP_MGTIME 0x0020 #define IOP_MGTIME 0x0020
#define IOP_CACHED_LINK 0x0040
/* /*
* Keep mostly read-only and often accessed (especially for * Keep mostly read-only and often accessed (especially for
@ -723,7 +726,10 @@ struct inode {
}; };
struct file_lock_context *i_flctx; struct file_lock_context *i_flctx;
struct address_space i_data; struct address_space i_data;
union {
struct list_head i_devices; struct list_head i_devices;
int i_linklen;
};
union { union {
struct pipe_inode_info *i_pipe; struct pipe_inode_info *i_pipe;
struct cdev *i_cdev; struct cdev *i_cdev;
@ -749,6 +755,13 @@ struct inode {
void *i_private; /* fs or device private pointer */ void *i_private; /* fs or device private pointer */
} __randomize_layout; } __randomize_layout;
/*
 * inode_set_cached_link - record a symlink target and its length on an inode
 * @inode:   inode to update
 * @link:    pointer stored in inode->i_link
 * @linklen: value stored in inode->i_linklen
 *
 * Stores @link and @linklen on the inode and then sets IOP_CACHED_LINK in
 * inode->i_opflags. vfs_readlink() tests that flag and, when it is set,
 * copies inode->i_link / inode->i_linklen straight to userspace.
 *
 * i_linklen lives in a union with i_devices, so this is only usable on
 * inodes that do not need i_devices.
 *
 * NOTE(review): callers visible here pass i_size or "len - 1", i.e. the
 * length apparently excludes the NUL terminator - confirm before relying
 * on it.
 */
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
inode->i_link = link;
inode->i_linklen = linklen;
inode->i_opflags |= IOP_CACHED_LINK;
}
/* /*
* Get bit address from inode->i_state to use with wait_var_event() * Get bit address from inode->i_state to use with wait_var_event()
* infrastructre. * infrastructre.
@ -2127,6 +2140,8 @@ struct file_operations {
#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5))
/* Supports asynchronous lock callbacks */ /* Supports asynchronous lock callbacks */
#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* File system supports uncached read/write buffered IO */
#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7))
/* Wrap a directory iterator that needs exclusive inode access */ /* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *, int wrap_directory_iterator(struct file *, struct dir_context *,
@ -3351,7 +3366,7 @@ extern const struct file_operations generic_ro_fops;
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
extern int readlink_copy(char __user *, int, const char *); extern int readlink_copy(char __user *, int, const char *, int);
extern int page_readlink(struct dentry *, char __user *, int); extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link(struct dentry *, struct inode *, extern const char *page_get_link(struct dentry *, struct inode *,
struct delayed_call *); struct delayed_call *);
@ -3614,6 +3629,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
if (flags & RWF_DONTCACHE) {
/* file system must support it */
if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE))
return -EOPNOTSUPP;
/* DAX mappings not supported */
if (IS_DAX(ki->ki_filp->f_mapping->host))
return -EOPNOTSUPP;
}
kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
if (flags & RWF_SYNC) if (flags & RWF_SYNC)
kiocb_flags |= IOCB_DSYNC; kiocb_flags |= IOCB_DSYNC;

View File

@ -75,7 +75,7 @@ struct vfsmount {
static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt)
{ {
/* Pairs with smp_store_release() in do_idmap_mount(). */ /* Pairs with smp_store_release() in do_idmap_mount(). */
return smp_load_acquire(&mnt->mnt_idmap); return READ_ONCE(mnt->mnt_idmap);
} }
extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write(struct vfsmount *mnt);

View File

@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
({ \ ({ \
unsigned __seq; \ unsigned __seq; \
\ \
while ((__seq = seqprop_sequence(s)) & 1) \ while (unlikely((__seq = seqprop_sequence(s)) & 1)) \
cpu_relax(); \ cpu_relax(); \
\ \
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \

View File

@ -14,37 +14,56 @@
#include <linux/types.h> #include <linux/types.h>
/**
* struct fiemap_extent - description of one fiemap extent
* @fe_logical: byte offset of the extent in the file
* @fe_physical: byte offset of extent on disk
* @fe_length: length in bytes for this extent
* @fe_flags: FIEMAP_EXTENT_* flags for this extent
*/
struct fiemap_extent { struct fiemap_extent {
__u64 fe_logical; /* logical offset in bytes for the start of __u64 fe_logical;
* the extent from the beginning of the file */ __u64 fe_physical;
__u64 fe_physical; /* physical offset in bytes for the start __u64 fe_length;
* of the extent from the beginning of the disk */ /* private: */
__u64 fe_length; /* length in bytes for this extent */
__u64 fe_reserved64[2]; __u64 fe_reserved64[2];
__u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ /* public: */
__u32 fe_flags;
/* private: */
__u32 fe_reserved[3]; __u32 fe_reserved[3];
}; };
/**
* struct fiemap - file extent mappings
* @fm_start: byte offset (inclusive) at which to start mapping (in)
* @fm_length: logical length of mapping which userspace wants (in)
* @fm_flags: FIEMAP_FLAG_* flags for request (in/out)
* @fm_mapped_extents: number of extents that were mapped (out)
* @fm_extent_count: size of fm_extents array (in)
* @fm_extents: array of mapped extents (out)
*/
struct fiemap { struct fiemap {
__u64 fm_start; /* logical offset (inclusive) at __u64 fm_start;
* which to start mapping (in) */ __u64 fm_length;
__u64 fm_length; /* logical length of mapping which __u32 fm_flags;
* userspace wants (in) */ __u32 fm_mapped_extents;
__u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ __u32 fm_extent_count;
__u32 fm_mapped_extents;/* number of extents that were mapped (out) */ /* private: */
__u32 fm_extent_count; /* size of fm_extents array (in) */
__u32 fm_reserved; __u32 fm_reserved;
struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ /* public: */
struct fiemap_extent fm_extents[];
}; };
#define FIEMAP_MAX_OFFSET (~0ULL) #define FIEMAP_MAX_OFFSET (~0ULL)
/* flags used in fm_flags: */
#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
#define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ #define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */
#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
/* flags used in fe_flags: */
#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ #define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ #define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. #define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.

View File

@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t;
/* Atomic Write */ /* Atomic Write */
#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040)
/* buffered IO that drops the cache after reading or writing data */
#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080)
/* mask of flags supported by the kernel */ /* mask of flags supported by the kernel */
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\
RWF_DONTCACHE)
#define PROCFS_IOCTL_MAGIC 'f' #define PROCFS_IOCTL_MAGIC 'f'

View File

@ -71,7 +71,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
bit /= WATCH_QUEUE_NOTE_SIZE; bit /= WATCH_QUEUE_NOTE_SIZE;
page = buf->page; page = buf->page;
bit += page->index; bit += page->private;
set_bit(bit, wqueue->notes_bitmap); set_bit(bit, wqueue->notes_bitmap);
generic_pipe_buf_release(pipe, buf); generic_pipe_buf_release(pipe, buf);
@ -278,7 +278,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
pages[i] = alloc_page(GFP_KERNEL); pages[i] = alloc_page(GFP_KERNEL);
if (!pages[i]) if (!pages[i])
goto error_p; goto error_p;
pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE;
} }
bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);

View File

@ -3917,6 +3917,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
int len; int len;
struct inode *inode; struct inode *inode;
struct folio *folio; struct folio *folio;
char *link;
len = strlen(symname) + 1; len = strlen(symname) + 1;
if (len > PAGE_SIZE) if (len > PAGE_SIZE)
@ -3938,12 +3939,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_size = len-1; inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) { if (len <= SHORT_SYMLINK_LEN) {
inode->i_link = kmemdup(symname, len, GFP_KERNEL); link = kmemdup(symname, len, GFP_KERNEL);
if (!inode->i_link) { if (!link) {
error = -ENOMEM; error = -ENOMEM;
goto out_remove_offset; goto out_remove_offset;
} }
inode->i_op = &shmem_short_symlink_operations; inode->i_op = &shmem_short_symlink_operations;
inode_set_cached_link(inode, link, len - 1);
} else { } else {
inode_nohighmem(inode); inode_nohighmem(inode);
inode->i_mapping->a_ops = &shmem_aops; inode->i_mapping->a_ops = &shmem_aops;

View File

@ -2612,7 +2612,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer,
res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME, res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME,
d_inode(dentry)->i_ino); d_inode(dentry)->i_ino);
if (res > 0 && res < sizeof(name)) if (res > 0 && res < sizeof(name))
res = readlink_copy(buffer, buflen, name); res = readlink_copy(buffer, buflen, name, strlen(name));
else else
res = -ENOENT; res = -ENOENT;

View File

@ -0,0 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS = $(KHDR_INCLUDES)
TEST_GEN_PROGS := stackdump_test
TEST_FILES := stackdump
include ../lib.mk

View File

@ -0,0 +1,50 @@
coredump selftest
=================
Background context
------------------
`coredump` is a feature which dumps a process's memory space when the process terminates
unexpectedly (e.g. due to segmentation fault), which can be useful for debugging. By default,
`coredump` dumps the memory to the file named `core`, but this behavior can be changed by writing a
different file name to `/proc/sys/kernel/core_pattern`. Furthermore, `coredump` can be piped to a
user-space program by writing the pipe symbol (`|`) followed by the command to be executed to
`/proc/sys/kernel/core_pattern`. For the full description, see `man 5 core`.
The piped user program may be interested in reading the stack pointers of the crashed process. The
crashed process's stack pointers can be read from `procfs`: it is the `kstkesp` field in
`/proc/$PID/stat`. See `man 5 proc` for all the details.
The problem
-----------
While a thread is active, the stack pointer is unsafe to read and therefore the `kstkesp` field
reads zero. But when the thread is dead (e.g. during a coredump), this field should have a valid
value.
However, this was broken in the past and `kstkesp` was zero even during coredump:
* commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") changed kstkesp to
always be zero
* commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping") fixed it for the
coredumping thread. However, other threads in a coredumping process still had the problem.
* commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads") fixed
it for all threads in a coredumping process.
* commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") broke it again
for the other threads in a coredumping process.
The problem has been fixed now, but considering the history, it may appear again in the future.
The goal of this test
---------------------
This test detects problems with reading `kstkesp` during coredump by doing the following:
#. Tell the kernel to execute the "stackdump" script when a coredump happens. This script
reads the stack pointers of all threads of crashed processes.
#. Spawn a child process who creates some threads and then crashes.
#. Read the output from the "stackdump" script, and make sure all stack pointer values are
non-zero.

View File

@ -0,0 +1,14 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# Usage: stackdump <pid> <output-file>
#
# For every thread of process <pid>, append field 29 of the thread's
# /proc/<tid>/stat line to a temporary file, then move that file to
# <output-file>.
#
# All expansions are quoted so that paths or values containing whitespace or
# glob characters are not word-split or expanded by the shell.

CRASH_PROGRAM_ID=$1
STACKDUMP_FILE=$2

TMP=$(mktemp)

# The trailing "*" is deliberately left unquoted so it still globs the tasks.
for t in /proc/"$CRASH_PROGRAM_ID"/task/*; do
	tid=$(basename "$t")
	# awk reads the stat file directly; no separate cat process is needed.
	awk '{print $29}' "/proc/$tid/stat" >> "$TMP"
done

# The output only appears under its final name once every thread has been
# processed (presumably so a reader polling for the file never sees a
# partial result).
mv "$TMP" "$STACKDUMP_FILE"

View File

@ -0,0 +1,151 @@
// SPDX-License-Identifier: GPL-2.0
#include <fcntl.h>
#include <libgen.h>
#include <linux/limits.h>
#include <pthread.h>
#include <string.h>
#include <sys/resource.h>
#include <unistd.h>
#include "../kselftest_harness.h"
#define STACKDUMP_FILE "stack_values"
#define STACKDUMP_SCRIPT "stackdump"
#define NUM_THREAD_SPAWN 128
/*
 * Thread start routine used by crashing_child(): the thread does no work
 * and simply blocks in pause() forever, so the function never returns.
 *
 * The parameter is named and explicitly ignored because omitting the name
 * of a parameter in a function definition is only valid C starting with
 * C23; older compilers reject it or warn about it.
 */
static void *do_nothing(void *arg)
{
	(void)arg;

	while (1)
		pause();
}
/*
 * Body of the forked child process: start NUM_THREAD_SPAWN threads that
 * idle in do_nothing(), then crash the whole process with an invalid
 * memory access.
 *
 * The return value of pthread_create() is not checked here.
 */
static void crashing_child(void)
{
	pthread_t thread;
	int i;

	for (i = 0; i < NUM_THREAD_SPAWN; ++i)
		pthread_create(&thread, NULL, do_nothing, NULL);

	/*
	 * Crash on purpose. The read goes through a volatile pointer: a plain
	 * NULL dereference whose result is never used is undefined behaviour
	 * that an optimizing compiler is allowed to drop, in which case the
	 * child would not crash at all.
	 */
	i = *(volatile int *)NULL;
}
/*
 * Per-test fixture data: holds the contents of
 * /proc/sys/kernel/core_pattern as read by the fixture setup, so that the
 * fixture teardown can write the same value back.
 */
FIXTURE(coredump)
{
char original_core_pattern[256];
};
/*
 * Fixture setup: read the current /proc/sys/kernel/core_pattern into
 * self->original_core_pattern (NUL-terminated) so that the teardown can
 * restore it after the test has overwritten it.
 */
FIXTURE_SETUP(coredump)
{
	FILE *file;
	int ret;

	file = fopen("/proc/sys/kernel/core_pattern", "r");
	ASSERT_NE(NULL, file);

	ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file);
	/* A zero-byte read is only accepted when the stream hit end-of-file. */
	ASSERT_TRUE(ret || feof(file));
	/*
	 * Require at least one spare byte for the terminator written below; a
	 * completely filled buffer may also mean the pattern was truncated.
	 */
	ASSERT_LT(ret, sizeof(self->original_core_pattern));

	self->original_core_pattern[ret] = '\0';

	ret = fclose(file);
	ASSERT_EQ(0, ret);
}
/*
 * Fixture teardown: remove the stackdump output file and write the saved
 * core_pattern back to /proc/sys/kernel/core_pattern.
 *
 * Failures are reported on stderr instead of through the harness
 * assertions; the open stream is closed on the write-failure path so the
 * handle is not leaked.
 */
FIXTURE_TEARDOWN(coredump)
{
	const char *reason;
	FILE *file;
	int ret;

	/* The result is ignored: the file may not exist. */
	unlink(STACKDUMP_FILE);

	file = fopen("/proc/sys/kernel/core_pattern", "w");
	if (!file) {
		reason = "Unable to open core_pattern";
		goto fail;
	}

	ret = fprintf(file, "%s", self->original_core_pattern);
	if (ret < 0) {
		reason = "Unable to write to core_pattern";
		/* Best-effort close; the write failure is what gets reported. */
		fclose(file);
		goto fail;
	}

	ret = fclose(file);
	if (ret) {
		reason = "Unable to close core_pattern";
		goto fail;
	}

	return;
fail:
	/* This should never happen */
	fprintf(stderr, "Failed to cleanup stackdump test: %s\n", reason);
}
/*
 * Check that the value the stackdump script reads from /proc/<tid>/stat is
 * non-zero for every thread of a process that is dumping core.
 *
 * The test points core_pattern at the stackdump script located next to
 * this executable, forks a child that spawns NUM_THREAD_SPAWN threads and
 * crashes, waits for the script's output file to appear, and then expects
 * exactly 1 + NUM_THREAD_SPAWN non-zero values in it.
 */
TEST_F(coredump, stackdump)
{
	unsigned long long stack;
	/*
	 * getline() requires *lineptr to be NULL or a malloc()ed buffer, so
	 * both the pointer and its size must start out zeroed.
	 */
	char *test_dir, *line = NULL;
	size_t line_length = 0;
	char buf[PATH_MAX];
	int ret, i;
	FILE *file;
	pid_t pid;

	/*
	 * Step 1: Setup core_pattern so that the stackdump script is executed when the child
	 * process crashes
	 */
	ret = readlink("/proc/self/exe", buf, sizeof(buf));
	ASSERT_NE(-1, ret);
	/* readlink() does not NUL-terminate; make sure there is room to do it. */
	ASSERT_LT(ret, sizeof(buf));
	buf[ret] = '\0';

	test_dir = dirname(buf);

	file = fopen("/proc/sys/kernel/core_pattern", "w");
	ASSERT_NE(NULL, file);

	/* "%%P" is written as a literal "%P" for the kernel to expand. */
	ret = fprintf(file, "|%1$s/%2$s %%P %1$s/%3$s", test_dir, STACKDUMP_SCRIPT, STACKDUMP_FILE);
	ASSERT_LT(0, ret);

	ret = fclose(file);
	ASSERT_EQ(0, ret);

	/* Step 2: Create a process who spawns some threads then crashes */
	pid = fork();
	ASSERT_TRUE(pid >= 0);
	if (pid == 0)
		crashing_child();

	/*
	 * Step 3: Wait for the stackdump script to write the stack pointers to the stackdump file
	 */
	for (i = 0; i < 10; ++i) {
		file = fopen(STACKDUMP_FILE, "r");
		if (file)
			break;
		sleep(1);
	}
	ASSERT_NE(file, NULL);

	/* Step 4: Make sure all stack pointer values are non-zero */
	for (i = 0; -1 != getline(&line, &line_length, file); ++i) {
		stack = strtoull(line, NULL, 10);
		ASSERT_NE(stack, 0);
	}
	/* getline() allocated the buffer; release it once the loop is done. */
	free(line);

	/* One line per spawned thread plus one for the child's main thread. */
	ASSERT_EQ(i, 1 + NUM_THREAD_SPAWN);

	fclose(file);
}
TEST_HARNESS_MAIN