Merge branch 'for_next' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git

This commit is contained in:
Stephen Rothwell 2025-01-14 09:14:46 +11:00
commit 8eb728b56b
30 changed files with 675 additions and 114 deletions

View File

@ -1257,7 +1257,7 @@ out_free_interp:
}
reloc_func_desc = interp_load_addr;
allow_write_access(interpreter);
exe_file_allow_write_access(interpreter);
fput(interpreter);
kfree(interp_elf_ex);
@ -1354,7 +1354,7 @@ out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
out_free_file:
allow_write_access(interpreter);
exe_file_allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
out_free_ph:

View File

@ -394,7 +394,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
goto error;
}
allow_write_access(interpreter);
exe_file_allow_write_access(interpreter);
fput(interpreter);
interpreter = NULL;
}
@ -467,7 +467,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
error:
if (interpreter) {
allow_write_access(interpreter);
exe_file_allow_write_access(interpreter);
fput(interpreter);
}
kfree(interpreter_name);

View File

@ -2544,6 +2544,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
/*
* Don't allow defrag on pre-content watched files, as it could
* populate the page cache with 0's via readahead.
*/
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
ret = -EINVAL;
goto out;
}
if (argp) {
if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;

View File

@ -961,7 +961,7 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
sb->s_iflags |= SB_I_CGROUPWB;
sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
err = super_setup_bdi(sb);
if (err) {

View File

@ -912,7 +912,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
path_noexec(&file->f_path))
return ERR_PTR(-EACCES);
err = deny_write_access(file);
err = exe_file_deny_write_access(file);
if (err)
return ERR_PTR(err);
@ -927,7 +927,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
* Returns ERR_PTR on failure or allocated struct file on success.
*
* As this is a wrapper for the internal do_open_execat(), callers
* must call allow_write_access() before fput() on release. Also see
* must call exe_file_allow_write_access() before fput() on release. Also see
* do_close_execat().
*/
struct file *open_exec(const char *name)
@ -1471,7 +1471,7 @@ static void do_close_execat(struct file *file)
{
if (!file)
return;
allow_write_access(file);
exe_file_allow_write_access(file);
fput(file);
}
@ -1797,7 +1797,7 @@ static int exec_binprm(struct linux_binprm *bprm)
bprm->file = bprm->interpreter;
bprm->interpreter = NULL;
allow_write_access(exec);
exe_file_allow_write_access(exec);
if (unlikely(bprm->have_execfd)) {
if (bprm->executable) {
fput(exec);

View File

@ -756,6 +756,9 @@ retry:
return VM_FAULT_SIGBUS;
}
} else {
result = filemap_fsnotify_fault(vmf);
if (unlikely(result))
return result;
filemap_invalidate_lock_shared(mapping);
}
result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);

View File

@ -5312,6 +5312,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* i_version is always enabled now */
sb->s_flags |= SB_I_VERSION;
/* HSM events are allowed by default. */
sb->s_iflags |= SB_I_ALLOW_HSM;
err = ext4_check_feature_compatibility(sb, es, silent);
if (err)
goto failed_mount;

View File

@ -1158,10 +1158,10 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
HWEIGHT32(
(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
__FMODE_EXEC | __FMODE_NONOTIFY));
__FMODE_EXEC));
fasync_cache = kmem_cache_create("fasync_cache",
sizeof(struct fasync_struct), 0,

View File

@ -301,7 +301,6 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
*/
static int zisofs_read_folio(struct file *file, struct folio *folio)
{
struct page *page = &folio->page;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
int err;
@ -311,16 +310,15 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
PAGE_SHIFT <= zisofs_block_shift ?
(1 << (zisofs_block_shift - PAGE_SHIFT)) : 0;
struct page **pages;
pgoff_t index = page->index, end_index;
pgoff_t index = folio->index, end_index;
end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
/*
* If this page is wholly outside i_size we just return zero;
* If this folio is wholly outside i_size we just return zero;
* do_generic_file_read() will handle this for us
*/
if (index >= end_index) {
SetPageUptodate(page);
unlock_page(page);
folio_end_read(folio, true);
return 0;
}
@ -338,10 +336,10 @@ static int zisofs_read_folio(struct file *file, struct folio *folio)
pages = kcalloc(max_t(unsigned int, zisofs_pages_per_cblock, 1),
sizeof(*pages), GFP_KERNEL);
if (!pages) {
unlock_page(page);
folio_unlock(folio);
return -ENOMEM;
}
pages[full_page] = page;
pages[full_page] = &folio->page;
for (i = 0; i < pcount; i++, index++) {
if (i != full_page)

View File

@ -223,7 +223,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
struct fanotify_perm_event *event,
struct fsnotify_iter_info *iter_info)
{
int ret;
int ret, errno;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
@ -262,14 +262,23 @@ static int fanotify_get_response(struct fsnotify_group *group,
ret = 0;
break;
case FAN_DENY:
/* Check custom errno from pre-content events */
errno = fanotify_get_response_errno(event->response);
if (errno) {
ret = -errno;
break;
}
fallthrough;
default:
ret = -EPERM;
}
/* Check if the response should be audited */
if (event->response & FAN_AUDIT)
audit_fanotify(event->response & ~FAN_AUDIT,
&event->audit_rule);
if (event->response & FAN_AUDIT) {
u32 response = event->response &
(FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS);
audit_fanotify(response & ~FAN_AUDIT, &event->audit_rule);
}
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
group, event, ret);
@ -548,9 +557,13 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
return &pevent->fae;
}
static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
static struct fanotify_event *fanotify_alloc_perm_event(const void *data,
int data_type,
gfp_t gfp)
{
const struct path *path = fsnotify_data_path(data, data_type);
const struct file_range *range =
fsnotify_data_file_range(data, data_type);
struct fanotify_perm_event *pevent;
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
@ -564,6 +577,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
pevent->hdr.len = 0;
pevent->state = FAN_EVENT_INIT;
pevent->path = *path;
/* NULL ppos means no range info */
pevent->ppos = range ? &range->pos : NULL;
pevent->count = range ? range->count : 0;
path_get(path);
return &pevent->fae;
@ -801,7 +817,7 @@ static struct fanotify_event *fanotify_alloc_event(
old_memcg = set_active_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
event = fanotify_alloc_perm_event(path, gfp);
event = fanotify_alloc_perm_event(data, data_type, gfp);
} else if (fanotify_is_error_event(mask)) {
event = fanotify_alloc_error_event(group, fsid, data,
data_type, &hash);
@ -909,8 +925,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS);
BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21);
BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22);
mask = fanotify_group_event_mask(group, iter_info, &match_mask,
mask, data, data_type, dir);

View File

@ -425,6 +425,8 @@ FANOTIFY_PE(struct fanotify_event *event)
struct fanotify_perm_event {
struct fanotify_event fae;
struct path path;
const loff_t *ppos; /* optional file range info */
size_t count;
u32 response; /* userspace answer to the event */
unsigned short state; /* state of the event */
int fd; /* fd we passed to userspace for this event */
@ -446,6 +448,14 @@ static inline bool fanotify_is_perm_event(u32 mask)
mask & FANOTIFY_PERM_EVENTS;
}
/*
 * Tell whether @event carries file range info.
 *
 * Only events whose mask intersects FANOTIFY_PRE_CONTENT_EVENTS are looked
 * at; for those, the answer is whether the permission event has a non-NULL
 * ppos pointer.
 */
static inline bool fanotify_event_has_access_range(struct fanotify_event *event)
{
	if (event->mask & FANOTIFY_PRE_CONTENT_EVENTS)
		return !!FANOTIFY_PERM(event)->ppos;

	return false;
}
static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
{
return container_of(fse, struct fanotify_event, fse);
@ -518,3 +528,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
return mflags;
}
/*
 * Extract the errno field from a permission response value: shift @res right
 * by FAN_ERRNO_SHIFT and keep the bits selected by FAN_ERRNO_MASK.
 */
static inline u32 fanotify_get_response_errno(int res)
{
	int shifted = res >> FAN_ERRNO_SHIFT;

	return shifted & FAN_ERRNO_MASK;
}

View File

@ -100,8 +100,7 @@ static void __init fanotify_sysctls_init(void)
*
* Internal and external open flags are stored together in field f_flags of
* struct file. Only external open flags shall be allowed in event_f_flags.
* Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
* excluded.
* Internal flags like FMODE_EXEC shall be excluded.
*/
#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \
O_ACCMODE | O_APPEND | O_NONBLOCK | \
@ -118,10 +117,12 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
#define FANOTIFY_PIDFD_INFO_LEN \
sizeof(struct fanotify_event_info_pidfd)
#define FANOTIFY_ERROR_INFO_LEN \
(sizeof(struct fanotify_event_info_error))
#define FANOTIFY_RANGE_INFO_LEN \
(sizeof(struct fanotify_event_info_range))
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@ -159,9 +160,6 @@ static size_t fanotify_event_len(unsigned int info_mode,
int fh_len;
int dot_len = 0;
if (!info_mode)
return event_len;
if (fanotify_is_error_event(event->mask))
event_len += FANOTIFY_ERROR_INFO_LEN;
@ -176,14 +174,17 @@ static size_t fanotify_event_len(unsigned int info_mode,
dot_len = 1;
}
if (info_mode & FAN_REPORT_PIDFD)
event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
if (fanotify_event_has_object_fh(event)) {
fh_len = fanotify_event_object_fh_len(event);
event_len += fanotify_fid_info_len(fh_len, dot_len);
}
if (info_mode & FAN_REPORT_PIDFD)
event_len += FANOTIFY_PIDFD_INFO_LEN;
if (fanotify_event_has_access_range(event))
event_len += FANOTIFY_RANGE_INFO_LEN;
return event_len;
}
@ -258,11 +259,10 @@ static int create_fd(struct fsnotify_group *group, const struct path *path,
return client_fd;
/*
* we need a new file handle for the userspace program so it can read even if it was
* originally opened O_WRONLY.
* We provide an fd for the userspace program, so it could access the
* file without generating fanotify events itself.
*/
new_file = dentry_open(path,
group->fanotify_data.f_flags | __FMODE_NONOTIFY,
new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags,
current_cred());
if (IS_ERR(new_file)) {
put_unused_fd(client_fd);
@ -327,11 +327,12 @@ static int process_access_response(struct fsnotify_group *group,
struct fanotify_perm_event *event;
int fd = response_struct->fd;
u32 response = response_struct->response;
int errno = fanotify_get_response_errno(response);
int ret = info_len;
struct fanotify_response_info_audit_rule friar;
pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__,
group, fd, response, info, info_len);
pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n",
__func__, group, fd, response, errno, info, info_len);
/*
* make sure the response is valid, if invalid we do nothing and either
* userspace can send a valid response or we will clean it up after the
@ -342,7 +343,31 @@ static int process_access_response(struct fsnotify_group *group,
switch (response & FANOTIFY_RESPONSE_ACCESS) {
case FAN_ALLOW:
if (errno)
return -EINVAL;
break;
case FAN_DENY:
/* Custom errno is supported only for pre-content groups */
if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT)
return -EINVAL;
/*
* Limit errno to values expected on open(2)/read(2)/write(2)
* of regular files.
*/
switch (errno) {
case 0:
case EIO:
case EPERM:
case EBUSY:
case ETXTBSY:
case EAGAIN:
case ENOSPC:
case EDQUOT:
break;
default:
return -EINVAL;
}
break;
default:
return -EINVAL;
@ -506,7 +531,7 @@ static int copy_pidfd_info_to_user(int pidfd,
size_t count)
{
struct fanotify_event_info_pidfd info = { };
size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
size_t info_len = FANOTIFY_PIDFD_INFO_LEN;
if (WARN_ON_ONCE(info_len > count))
return -EFAULT;
@ -521,6 +546,30 @@ static int copy_pidfd_info_to_user(int pidfd,
return info_len;
}
/*
 * Copy a FAN_EVENT_INFO_TYPE_RANGE info record for @event to userspace.
 *
 * @event: event whose permission-event part supplies ppos and count
 * @buf:   userspace destination buffer
 * @count: bytes available at @buf
 *
 * Returns the number of bytes written (FANOTIFY_RANGE_INFO_LEN) on success,
 * -EFAULT if the record does not fit in @count or the copy to userspace
 * fails, or -EINVAL if the event has no range position.
 *
 * The return type is a signed int because negative errno values are
 * returned and the caller tests the result with "ret < 0".
 */
static int copy_range_info_to_user(struct fanotify_event *event,
				   char __user *buf, int count)
{
	struct fanotify_perm_event *pevent = FANOTIFY_PERM(event);
	struct fanotify_event_info_range info = { };
	size_t info_len = FANOTIFY_RANGE_INFO_LEN;

	/*
	 * Reject a negative count explicitly: comparing it against the
	 * unsigned info_len would otherwise convert it to a huge value.
	 */
	if (WARN_ON_ONCE(count < 0 || info_len > (size_t)count))
		return -EFAULT;

	if (WARN_ON_ONCE(!pevent->ppos))
		return -EINVAL;

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE;
	info.hdr.len = info_len;
	info.offset = *(pevent->ppos);
	info.count = pevent->count;

	if (copy_to_user(buf, &info, info_len))
		return -EFAULT;

	return info_len;
}
static int copy_info_records_to_user(struct fanotify_event *event,
struct fanotify_info *info,
unsigned int info_mode, int pidfd,
@ -642,6 +691,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
if (fanotify_event_has_access_range(event)) {
ret = copy_range_info_to_user(event, buf, count);
if (ret < 0)
return ret;
buf += ret;
count -= ret;
total_bytes += ret;
}
return total_bytes;
}
@ -756,12 +814,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
buf += FAN_EVENT_METADATA_LEN;
count -= FAN_EVENT_METADATA_LEN;
if (info_mode) {
ret = copy_info_records_to_user(event, info, info_mode, pidfd,
buf, count);
if (ret < 0)
goto out_close_fd;
}
if (f)
fd_install(fd, f);
@ -1294,7 +1350,7 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group)
}
static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
unsigned int fan_flags)
__u32 mask, unsigned int fan_flags)
{
/*
* Non evictable mark cannot be downgraded to evictable mark.
@ -1321,6 +1377,11 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
return -EEXIST;
/* For now pre-content events are not generated for directories */
mask |= fsn_mark->mask;
if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
return -EEXIST;
return 0;
}
@ -1347,7 +1408,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
/*
* Check if requested mark flags conflict with an existing mark flags.
*/
ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags);
if (ret)
goto out;
@ -1409,6 +1470,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
unsigned int class = flags & FANOTIFY_CLASS_BITS;
unsigned int internal_flags = 0;
struct file *file;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
@ -1477,7 +1539,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
(!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
return -EINVAL;
f_flags = O_RDWR | __FMODE_NONOTIFY;
f_flags = O_RDWR;
if (flags & FAN_CLOEXEC)
f_flags |= O_CLOEXEC;
if (flags & FAN_NONBLOCK)
@ -1555,10 +1617,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
goto out_destroy_group;
}
fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
fd = get_unused_fd_flags(f_flags);
if (fd < 0)
goto out_destroy_group;
file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
f_flags, FMODE_NONOTIFY);
if (IS_ERR(file)) {
put_unused_fd(fd);
fd = PTR_ERR(file);
goto out_destroy_group;
}
fd_install(fd, file);
return fd;
out_destroy_group:
@ -1638,11 +1708,23 @@ static int fanotify_events_supported(struct fsnotify_group *group,
unsigned int flags)
{
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
bool is_dir = d_is_dir(path->dentry);
/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
(mask & FAN_RENAME) ||
(flags & FAN_MARK_IGNORE);
/*
* Filesystems need to opt into pre-content events (a.k.a. HSM)
* and they are only supported on regular files and directories.
*/
if (mask & FANOTIFY_PRE_CONTENT_EVENTS) {
if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM))
return -EOPNOTSUPP;
if (!is_dir && !d_is_reg(path->dentry))
return -EINVAL;
}
/*
* Some filesystems such as 'proc' acquire unusual locks when opening
* files. For them fanotify permission events have high chances of
@ -1675,7 +1757,7 @@ static int fanotify_events_supported(struct fsnotify_group *group,
* but because we always allowed it, error only when using new APIs.
*/
if (strict_dir_events && mark_type == FAN_MARK_INODE &&
!d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
!is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
return -ENOTDIR;
return 0;
@ -1776,10 +1858,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EPERM;
/*
* Permission events require minimum priority FAN_CLASS_CONTENT.
* Permission events are not allowed for FAN_CLASS_NOTIF.
* Pre-content permission events are not allowed for FAN_CLASS_CONTENT.
*/
if (mask & FANOTIFY_PERM_EVENTS &&
group->priority < FSNOTIFY_PRIO_CONTENT)
group->priority == FSNOTIFY_PRIO_NORMAL)
return -EINVAL;
else if (mask & FANOTIFY_PRE_CONTENT_EVENTS &&
group->priority == FSNOTIFY_PRIO_CONTENT)
return -EINVAL;
if (mask & FAN_FS_ERROR &&
@ -1814,6 +1900,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
return -EINVAL;
/* Pre-content events are not currently generated for directories. */
if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
return -EINVAL;
if (mark_cmd == FAN_MARK_FLUSH) {
if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);

View File

@ -193,7 +193,7 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
return mask & marks_mask;
}
/* Are there any inode/mount/sb objects that are interested in this event? */
/* Are there any inode/mount/sb objects that watch for these events? */
static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
__u32 mask)
{
@ -203,6 +203,24 @@ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
return mask & marks_mask & ALL_FSNOTIFY_EVENTS;
}
/*
 * Report an FS_PRE_ACCESS event for @path.
 *
 * When @ppos is NULL the event is reported through fsnotify_path() without
 * range info. Otherwise a file_range is built whose start is *@ppos rounded
 * down to a page boundary and whose length reaches the page-aligned end of
 * (*@ppos + @count), and it is passed as FSNOTIFY_EVENT_FILE_RANGE data.
 */
int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
			 size_t count)
{
	struct file_range range;

	if (ppos) {
		/* Position is known: report a page aligned range */
		range.path = path;
		range.pos = PAGE_ALIGN_DOWN(*ppos);
		range.count = PAGE_ALIGN(*ppos + count) - range.pos;

		return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range,
				       FSNOTIFY_EVENT_FILE_RANGE);
	}

	return fsnotify_path(path, FS_PRE_ACCESS);
}
/*
* Notify this dentry's parent about a child's events with child name info
* if parent is watching or if inode/sb/mount are interested in events with
@ -623,11 +641,72 @@ out:
}
EXPORT_SYMBOL_GPL(fsnotify);
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
 * Decide, at open time, which FMODE_NONOTIFY_* bits to set on @file based on
 * fsnotify_sb_has_priority_watchers() and the marks watching this object.
 * Later, fsnotify permission hooks do not check whether permission event
 * watches exist now, only whether they existed when the file was opened.
 */
void file_set_fsnotify_mode(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct super_block *sb = dentry->d_sb;
	struct dentry *parent;
	__u32 mnt_mask, parent_mask;

	/* A file opened by fanotify keeps its mode untouched */
	if (FMODE_FSNOTIFY_NONE(file->f_mode))
		return;

	/*
	 * Permission events are a superset of pre-content events, so no
	 * permission event watchers implies no pre-content event watchers;
	 * the single FMODE_NONOTIFY_PERM bit expresses both.
	 */
	if (likely(!fsnotify_sb_has_priority_watchers(sb,
						FSNOTIFY_PRIO_CONTENT))) {
		file->f_mode |= FMODE_NONOTIFY_PERM;
		return;
	}

	/*
	 * Permission event watchers exist. Objects that are neither a
	 * directory nor a regular file, and superblocks without pre-content
	 * watchers, get pre-content events suppressed.
	 */
	if (!d_is_dir(dentry) && !d_is_reg(dentry))
		goto no_pre_content;
	if (likely(!fsnotify_sb_has_priority_watchers(sb,
						FSNOTIFY_PRIO_PRE_CONTENT)))
		goto no_pre_content;

	/*
	 * Some pre-content watchers exist. Leave the mode alone if anybody
	 * watches for pre-content events on *this* file.
	 */
	mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
	if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
					     FSNOTIFY_PRE_CONTENT_EVENTS)))
		return;

	/* Likewise if the parent watches for pre-content events on this file */
	if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
		parent = dget_parent(dentry);
		parent_mask = fsnotify_inode_watches_children(d_inode(parent));
		dput(parent);
		if (parent_mask & FSNOTIFY_PRE_CONTENT_EVENTS)
			return;
	}

no_pre_content:
	/* Nobody is watching for pre-content events from this file */
	file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
}
#endif
static __init int fsnotify_init(void)
{
int ret;
BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)

View File

@ -121,7 +121,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
event->sync_cookie = cookie;
event->name_len = len;
if (len)
strcpy(event->name, name->name);
strscpy(event->name, name->name, event->name_len + 1);
ret = fsnotify_add_event(group, fsn_event, inotify_merge);
if (ret) {

View File

@ -81,14 +81,18 @@ long vfs_truncate(const struct path *path, loff_t length)
if (!S_ISREG(inode->i_mode))
return -EINVAL;
error = mnt_want_write(path->mnt);
if (error)
goto out;
idmap = mnt_idmap(path->mnt);
error = inode_permission(idmap, inode, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
return error;
error = fsnotify_truncate_perm(path, length);
if (error)
return error;
error = mnt_want_write(path->mnt);
if (error)
return error;
error = -EPERM;
if (IS_APPEND(inode))
@ -114,7 +118,7 @@ put_write_and_out:
put_write_access(inode);
mnt_drop_write_and_out:
mnt_drop_write(path->mnt);
out:
return error;
}
EXPORT_SYMBOL_GPL(vfs_truncate);
@ -175,9 +179,16 @@ long do_ftruncate(struct file *file, loff_t length, int small)
/* Check IS_APPEND on real upper inode */
if (IS_APPEND(file_inode(file)))
return -EPERM;
sb_start_write(inode->i_sb);
error = security_file_truncate(file);
if (!error)
if (error)
return error;
error = fsnotify_truncate_perm(&file->f_path, length);
if (error)
return error;
sb_start_write(inode->i_sb);
error = do_truncate(file_mnt_idmap(file), dentry, length,
ATTR_MTIME | ATTR_CTIME, file);
sb_end_write(inode->i_sb);
@ -901,7 +912,7 @@ static int do_dentry_open(struct file *f,
f->f_sb_err = file_sample_sb_err(f);
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH | FMODE_OPENED;
f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY;
f->f_op = &empty_fops;
return 0;
}
@ -929,6 +940,12 @@ static int do_dentry_open(struct file *f,
if (error)
goto cleanup_all;
/*
* Set FMODE_NONOTIFY_* bits according to existing permission watches.
* If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't
* change anything.
*/
file_set_fsnotify_mode(f);
error = fsnotify_open_perm(f);
if (error)
goto cleanup_all;
@ -1105,6 +1122,23 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
struct file *dentry_open_nonotify(const struct path *path, int flags,
const struct cred *cred)
{
struct file *f = alloc_empty_file(flags, cred);
if (!IS_ERR(f)) {
int error;
f->f_mode |= FMODE_NONOTIFY;
error = vfs_open(path, f);
if (error) {
fput(f);
f = ERR_PTR(error);
}
}
return f;
}
/**
* dentry_create - Create and open a file
* @path: path to create
@ -1202,7 +1236,7 @@ inline struct open_how build_open_how(int flags, umode_t mode)
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
u64 flags = how->flags;
u64 strip = __FMODE_NONOTIFY | O_CLOEXEC;
u64 strip = O_CLOEXEC;
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
@ -1210,9 +1244,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
"struct open_flags doesn't yet handle flags > 32 bits");
/*
* Strip flags that either shouldn't be set by userspace like
* FMODE_NONOTIFY or that aren't relevant in determining struct
* open_flags like O_CLOEXEC.
* Strip flags that aren't relevant in determining struct open_flags.
*/
flags &= ~strip;

View File

@ -1451,6 +1451,9 @@ xfs_dax_read_fault(
trace_xfs_read_fault(ip, order);
ret = filemap_fsnotify_fault(vmf);
if (unlikely(ret))
return ret;
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
ret = xfs_dax_fault_locked(vmf, order, false);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
@ -1479,6 +1482,16 @@ xfs_write_fault(
vm_fault_t ret;
trace_xfs_write_fault(ip, order);
/*
* Usually we get here from ->page_mkwrite callback but in case of DAX
* we will get here also for ordinary write fault. Handle HSM
* notifications for that case.
*/
if (IS_DAX(inode)) {
ret = filemap_fsnotify_fault(vmf);
if (unlikely(ret))
return ret;
}
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);

View File

@ -1730,7 +1730,7 @@ xfs_fs_fill_super(
sb->s_time_max = XFS_LEGACY_TIME_MAX;
}
trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
sb->s_iflags |= SB_I_CGROUPWB;
sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
set_posix_acl_flag(sb);

View File

@ -89,6 +89,16 @@
#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \
FAN_RENAME)
/* Content events can be used to inspect file content */
#define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \
FAN_ACCESS_PERM)
/* Pre-content events can be used to fill file content */
#define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS)
/* Events that require a permission response from user */
#define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \
FANOTIFY_PRE_CONTENT_EVENTS)
/* Events that can be reported with event->fd */
#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS)
@ -104,10 +114,6 @@
FANOTIFY_INODE_EVENTS | \
FANOTIFY_ERROR_EVENTS)
/* Events that require a permission response from user */
#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \
FAN_OPEN_EXEC_PERM)
/* Extra flags that may be reported with event or control handling of events */
#define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR)
@ -126,7 +132,9 @@
/* These masks check for invalid bits in permission responses. */
#define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY)
#define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO)
#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS)
#define FANOTIFY_RESPONSE_VALID_MASK \
(FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \
(FAN_ERRNO_MASK << FAN_ERRNO_SHIFT))
/* Do not use these old uapi constants internally */
#undef FAN_ALL_CLASS_BITS

View File

@ -173,13 +173,20 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define FMODE_NOREUSE ((__force fmode_t)(1 << 23))
/* FMODE_* bit 24 */
/* File is embedded in backing_file object */
#define FMODE_BACKING ((__force fmode_t)(1 << 25))
#define FMODE_BACKING ((__force fmode_t)(1 << 24))
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26))
/*
* Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be
* generated (see below)
*/
#define FMODE_NONOTIFY ((__force fmode_t)(1 << 25))
/*
* Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be
* generated (see below)
*/
#define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26))
/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT ((__force fmode_t)(1 << 27))
@ -190,6 +197,32 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29))
/*
* The two FMODE_NONOTIFY* define which fsnotify events should not be generated
* for a file. These are the possible values of (f->f_mode &
* FMODE_FSNOTIFY_MASK) and their meaning:
*
* FMODE_NONOTIFY - suppress all (incl. non-permission) events.
* FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events.
* FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events.
*/
/* Both fsnotify suppression bits of f_mode */
#define FMODE_FSNOTIFY_MASK \
	(FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)

/*
 * True when only FMODE_NONOTIFY is set within FMODE_FSNOTIFY_MASK.
 * The argument is parenthesized so that expressions such as (a | b) are
 * masked as a whole.
 */
#define FMODE_FSNOTIFY_NONE(mode) \
	(((mode) & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/* True when neither bit is set, or when both bits are set */
#define FMODE_FSNOTIFY_PERM(mode) \
	(((mode) & FMODE_FSNOTIFY_MASK) == 0 || \
	 ((mode) & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM))
/* True when neither bit is set */
#define FMODE_FSNOTIFY_HSM(mode) \
	(((mode) & FMODE_FSNOTIFY_MASK) == 0)
#else
/* Without CONFIG_FANOTIFY_ACCESS_PERMISSIONS both checks are constant 0 */
#define FMODE_FSNOTIFY_PERM(mode)	0
#define FMODE_FSNOTIFY_HSM(mode)	0
#endif
/*
* Attribute flags. These should be or-ed together to figure out what
* has been changed!
@ -1232,6 +1265,7 @@ extern int send_sigurg(struct file *file);
#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */
/* Possible states of 'frozen' field */
enum {
@ -2751,6 +2785,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
}
struct file *dentry_open(const struct path *path, int flags,
const struct cred *creds);
struct file *dentry_open_nonotify(const struct path *path, int flags,
const struct cred *cred);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
const struct cred *cred);
struct path *backing_file_user_path(struct file *f);
@ -3059,6 +3095,28 @@ static inline void allow_write_access(struct file *file)
if (file)
atomic_inc(&file_inode(file)->i_writecount);
}
/*
 * Writes to an executable file are not blocked while it is watched by
 * pre-content events.
 *
 * FMODE_FSNOTIFY_HSM mode is decided from the pre-content watches present
 * when the file is opened and stays constant for the file's whole lifetime.
 * Adding pre-content watches after execution starts, or removing them before
 * it ends, therefore cannot leak an i_writecount reference.
 */
static inline int exe_file_deny_write_access(struct file *exe_file)
{
	return unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)) ?
		0 : deny_write_access(exe_file);
}
/*
 * Counterpart of exe_file_deny_write_access(): does nothing for a NULL file
 * or a file in FMODE_FSNOTIFY_HSM mode, otherwise calls
 * allow_write_access().
 */
static inline void exe_file_allow_write_access(struct file *exe_file)
{
	if (unlikely(!exe_file))
		return;
	if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
		return;

	allow_write_access(exe_file);
}
static inline bool inode_is_open_for_write(const struct inode *inode)
{
return atomic_read(&inode->i_writecount) > 0;
@ -3707,11 +3765,9 @@ struct ctl_table;
int __init list_bdev_fs_names(char *buf, size_t size);
#define __FMODE_EXEC ((__force int) FMODE_EXEC)
#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY)
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
(flag & __FMODE_NONOTIFY)))
#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE))
static inline bool is_sxid(umode_t mode)
{

View File

@ -108,38 +108,35 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
}
/*
 * fsnotify_path - report @mask for the object that @path refers to
 *
 * Passes @path to fsnotify_parent() as FSNOTIFY_EVENT_PATH data and returns
 * whatever fsnotify_parent() returns.
 */
static inline int fsnotify_path(const struct path *path, __u32 mask)
{
	struct dentry *dentry = path->dentry;

	return fsnotify_parent(dentry, mask, path, FSNOTIFY_EVENT_PATH);
}
static inline int fsnotify_file(struct file *file, __u32 mask)
{
const struct path *path;
/*
* FMODE_NONOTIFY are fds generated by fanotify itself which should not
* generate new events. We also don't want to generate events for
* FMODE_PATH fds (involves open & close events) as they are just
* handle creation / destruction events and not "real" file events.
*/
if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH))
if (FMODE_FSNOTIFY_NONE(file->f_mode))
return 0;
path = &file->f_path;
/* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */
if (mask & ALL_FSNOTIFY_PERM_EVENTS &&
!fsnotify_sb_has_priority_watchers(path->dentry->d_sb,
FSNOTIFY_PRIO_CONTENT))
return 0;
return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
return fsnotify_path(&file->f_path, mask);
}
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
void file_set_fsnotify_mode(struct file *file);
/*
* fsnotify_file_area_perm - permission hook before access to file range
*/
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
const loff_t *ppos, size_t count)
{
__u32 fsnotify_mask = FS_ACCESS_PERM;
/*
* filesystem may be modified in the context of permission events
* (e.g. by HSM filling a file on access), so sb freeze protection
@ -147,14 +144,49 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
*/
lockdep_assert_once(file_write_not_started(file));
if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS)))
return 0;
if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
return 0;
/*
* read()/write() and other types of access generate pre-content events.
*/
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
int ret = fsnotify_pre_content(&file->f_path, ppos, count);
if (ret)
return ret;
}
if (!(perm_mask & MAY_READ))
return 0;
return fsnotify_file(file, fsnotify_mask);
/*
* read() also generates the legacy FS_ACCESS_PERM event, so content
* scanners can inspect the content filled by pre-content event.
*/
return fsnotify_path(&file->f_path, FS_ACCESS_PERM);
}
/*
* fsnotify_file_perm - permission hook before file access
* fsnotify_truncate_perm - permission hook before file truncate
*/
static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
{
	struct super_block *sb = d_inode(path->dentry)->i_sb;

	/* Superblock has not opted in to HSM events: nothing to report. */
	if (!(sb->s_iflags & SB_I_ALLOW_HSM))
		return 0;

	/* No watcher at pre-content priority on this superblock. */
	if (!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))
		return 0;

	/* Report the new length as the position, with a zero byte count. */
	return fsnotify_pre_content(path, &length, 0);
}
/*
* fsnotify_file_perm - permission hook before file access (unknown range)
*/
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
@ -168,22 +200,34 @@ static inline int fsnotify_open_perm(struct file *file)
{
int ret;
if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
return 0;
if (file->f_flags & __FMODE_EXEC) {
ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);
ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
if (ret)
return ret;
}
return fsnotify_file(file, FS_OPEN_PERM);
return fsnotify_path(&file->f_path, FS_OPEN_PERM);
}
#else
/*
 * No-op variant of file_set_fsnotify_mode() from the #else branch
 * (presumably built when CONFIG_FANOTIFY_ACCESS_PERMISSIONS is disabled):
 * @file is left untouched.
 */
static inline void file_set_fsnotify_mode(struct file *file)
{
}
/*
 * Stub variant of fsnotify_file_area_perm() from the #else branch
 * (presumably built when CONFIG_FANOTIFY_ACCESS_PERMISSIONS is disabled):
 * no event is generated and access is always allowed (returns 0).
 */
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
const loff_t *ppos, size_t count)
{
return 0;
}
/*
 * Stub variant of fsnotify_truncate_perm() from the #else branch
 * (presumably built when CONFIG_FANOTIFY_ACCESS_PERMISSIONS is disabled):
 * truncate is always allowed (returns 0).
 */
static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
{
return 0;
}
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
return 0;

View File

@ -55,6 +55,9 @@
#define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */
#define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */
#define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */
/* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */
#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */
/*
* Set on inode mark that cares about things that happen to its children.
@ -77,8 +80,14 @@
*/
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)
#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
FS_OPEN_EXEC_PERM)
/* Content events can be used to inspect file content */
#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \
FS_ACCESS_PERM)
/* Pre-content events can be used to fill file content */
#define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS)
#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \
FSNOTIFY_PRE_CONTENT_EVENTS)
/*
* This is a list of all events that may get sent to a parent that is watching
@ -285,6 +294,7 @@ static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
/* When calling fsnotify tell it if the data is a path or inode */
enum fsnotify_data_type {
FSNOTIFY_EVENT_NONE,
FSNOTIFY_EVENT_FILE_RANGE,
FSNOTIFY_EVENT_PATH,
FSNOTIFY_EVENT_INODE,
FSNOTIFY_EVENT_DENTRY,
@ -297,6 +307,17 @@ struct fs_error_report {
struct super_block *sb;
};
/*
 * Event payload for FSNOTIFY_EVENT_FILE_RANGE: a path plus a range within
 * the file it refers to.
 */
struct file_range {
/* Path of the accessed object; exposed via file_range_path(). */
const struct path *path;
/* Start of the range (presumably a byte offset -- confirm with producer). */
loff_t pos;
/* Length of the range (presumably in bytes -- confirm with producer). */
size_t count;
};
/* Return the path stored in @range; @range must not be NULL. */
static inline const struct path *file_range_path(const struct file_range *range)
{
return range->path;
}
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
switch (data_type) {
@ -306,6 +327,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
return d_inode(data);
case FSNOTIFY_EVENT_PATH:
return d_inode(((const struct path *)data)->dentry);
case FSNOTIFY_EVENT_FILE_RANGE:
return d_inode(file_range_path(data)->dentry);
case FSNOTIFY_EVENT_ERROR:
return ((struct fs_error_report *)data)->inode;
default:
@ -321,6 +344,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ
return (struct dentry *)data;
case FSNOTIFY_EVENT_PATH:
return ((const struct path *)data)->dentry;
case FSNOTIFY_EVENT_FILE_RANGE:
return file_range_path(data)->dentry;
default:
return NULL;
}
@ -332,6 +357,8 @@ static inline const struct path *fsnotify_data_path(const void *data,
switch (data_type) {
case FSNOTIFY_EVENT_PATH:
return data;
case FSNOTIFY_EVENT_FILE_RANGE:
return file_range_path(data);
default:
return NULL;
}
@ -347,6 +374,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data,
return ((struct dentry *)data)->d_sb;
case FSNOTIFY_EVENT_PATH:
return ((const struct path *)data)->dentry->d_sb;
case FSNOTIFY_EVENT_FILE_RANGE:
return file_range_path(data)->dentry->d_sb;
case FSNOTIFY_EVENT_ERROR:
return ((struct fs_error_report *) data)->sb;
default:
@ -366,6 +395,18 @@ static inline struct fs_error_report *fsnotify_data_error_report(
}
}
/*
 * Interpret @data as a struct file_range when @data_type says it is one.
 *
 * Returns @data for FSNOTIFY_EVENT_FILE_RANGE and NULL for every other
 * data type.
 */
static inline const struct file_range *fsnotify_data_file_range(
							const void *data,
							int data_type)
{
	if (data_type != FSNOTIFY_EVENT_FILE_RANGE)
		return NULL;

	return data;
}
/*
* Index to merged marks iterator array that correlates to a type of watch.
* The type of watched object can be deduced from the iterator type, but not
@ -854,9 +895,17 @@ static inline void fsnotify_init_event(struct fsnotify_event *event)
{
INIT_LIST_HEAD(&event->list);
}
int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
size_t count);
#else
/*
 * Stub variant of fsnotify_pre_content() from the #else branch of a
 * conditional whose #if is not visible here (presumably !CONFIG_FSNOTIFY --
 * confirm): no pre-content event is generated and 0 is returned.
 */
static inline int fsnotify_pre_content(const struct path *path,
const loff_t *ppos, size_t count)
{
return 0;
}
static inline int fsnotify(__u32 mask, const void *data, int data_type,
struct inode *dir, const struct qstr *name,
struct inode *inode, u32 cookie)

View File

@ -3431,6 +3431,7 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf);
extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */

View File

@ -6,7 +6,6 @@
/*
* FMODE_EXEC is 0x20
* FMODE_NONOTIFY is 0x4000000
* These cannot be used by userspace O_* until internal and external open
* flags are split.
* -Eric Paris

View File

@ -25,6 +25,9 @@
#define FAN_OPEN_PERM 0x00010000 /* File open in perm check */
#define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */
#define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */
/* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */
#define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */
#define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */
@ -143,6 +146,7 @@ struct fanotify_event_metadata {
#define FAN_EVENT_INFO_TYPE_DFID 3
#define FAN_EVENT_INFO_TYPE_PIDFD 4
#define FAN_EVENT_INFO_TYPE_ERROR 5
#define FAN_EVENT_INFO_TYPE_RANGE 6
/* Special info types for FAN_RENAME */
#define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10
@ -189,6 +193,13 @@ struct fanotify_event_info_error {
__u32 error_count;
};
/*
 * Info record describing a file range; presumably the record announced by
 * FAN_EVENT_INFO_TYPE_RANGE and attached to FAN_PRE_ACCESS events -- confirm
 * against the fanotify implementation.
 */
struct fanotify_event_info_range {
/* Common info record header. */
struct fanotify_event_info_header hdr;
/* Explicit padding (presumably keeps the 64-bit fields aligned). */
__u32 pad;
/* Start of the range (presumably a byte offset into the file). */
__u64 offset;
/* Length of the range (presumably in bytes). */
__u64 count;
};
/*
* User space may need to record additional information about its decision.
* The extra information type records what kind of information is included.
@ -224,6 +235,13 @@ struct fanotify_response_info_audit_rule {
/* Legit userspace responses to a _PERM event */
#define FAN_ALLOW 0x01
#define FAN_DENY 0x02
/* errno other than EPERM can be specified in upper byte of deny response */
#define FAN_ERRNO_BITS 8
#define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS)
#define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1)
#define FAN_DENY_ERRNO(err) \
(FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT))
#define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */
#define FAN_INFO 0x20 /* Bitmask to indicate additional information */

View File

@ -625,8 +625,8 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
* We depend on the oldmm having properly denied write access to the
* exe_file already.
*/
if (exe_file && deny_write_access(exe_file))
pr_warn_once("deny_write_access() failed in %s\n", __func__);
if (exe_file && exe_file_deny_write_access(exe_file))
pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
}
#ifdef CONFIG_MMU
@ -1419,13 +1419,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
* We expect the caller (i.e., sys_execve) to already denied
* write access, so this is unlikely to fail.
*/
if (unlikely(deny_write_access(new_exe_file)))
if (unlikely(exe_file_deny_write_access(new_exe_file)))
return -EACCES;
get_file(new_exe_file);
}
rcu_assign_pointer(mm->exe_file, new_exe_file);
if (old_exe_file) {
allow_write_access(old_exe_file);
exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
@ -1466,7 +1466,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
ret = deny_write_access(new_exe_file);
ret = exe_file_deny_write_access(new_exe_file);
if (ret)
return -EACCES;
get_file(new_exe_file);
@ -1478,7 +1478,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
mmap_write_unlock(mm);
if (old_exe_file) {
allow_write_access(old_exe_file);
exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;

View File

@ -47,6 +47,7 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@ -3141,6 +3142,14 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
/*
* If we have pre-content watches we need to disable readahead to make
* sure that we don't populate our mapping with 0 filled pages that we
* never emitted an event for.
*/
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
return fpin;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@ -3209,6 +3218,10 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
unsigned int mmap_miss;
/* See comment in do_sync_mmap_readahead. */
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
return fpin;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
@ -3267,6 +3280,48 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
return ret;
}
/**
* filemap_fsnotify_fault - maybe emit a pre-content event.
* @vmf: struct vm_fault containing details of the fault.
*
* If we have a pre-content watch on this file we will emit an event for this
* range. If we return anything the fault caller should return immediately, we
* will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
* fault again and then the fault handler will run the second time through.
*
* Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
*/
vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
{
struct file *fpin = NULL;
int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
loff_t pos = vmf->pgoff >> PAGE_SHIFT;
size_t count = PAGE_SIZE;
int err;
/*
* We already did this and now we're retrying with everything locked,
* don't emit the event and continue.
*/
if (vmf->flags & FAULT_FLAG_TRIED)
return 0;
/* No watches, we're done. */
if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
return 0;
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
if (!fpin)
return VM_FAULT_SIGBUS;
err = fsnotify_file_area_perm(fpin, mask, &pos, count);
fput(fpin);
if (err)
return VM_FAULT_SIGBUS;
return VM_FAULT_RETRY;
}
EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
@ -3370,6 +3425,37 @@ retry_find:
* or because readahead was otherwise unable to retrieve it.
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
* If this is a precontent file we can now emit an event to
* try and populate the folio.
*/
if (!(vmf->flags & FAULT_FLAG_TRIED) &&
unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
loff_t pos = folio_pos(folio);
size_t count = folio_size(folio);
/* We're NOWAIT, we have to retry. */
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
folio_unlock(folio);
goto out_retry;
}
if (mapping_locked)
filemap_invalidate_unlock_shared(mapping);
mapping_locked = false;
folio_unlock(folio);
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
if (!fpin)
goto out_retry;
error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
count);
if (error)
ret = VM_FAULT_SIGBUS;
goto out_retry;
}
/*
* If the invalidate lock is not held, the folio was in cache
* and uptodate and now it is not. Strange but possible since we

View File

@ -76,6 +76,7 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
#include <linux/fsnotify.h>
#include <trace/events/kmem.h>
@ -5662,8 +5663,17 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
/*
* Currently we just emit PAGE_SIZE for our fault events, so don't allow
* a huge fault if we have a pre content watch on this file. This would
* be trivial to support, but there would need to be tests to ensure
* this works properly and those don't exist currently.
*/
if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
@ -5687,6 +5697,9 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/* See comment in create_huge_pmd. */
if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@ -5709,6 +5722,9 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
/* See comment in create_huge_pmd. */
if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@ -5726,6 +5742,9 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vma))
goto split;
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/* See comment in create_huge_pmd. */
if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))

View File

@ -1613,6 +1613,13 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
/*
 * Placeholder definition of filemap_fsnotify_fault() that must never be
 * called: it hits BUG() unconditionally. The return statement only
 * satisfies the function's signature.
 */
vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
{
BUG();
return 0;
}
EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();

View File

@ -128,6 +128,7 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include "internal.h"
@ -548,6 +549,15 @@ void page_cache_sync_ra(struct readahead_control *ractl,
unsigned long max_pages, contig_count;
pgoff_t prev_index, miss;
/*
* If we have pre-content watches we need to disable readahead to make
* sure that we don't find 0 filled pages in cache that we never emitted
* events for. Filesystems supporting HSM must make sure to not call
* this function with ractl->file unset for files handled by HSM.
*/
if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
return;
/*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
@ -626,6 +636,10 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (!ra->ra_pages)
return;
/* See the comment in page_cache_sync_ra. */
if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
return;
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/

View File

@ -3404,7 +3404,8 @@ static int selinux_path_notify(const struct path *path, u64 mask,
perm |= FILE__WATCH_WITH_PERM;
/* watches on read-like events need the file:watch_reads permission */
if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE))
if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS |
FS_CLOSE_NOWRITE))
perm |= FILE__WATCH_READS;
return path_has_perm(current_cred(), path, perm);