mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-20 04:19:41 +00:00
fe9da61ffc
Commit 16d7fd3cfa72 ("zonefs: use iomap for synchronous direct writes") changes zonefs code from a self-built zone append BIO to using iomap for synchronous direct writes. This change relies on iomap submit BIO callback to change the write BIO built by iomap to a zone append BIO. However, this change overlooked the fact that a write BIO may be very large as it is split when issued. The change from a regular write to a zone append operation for the built BIO can result in a block layer warning as zone append BIO are not allowed to be split. WARNING: CPU: 18 PID: 202210 at block/bio.c:1644 bio_split+0x288/0x350 Call Trace: ? __warn+0xc9/0x2b0 ? bio_split+0x288/0x350 ? report_bug+0x2e6/0x390 ? handle_bug+0x41/0x80 ? exc_invalid_op+0x13/0x40 ? asm_exc_invalid_op+0x16/0x20 ? bio_split+0x288/0x350 bio_split_rw+0x4bc/0x810 ? __pfx_bio_split_rw+0x10/0x10 ? lockdep_unlock+0xf2/0x250 __bio_split_to_limits+0x1d8/0x900 blk_mq_submit_bio+0x1cf/0x18a0 ? __pfx_iov_iter_extract_pages+0x10/0x10 ? __pfx_blk_mq_submit_bio+0x10/0x10 ? find_held_lock+0x2d/0x110 ? lock_release+0x362/0x620 ? mark_held_locks+0x9e/0xe0 __submit_bio+0x1ea/0x290 ? __pfx___submit_bio+0x10/0x10 ? seqcount_lockdep_reader_access.constprop.0+0x82/0x90 submit_bio_noacct_nocheck+0x675/0xa20 ? __pfx_bio_iov_iter_get_pages+0x10/0x10 ? __pfx_submit_bio_noacct_nocheck+0x10/0x10 iomap_dio_bio_iter+0x624/0x1280 __iomap_dio_rw+0xa22/0x18a0 ? lock_is_held_type+0xe3/0x140 ? __pfx___iomap_dio_rw+0x10/0x10 ? lock_release+0x362/0x620 ? zonefs_file_write_iter+0x74c/0xc80 [zonefs] ? down_write+0x13d/0x1e0 iomap_dio_rw+0xe/0x40 zonefs_file_write_iter+0x5ea/0xc80 [zonefs] do_iter_readv_writev+0x18b/0x2c0 ? __pfx_do_iter_readv_writev+0x10/0x10 ? inode_security+0x54/0xf0 do_iter_write+0x13b/0x7c0 ? lock_is_held_type+0xe3/0x140 vfs_writev+0x185/0x550 ? __pfx_vfs_writev+0x10/0x10 ? __handle_mm_fault+0x9bd/0x1c90 ? find_held_lock+0x2d/0x110 ? lock_release+0x362/0x620 ? find_held_lock+0x2d/0x110 ? lock_release+0x362/0x620 ? 
__up_read+0x1ea/0x720 ? do_pwritev+0x136/0x1f0 do_pwritev+0x136/0x1f0 ? __pfx_do_pwritev+0x10/0x10 ? syscall_enter_from_user_mode+0x22/0x90 ? lockdep_hardirqs_on+0x7d/0x100 do_syscall_64+0x58/0x80 This error depends on the hardware used, specifically on the max zone append bytes and max_[hw_]sectors limits. Tests using AMD Epyc machines that have low limits did not reveal this issue while runs on Intel Xeon machines with larger limits trigger it. Manually splitting the zone append BIO using bio_split_rw() can solve this issue but also requires issuing the fragment BIOs synchronously with submit_bio_wait(), to avoid potential reordering of the zone append BIO fragments, which would lead to data corruption. That is, this solution is not better than using regular write BIOs which are subject to serialization using zone write locking at the IO scheduler level. Given this, fix the issue by removing zone append support and using regular write BIOs for synchronous direct writes. This allows preserving the use of iomap and having identical synchronous and asynchronous sequential file write paths. Zone append support will be reintroduced later through io_uring commands to ensure that the needed special handling is done correctly. Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> Fixes: 16d7fd3cfa72 ("zonefs: use iomap for synchronous direct writes") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal <dlemoal@kernel.org> Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
290 lines
7.3 KiB
C
290 lines
7.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Simple zone file system for zoned block devices.
|
|
*
|
|
* Copyright (C) 2019 Western Digital Corporation or its affiliates.
|
|
*/
|
|
#ifndef __ZONEFS_H__
|
|
#define __ZONEFS_H__
|
|
|
|
#include <linux/fs.h>
|
|
#include <linux/magic.h>
|
|
#include <linux/uuid.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/rwsem.h>
|
|
#include <linux/kobject.h>
|
|
|
|
/*
|
|
* Maximum length of file names: this only needs to be large enough to fit
|
|
* the zone group directory names and a decimal zone number for file names.
|
|
* 16 characters is plenty.
|
|
*/
|
|
#define ZONEFS_NAME_MAX 16
|
|
|
|
/*
|
|
* Zone types: ZONEFS_ZTYPE_SEQ is used for all sequential zone types
|
|
* defined in linux/blkzoned.h, that is, BLK_ZONE_TYPE_SEQWRITE_REQ and
|
|
* BLK_ZONE_TYPE_SEQWRITE_PREF.
|
|
*/
|
|
enum zonefs_ztype {
	ZONEFS_ZTYPE_CNV = 0,	/* conventional zones */
	ZONEFS_ZTYPE_SEQ = 1,	/* every sequential zone type */
	ZONEFS_ZTYPE_MAX = 2,	/* number of zone types (array sizing) */
};
|
|
|
|
static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
|
|
{
|
|
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
|
|
return ZONEFS_ZTYPE_CNV;
|
|
return ZONEFS_ZTYPE_SEQ;
|
|
}
|
|
|
|
/*
 * Zone state flag bits. ZONEFS_ZONE_CNV is the bit that
 * zonefs_zone_is_cnv() tests in the z_flags field of struct zonefs_zone;
 * the other bits are presumably kept in the same field (confirm against
 * the users in super.c and file.c).
 */
#define ZONEFS_ZONE_INIT_MODE (1U << 0)
|
|
#define ZONEFS_ZONE_OPEN (1U << 1)
|
|
#define ZONEFS_ZONE_ACTIVE (1U << 2)
|
|
#define ZONEFS_ZONE_OFFLINE (1U << 3)
|
|
#define ZONEFS_ZONE_READONLY (1U << 4)
|
|
#define ZONEFS_ZONE_CNV (1U << 31)
|
|
|
|
/*
|
|
* In-memory per-file inode zone data.
|
|
*/
|
|
struct zonefs_zone {
|
|
/* Zone state flags */
|
|
unsigned int z_flags;
|
|
|
|
/* Zone start sector (512B unit) */
|
|
sector_t z_sector;
|
|
|
|
/* Zone size (bytes) */
|
|
loff_t z_size;
|
|
|
|
/* Zone capacity (file maximum size, bytes) */
|
|
loff_t z_capacity;
|
|
|
|
/* Write pointer offset in the zone (sequential zones only, bytes) */
|
|
loff_t z_wpoffset;
|
|
|
|
/* Saved inode uid, gid and access rights */
|
|
umode_t z_mode;
|
|
kuid_t z_uid;
|
|
kgid_t z_gid;
|
|
};
|
|
|
|
/*
|
|
* In memory zone group information: all zones of a group are exposed
|
|
* as files, one file per zone.
|
|
*/
|
|
struct zonefs_zone_group {
	/* Inode attached to the group (presumably its directory - confirm) */
	struct inode		*g_inode;
	/* Zone count of the group (presumably the length of g_zones) */
	unsigned int		g_nr_zones;
	/* Per-zone information, one struct zonefs_zone per file */
	struct zonefs_zone	*g_zones;
};
|
|
|
|
/*
|
|
* In-memory inode data.
|
|
*/
|
|
struct zonefs_inode_info {
|
|
struct inode i_vnode;
|
|
|
|
/*
|
|
* To serialise fully against both syscall and mmap based IO and
|
|
* sequential file truncation, two locks are used. For serializing
|
|
* zonefs_seq_file_truncate() against zonefs_iomap_begin(), that is,
|
|
* file truncate operations against block mapping, i_truncate_mutex is
|
|
* used. i_truncate_mutex also protects against concurrent accesses
|
|
* and changes to the inode private data, and in particular changes to
|
|
* a sequential file size on completion of direct IO writes.
|
|
* Serialization of mmap read IOs with truncate and syscall IO
|
|
* operations is done with invalidate_lock in addition to
|
|
* i_truncate_mutex. Only zonefs_seq_file_truncate() takes both lock
|
|
* (invalidate_lock first, i_truncate_mutex second).
|
|
*/
|
|
struct mutex i_truncate_mutex;
|
|
|
|
/* guarded by i_truncate_mutex */
|
|
unsigned int i_wr_refcnt;
|
|
};
|
|
|
|
static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
|
|
{
|
|
return container_of(inode, struct zonefs_inode_info, i_vnode);
|
|
}
|
|
|
|
static inline bool zonefs_zone_is_cnv(struct zonefs_zone *z)
|
|
{
|
|
return z->z_flags & ZONEFS_ZONE_CNV;
|
|
}
|
|
|
|
static inline bool zonefs_zone_is_seq(struct zonefs_zone *z)
|
|
{
|
|
return !zonefs_zone_is_cnv(z);
|
|
}
|
|
|
|
static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode)
|
|
{
|
|
return inode->i_private;
|
|
}
|
|
|
|
/* Return true if the zone attached to @inode is a conventional zone. */
static inline bool zonefs_inode_is_cnv(struct inode *inode)
{
	struct zonefs_zone *z = zonefs_inode_zone(inode);

	return zonefs_zone_is_cnv(z);
}
|
|
|
|
/* Return true if the zone attached to @inode is not a conventional zone. */
static inline bool zonefs_inode_is_seq(struct inode *inode)
{
	struct zonefs_zone *z = zonefs_inode_zone(inode);

	return zonefs_zone_is_seq(z);
}
|
|
|
|
/*
|
|
* On-disk super block (block 0).
|
|
*/
|
|
#define ZONEFS_LABEL_LEN 64
|
|
#define ZONEFS_UUID_SIZE 16
|
|
#define ZONEFS_SUPER_SIZE 4096
|
|
|
|
struct zonefs_super {
|
|
|
|
/* Magic number */
|
|
__le32 s_magic;
|
|
|
|
/* Checksum */
|
|
__le32 s_crc;
|
|
|
|
/* Volume label */
|
|
char s_label[ZONEFS_LABEL_LEN];
|
|
|
|
/* 128-bit uuid */
|
|
__u8 s_uuid[ZONEFS_UUID_SIZE];
|
|
|
|
/* Features */
|
|
__le64 s_features;
|
|
|
|
/* UID/GID to use for files */
|
|
__le32 s_uid;
|
|
__le32 s_gid;
|
|
|
|
/* File permissions */
|
|
__le32 s_perm;
|
|
|
|
/* Padding to ZONEFS_SUPER_SIZE bytes */
|
|
__u8 s_reserved[3988];
|
|
|
|
} __packed;
|
|
|
|
/*
|
|
* Feature flags: specified in the s_features field of the on-disk super
|
|
* block struct zonefs_super and in-memory in the s_features field of
|
|
* struct zonefs_sb_info.
|
|
*/
|
|
enum zonefs_features {
	/* Aggregate contiguous conventional zones into a single file. */
	ZONEFS_F_AGGRCNV	= (1ULL << 0),

	/* Use super block specified UID for files instead of default 0. */
	ZONEFS_F_UID		= (1ULL << 1),

	/* Use super block specified GID for files instead of default 0. */
	ZONEFS_F_GID		= (1ULL << 2),

	/* Use super block specified file permissions instead of default 640. */
	ZONEFS_F_PERM		= (1ULL << 3),
};

/* Mask of all the feature flags defined above. */
#define ZONEFS_F_DEFINED_FEATURES			\
	(ZONEFS_F_PERM | ZONEFS_F_GID |			\
	 ZONEFS_F_UID | ZONEFS_F_AGGRCNV)
|
|
|
|
/*
|
|
* Mount options for zone write pointer error handling.
|
|
*/
|
|
#define ZONEFS_MNTOPT_ERRORS_RO (1 << 0) /* Remount the file system read-only */
|
|
#define ZONEFS_MNTOPT_ERRORS_ZRO (1 << 1) /* Make zone file readonly */
|
|
#define ZONEFS_MNTOPT_ERRORS_ZOL (1 << 2) /* Make zone file offline */
|
|
#define ZONEFS_MNTOPT_ERRORS_REPAIR (1 << 3) /* Repair the zone file (fix its size) */
|
|
#define ZONEFS_MNTOPT_ERRORS_MASK \
|
|
(ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \
|
|
ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR)
|
|
#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */
|
|
|
|
/*
|
|
* In-memory Super block information.
|
|
*/
|
|
struct zonefs_sb_info {
|
|
|
|
unsigned long s_mount_opts;
|
|
|
|
spinlock_t s_lock;
|
|
|
|
unsigned long long s_features;
|
|
kuid_t s_uid;
|
|
kgid_t s_gid;
|
|
umode_t s_perm;
|
|
uuid_t s_uuid;
|
|
unsigned int s_zone_sectors_shift;
|
|
|
|
struct zonefs_zone_group s_zgroup[ZONEFS_ZTYPE_MAX];
|
|
|
|
loff_t s_blocks;
|
|
loff_t s_used_blocks;
|
|
|
|
unsigned int s_max_wro_seq_files;
|
|
atomic_t s_wro_seq_files;
|
|
|
|
unsigned int s_max_active_seq_files;
|
|
atomic_t s_active_seq_files;
|
|
|
|
bool s_sysfs_registered;
|
|
struct kobject s_kobj;
|
|
struct completion s_kobj_unregister;
|
|
};
|
|
|
|
static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
|
|
{
|
|
return sb->s_fs_info;
|
|
}
|
|
|
|
/*
 * Message helpers: print @format with pr_info(), pr_err() or pr_warn(),
 * prefixed with "zonefs (<s_id>)" where <s_id> is the s_id field of @sb.
 *
 * @sb:     pointer expression to a super block (any object with an s_id
 *          string member); it is parenthesized in the expansion so that
 *          expressions such as "sbs + 1" are dereferenced as a whole.
 * @format: printf-style format, which must be a string literal as it is
 *          concatenated to the prefix at compile time.
 * @args:   optional arguments for @format.
 */
#define zonefs_info(sb, format, args...)	\
	pr_info("zonefs (%s): " format, (sb)->s_id, ## args)
#define zonefs_err(sb, format, args...)	\
	pr_err("zonefs (%s) ERROR: " format, (sb)->s_id, ## args)
#define zonefs_warn(sb, format, args...)	\
	pr_warn("zonefs (%s) WARNING: " format, (sb)->s_id, ## args)
|
|
|
|
/* In super.c */
|
|
void zonefs_inode_account_active(struct inode *inode);
|
|
int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op);
|
|
void zonefs_i_size_write(struct inode *inode, loff_t isize);
|
|
void zonefs_update_stats(struct inode *inode, loff_t new_isize);
|
|
void __zonefs_io_error(struct inode *inode, bool write);
|
|
|
|
/*
 * Locked wrapper around __zonefs_io_error(): call it for @inode with the
 * inode i_truncate_mutex held.
 */
static inline void zonefs_io_error(struct inode *inode, bool write)
{
	struct mutex *lock = &ZONEFS_I(inode)->i_truncate_mutex;

	mutex_lock(lock);
	__zonefs_io_error(inode, write);
	mutex_unlock(lock);
}
|
|
|
|
/* In super.c */
|
|
extern const struct inode_operations zonefs_dir_inode_operations;
|
|
extern const struct file_operations zonefs_dir_operations;
|
|
|
|
/* In file.c */
|
|
extern const struct address_space_operations zonefs_file_aops;
|
|
extern const struct file_operations zonefs_file_operations;
|
|
int zonefs_file_truncate(struct inode *inode, loff_t isize);
|
|
|
|
/* In sysfs.c */
|
|
int zonefs_sysfs_register(struct super_block *sb);
|
|
void zonefs_sysfs_unregister(struct super_block *sb);
|
|
int zonefs_sysfs_init(void);
|
|
void zonefs_sysfs_exit(void);
|
|
|
|
#endif
|