2019-07-15 15:50:59 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
|
|
|
|
* Copyright (C) 2010 Red Hat, Inc.
|
2023-12-07 07:27:09 +00:00
|
|
|
* Copyright (C) 2016-2023 Christoph Hellwig.
|
2019-07-15 15:50:59 +00:00
|
|
|
*/
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/iomap.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/uio.h>
|
|
|
|
#include <linux/buffer_head.h>
|
|
|
|
#include <linux/dax.h>
|
|
|
|
#include <linux/writeback.h>
|
2019-10-17 20:12:15 +00:00
|
|
|
#include <linux/list_sort.h>
|
2019-07-15 15:50:59 +00:00
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/bio.h>
|
|
|
|
#include <linux/sched/signal.h>
|
|
|
|
#include <linux/migrate.h>
|
2019-10-17 20:12:13 +00:00
|
|
|
#include "trace.h"
|
2019-07-15 15:50:59 +00:00
|
|
|
|
|
|
|
#include "../internal.h"
|
|
|
|
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic, as the bio
chains per ioend and the size of the merged ioends processed as a
single completion are both unbounded.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
will result in bio and ioend chaining to optimise the IO
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the fact
that we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
/*
 * NOTE(review): not referenced in the visible part of this file.  Presumably
 * the batch limit used when bounding ioend chain lengths in writeback --
 * confirm the unit and meaning at the use sites.
 */
#define IOEND_BATCH_SIZE 4096
|
|
|
|
|
2019-10-17 20:12:19 +00:00
|
|
|
/*
|
2023-07-10 21:12:43 +00:00
|
|
|
* Structure allocated for each folio to track per-block uptodate, dirty state
|
2023-07-10 21:11:19 +00:00
|
|
|
* and I/O completions.
|
2019-10-17 20:12:19 +00:00
|
|
|
*/
|
2023-07-10 21:11:19 +00:00
|
|
|
struct iomap_folio_state {
|
|
|
|
spinlock_t state_lock;
|
2023-10-04 16:53:02 +00:00
|
|
|
unsigned int read_bytes_pending;
|
|
|
|
atomic_t write_bytes_pending;
|
2023-07-10 21:12:43 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Each block has two bits in this bitmap:
|
|
|
|
* Bits [0..blocks_per_folio) has the uptodate status.
|
|
|
|
* Bits [b_p_f...(2*b_p_f)) has the dirty status.
|
|
|
|
*/
|
2023-07-10 21:11:19 +00:00
|
|
|
unsigned long state[];
|
2019-10-17 20:12:19 +00:00
|
|
|
};
|
|
|
|
|
2019-10-17 20:12:15 +00:00
|
|
|
/*
 * NOTE(review): bio_set for ioends, going by the name; neither its
 * initialisation nor its users are visible in this part of the file.
 */
static struct bio_set iomap_ioend_bioset;
|
|
|
|
|
2023-07-10 21:12:22 +00:00
|
|
|
static inline bool ifs_is_fully_uptodate(struct folio *folio,
|
|
|
|
struct iomap_folio_state *ifs)
|
|
|
|
{
|
|
|
|
struct inode *inode = folio->mapping->host;
|
|
|
|
|
|
|
|
return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
|
|
|
|
unsigned int block)
|
|
|
|
{
|
|
|
|
return test_bit(block, ifs->state);
|
|
|
|
}
|
|
|
|
|
2023-10-04 16:53:01 +00:00
|
|
|
/*
 * Set the uptodate bit of every block touched by the byte range
 * [@off, @off + @len) of @folio.
 *
 * Takes no lock itself; the callers in this file hold ifs->state_lock.
 * Returns true if all blocks of the folio are uptodate afterwards.
 */
static bool ifs_set_range_uptodate(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	const unsigned int blkbits = folio->mapping->host->i_blkbits;
	unsigned int start = off >> blkbits;
	unsigned int end = (off + len - 1) >> blkbits;

	bitmap_set(ifs->state, start, end - start + 1);

	return ifs_is_fully_uptodate(folio, ifs);
}
|
|
|
|
|
|
|
|
/*
 * Record that the byte range [@off, @off + @len) of @folio now holds valid
 * data.
 *
 * Without per-block state the whole folio is marked uptodate right away.
 * With it, the range is recorded under state_lock and the folio is marked
 * uptodate only once every block is.
 */
static void iomap_set_range_uptodate(struct folio *folio, size_t off,
		size_t len)
{
	struct iomap_folio_state *ifs = folio->private;
	unsigned long irqflags;
	bool all_uptodate;

	if (!ifs) {
		folio_mark_uptodate(folio);
		return;
	}

	spin_lock_irqsave(&ifs->state_lock, irqflags);
	all_uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
	spin_unlock_irqrestore(&ifs->state_lock, irqflags);

	if (all_uptodate)
		folio_mark_uptodate(folio);
}
|
|
|
|
|
2023-07-10 21:12:43 +00:00
|
|
|
static inline bool ifs_block_is_dirty(struct folio *folio,
|
|
|
|
struct iomap_folio_state *ifs, int block)
|
|
|
|
{
|
|
|
|
struct inode *inode = folio->mapping->host;
|
|
|
|
unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
|
|
|
|
|
|
|
|
return test_bit(block + blks_per_folio, ifs->state);
|
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:09 +00:00
|
|
|
/*
 * Find the first run of dirty blocks in @folio at or after *@range_start and
 * before @range_end (both file positions).
 *
 * On success *@range_start is moved to the file position of the first dirty
 * block and the length of the run in bytes is returned.  Returns 0 if the
 * search reaches the end block without finding a dirty block.
 */
static unsigned ifs_find_dirty_range(struct folio *folio,
		struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
{
	struct inode *inode = folio->mapping->host;
	const unsigned int blkbits = inode->i_blkbits;
	unsigned first =
		offset_in_folio(folio, *range_start) >> blkbits;
	/*
	 * A @range_end that maps to block 0 is replaced by the number of
	 * blocks in the folio (min_not_zero skips the zero operand).
	 */
	unsigned limit = min_not_zero(
		offset_in_folio(folio, range_end) >> blkbits,
		i_blocks_per_folio(inode, folio));
	unsigned next;

	/* Skip the clean blocks in front of the run. */
	while (!ifs_block_is_dirty(folio, ifs, first)) {
		first++;
		if (first == limit)
			return 0;
	}

	/* Extend the run for as long as the following blocks are dirty. */
	for (next = first + 1; next < limit; next++) {
		if (!ifs_block_is_dirty(folio, ifs, next))
			break;
	}

	*range_start = folio_pos(folio) + (first << blkbits);
	return (next - first) << blkbits;
}
|
|
|
|
|
|
|
|
/*
 * Locate the next dirty byte range of @folio inside [*@range_start,
 * @range_end).
 *
 * An empty range yields 0.  Without per-block state the whole remaining range
 * is reported; otherwise the dirty bitmap is searched and *@range_start may
 * be advanced.  Returns the length of the range in bytes.
 */
static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
		u64 range_end)
{
	struct iomap_folio_state *ifs = folio->private;

	if (*range_start >= range_end)
		return 0;

	if (!ifs)
		return range_end - *range_start;

	return ifs_find_dirty_range(folio, ifs, range_start, range_end);
}
|
|
|
|
|
2023-07-10 21:12:43 +00:00
|
|
|
/*
 * Clear the dirty bit of every block touched by the byte range
 * [@off, @off + @len) of @folio, under state_lock.  The dirty bits sit behind
 * the uptodate bits, hence the blocks-per-folio bias on the first bit.
 */
static void ifs_clear_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	const unsigned int blkbits = inode->i_blkbits;
	unsigned int dirty_base = i_blocks_per_folio(inode, folio);
	unsigned int start = off >> blkbits;
	unsigned int end = (off + len - 1) >> blkbits;
	unsigned long irqflags;

	spin_lock_irqsave(&ifs->state_lock, irqflags);
	bitmap_clear(ifs->state, start + dirty_base, end - start + 1);
	spin_unlock_irqrestore(&ifs->state_lock, irqflags);
}
|
|
|
|
|
|
|
|
/*
 * Clear the per-block dirty bits for [@off, @off + @len) of @folio.  Does
 * nothing when the folio carries no per-block state.
 */
static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (!ifs)
		return;

	ifs_clear_range_dirty(folio, ifs, off, len);
}
|
|
|
|
|
|
|
|
/*
 * Set the dirty bit of every block touched by the byte range
 * [@off, @off + @len) of @folio, under state_lock.  The dirty bits sit behind
 * the uptodate bits, hence the blocks-per-folio bias on the first bit.
 */
static void ifs_set_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	const unsigned int blkbits = inode->i_blkbits;
	unsigned int dirty_base = i_blocks_per_folio(inode, folio);
	unsigned int start = off >> blkbits;
	unsigned int end = (off + len - 1) >> blkbits;
	unsigned long irqflags;

	spin_lock_irqsave(&ifs->state_lock, irqflags);
	bitmap_set(ifs->state, start + dirty_base, end - start + 1);
	spin_unlock_irqrestore(&ifs->state_lock, irqflags);
}
|
|
|
|
|
|
|
|
/*
 * Set the per-block dirty bits for [@off, @off + @len) of @folio.  Does
 * nothing when the folio carries no per-block state.
 */
static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (!ifs)
		return;

	ifs_set_range_dirty(folio, ifs, off, len);
}
|
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
static struct iomap_folio_state *ifs_alloc(struct inode *inode,
|
|
|
|
struct folio *folio, unsigned int flags)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2023-07-10 21:11:19 +00:00
|
|
|
struct iomap_folio_state *ifs = folio->private;
|
2021-04-28 03:12:52 +00:00
|
|
|
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
|
2022-06-23 17:51:47 +00:00
|
|
|
gfp_t gfp;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
if (ifs || nr_blocks <= 1)
|
|
|
|
return ifs;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2022-06-23 17:51:47 +00:00
|
|
|
if (flags & IOMAP_NOWAIT)
|
|
|
|
gfp = GFP_NOWAIT;
|
|
|
|
else
|
|
|
|
gfp = GFP_NOFS | __GFP_NOFAIL;
|
|
|
|
|
2023-07-10 21:12:43 +00:00
|
|
|
/*
|
|
|
|
* ifs->state tracks two sets of state flags when the
|
|
|
|
* filesystem block size is smaller than the folio size.
|
|
|
|
* The first state tracks per-block uptodate and the
|
|
|
|
* second tracks per-block dirty state.
|
|
|
|
*/
|
|
|
|
ifs = kzalloc(struct_size(ifs, state,
|
|
|
|
BITS_TO_LONGS(2 * nr_blocks)), gfp);
|
|
|
|
if (!ifs)
|
|
|
|
return ifs;
|
|
|
|
|
|
|
|
spin_lock_init(&ifs->state_lock);
|
|
|
|
if (folio_test_uptodate(folio))
|
|
|
|
bitmap_set(ifs->state, 0, nr_blocks);
|
|
|
|
if (folio_test_dirty(folio))
|
|
|
|
bitmap_set(ifs->state, nr_blocks, nr_blocks);
|
|
|
|
folio_attach_private(folio, ifs);
|
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
return ifs;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
static void ifs_free(struct folio *folio)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2023-07-10 21:11:19 +00:00
|
|
|
struct iomap_folio_state *ifs = folio_detach_private(folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
if (!ifs)
|
2019-07-15 15:50:59 +00:00
|
|
|
return;
|
2023-10-04 16:53:02 +00:00
|
|
|
WARN_ON_ONCE(ifs->read_bytes_pending != 0);
|
2023-07-10 21:11:19 +00:00
|
|
|
WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
|
2023-07-10 21:12:22 +00:00
|
|
|
WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
|
2021-04-28 03:22:22 +00:00
|
|
|
folio_test_uptodate(folio));
|
2023-07-10 21:11:19 +00:00
|
|
|
kfree(ifs);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2021-04-28 12:20:48 +00:00
|
|
|
* Calculate the range inside the folio that we actually need to read.
|
2019-07-15 15:50:59 +00:00
|
|
|
*/
|
2021-04-28 12:20:48 +00:00
|
|
|
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
|
|
|
|
loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2023-07-10 21:11:19 +00:00
|
|
|
struct iomap_folio_state *ifs = folio->private;
|
2019-07-15 15:50:59 +00:00
|
|
|
loff_t orig_pos = *pos;
|
|
|
|
loff_t isize = i_size_read(inode);
|
|
|
|
unsigned block_bits = inode->i_blkbits;
|
|
|
|
unsigned block_size = (1 << block_bits);
|
2021-04-28 12:20:48 +00:00
|
|
|
size_t poff = offset_in_folio(folio, *pos);
|
|
|
|
size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
|
2024-05-07 08:55:42 +00:00
|
|
|
size_t orig_plen = plen;
|
2019-07-15 15:50:59 +00:00
|
|
|
unsigned first = poff >> block_bits;
|
|
|
|
unsigned last = (poff + plen - 1) >> block_bits;
|
|
|
|
|
|
|
|
/*
|
2021-08-02 21:46:31 +00:00
|
|
|
* If the block size is smaller than the page size, we need to check the
|
2019-07-15 15:50:59 +00:00
|
|
|
* per-block uptodate status and adjust the offset and length if needed
|
|
|
|
* to avoid reading in already uptodate ranges.
|
|
|
|
*/
|
2023-07-10 21:11:19 +00:00
|
|
|
if (ifs) {
|
2019-07-15 15:50:59 +00:00
|
|
|
unsigned int i;
|
|
|
|
|
|
|
|
/* move forward for each leading block marked uptodate */
|
|
|
|
for (i = first; i <= last; i++) {
|
2023-07-10 21:12:22 +00:00
|
|
|
if (!ifs_block_is_uptodate(ifs, i))
|
2019-07-15 15:50:59 +00:00
|
|
|
break;
|
|
|
|
*pos += block_size;
|
|
|
|
poff += block_size;
|
|
|
|
plen -= block_size;
|
|
|
|
first++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* truncate len if we find any trailing uptodate block(s) */
|
|
|
|
for ( ; i <= last; i++) {
|
2023-07-10 21:12:22 +00:00
|
|
|
if (ifs_block_is_uptodate(ifs, i)) {
|
2019-07-15 15:50:59 +00:00
|
|
|
plen -= (last - i + 1) * block_size;
|
|
|
|
last = i - 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2021-08-02 21:46:31 +00:00
|
|
|
* If the extent spans the block that contains the i_size, we need to
|
2019-07-15 15:50:59 +00:00
|
|
|
* handle both halves separately so that we properly zero data in the
|
|
|
|
* page cache for blocks that are entirely outside of i_size.
|
|
|
|
*/
|
2024-05-07 08:55:42 +00:00
|
|
|
if (orig_pos <= isize && orig_pos + orig_plen > isize) {
|
2021-04-28 12:20:48 +00:00
|
|
|
unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
|
|
|
if (first <= end && last > end)
|
|
|
|
plen -= (last - end) * block_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
*offp = poff;
|
|
|
|
*lenp = plen;
|
|
|
|
}
|
|
|
|
|
2023-10-04 16:53:02 +00:00
|
|
|
static void iomap_finish_folio_read(struct folio *folio, size_t off,
|
2021-01-01 21:53:26 +00:00
|
|
|
size_t len, int error)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2023-07-10 21:11:19 +00:00
|
|
|
struct iomap_folio_state *ifs = folio->private;
|
2023-10-04 16:53:02 +00:00
|
|
|
bool uptodate = !error;
|
|
|
|
bool finished = true;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2023-10-04 16:53:02 +00:00
|
|
|
if (ifs) {
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&ifs->state_lock, flags);
|
|
|
|
if (!error)
|
|
|
|
uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
|
|
|
|
ifs->read_bytes_pending -= len;
|
|
|
|
finished = !ifs->read_bytes_pending;
|
|
|
|
spin_unlock_irqrestore(&ifs->state_lock, flags);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
2023-10-04 16:53:02 +00:00
|
|
|
if (finished)
|
2023-10-04 16:53:06 +00:00
|
|
|
folio_end_read(folio, uptodate);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
2021-01-01 21:53:26 +00:00
|
|
|
static void iomap_read_end_io(struct bio *bio)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
|
|
|
int error = blk_status_to_errno(bio->bi_status);
|
2021-01-01 21:53:26 +00:00
|
|
|
struct folio_iter fi;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-01-01 21:53:26 +00:00
|
|
|
bio_for_each_folio_all(fi, bio)
|
|
|
|
iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
|
2019-07-15 15:50:59 +00:00
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct iomap_readpage_ctx {
|
2021-04-28 13:39:51 +00:00
|
|
|
struct folio *cur_folio;
|
|
|
|
bool cur_folio_in_bio;
|
2019-07-15 15:50:59 +00:00
|
|
|
struct bio *bio;
|
2020-06-02 04:47:34 +00:00
|
|
|
struct readahead_control *rac;
|
2019-07-15 15:50:59 +00:00
|
|
|
};
|
|
|
|
|
2021-11-24 18:15:47 +00:00
|
|
|
/**
|
|
|
|
* iomap_read_inline_data - copy inline data into the page cache
|
|
|
|
* @iter: iteration structure
|
2021-07-24 03:24:50 +00:00
|
|
|
* @folio: folio to copy to
|
2021-11-24 18:15:47 +00:00
|
|
|
*
|
2021-07-24 03:24:50 +00:00
|
|
|
* Copy the inline data in @iter into @folio and zero out the rest of the folio.
|
2021-11-24 18:15:47 +00:00
|
|
|
* Only a single IOMAP_INLINE extent is allowed at the end of each file.
|
|
|
|
* Returns zero for success to complete the read, or the usual negative errno.
|
|
|
|
*/
|
|
|
|
static int iomap_read_inline_data(const struct iomap_iter *iter,
|
2021-07-24 03:24:50 +00:00
|
|
|
struct folio *folio)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2021-08-11 01:33:16 +00:00
|
|
|
const struct iomap *iomap = iomap_iter_srcmap(iter);
|
2021-08-11 01:33:14 +00:00
|
|
|
size_t size = i_size_read(iter->inode) - iomap->offset;
|
2021-04-28 12:20:48 +00:00
|
|
|
size_t offset = offset_in_folio(folio, iomap->offset);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-07-24 03:24:50 +00:00
|
|
|
if (folio_test_uptodate(folio))
|
2021-11-24 18:15:47 +00:00
|
|
|
return 0;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-08-03 16:38:22 +00:00
|
|
|
if (WARN_ON_ONCE(size > iomap->length))
|
|
|
|
return -EIO;
|
2021-04-28 12:20:48 +00:00
|
|
|
if (offset > 0)
|
2023-07-10 21:12:21 +00:00
|
|
|
ifs_alloc(iter->inode, folio, iter->flags);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2023-11-07 21:26:41 +00:00
|
|
|
folio_fill_tail(folio, offset, iomap->inline_data, size);
|
|
|
|
iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
|
2021-11-24 18:15:47 +00:00
|
|
|
return 0;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
2021-08-11 01:33:16 +00:00
|
|
|
static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
|
2021-08-11 01:33:14 +00:00
|
|
|
loff_t pos)
|
2019-10-17 20:12:12 +00:00
|
|
|
{
|
2021-08-11 01:33:16 +00:00
|
|
|
const struct iomap *srcmap = iomap_iter_srcmap(iter);
|
2021-08-11 01:33:14 +00:00
|
|
|
|
|
|
|
return srcmap->type != IOMAP_MAPPED ||
|
|
|
|
(srcmap->flags & IOMAP_F_NEW) ||
|
|
|
|
pos >= i_size_read(iter->inode);
|
2019-10-17 20:12:12 +00:00
|
|
|
}
|
|
|
|
|
2021-08-11 01:33:16 +00:00
|
|
|
/*
 * Handle the part of ctx->cur_folio covered by the current mapping, starting
 * @offset bytes into the iteration.
 *
 * Inline extents are copied directly.  Otherwise the range is trimmed to what
 * still needs data and is then either zeroed in place or queued on ctx->bio.
 *
 * Returns the number of bytes the caller may advance by, or the result of
 * iomap_read_inline_data() for an inline extent.
 */
static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx, loff_t offset)
{
	const struct iomap *iomap = &iter->iomap;
	struct folio *folio = ctx->cur_folio;
	loff_t start = iter->pos + offset;
	loff_t remaining = iomap_length(iter) - offset;
	loff_t pos = start;
	struct iomap_folio_state *ifs;
	size_t poff, plen;
	sector_t sector;
	gfp_t base_gfp, gfp;
	unsigned int nr_vecs;

	if (iomap->type == IOMAP_INLINE)
		return iomap_read_inline_data(iter, folio);

	ifs = ifs_alloc(iter->inode, folio, iter->flags);
	iomap_adjust_read_range(iter->inode, folio, &pos, remaining, &poff,
				&plen);
	if (!plen)
		goto out;

	/* zero post-eof blocks as the page may be mapped */
	if (iomap_block_needs_zeroing(iter, pos)) {
		folio_zero_range(folio, poff, plen);
		iomap_set_range_uptodate(folio, poff, plen);
		goto out;
	}

	/* From here on the folio is completed by the bio's end_io handler. */
	ctx->cur_folio_in_bio = true;
	if (ifs) {
		spin_lock_irq(&ifs->state_lock);
		ifs->read_bytes_pending += plen;
		spin_unlock_irq(&ifs->state_lock);
	}

	sector = iomap_sector(iomap, pos);

	/* Extend the bio under construction when the read continues it. */
	if (ctx->bio && bio_end_sector(ctx->bio) == sector &&
	    bio_add_folio(ctx->bio, folio, plen, poff))
		goto out;

	/* Otherwise send off what has been built and start a new bio. */
	base_gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
	gfp = base_gfp;
	nr_vecs = DIV_ROUND_UP(remaining, PAGE_SIZE);

	if (ctx->bio)
		submit_bio(ctx->bio);

	if (ctx->rac)	/* same as readahead_gfp_mask */
		gfp |= __GFP_NORETRY | __GFP_NOWARN;
	ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ,
			     gfp);

	/*
	 * If that allocation failed, retry for a single page with the
	 * unrelaxed mask, which avoids having to deal with partial page
	 * reads.  This emulates what do_mpage_read_folio does.
	 */
	if (!ctx->bio)
		ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, base_gfp);

	if (ctx->rac)
		ctx->bio->bi_opf |= REQ_RAHEAD;
	ctx->bio->bi_iter.bi_sector = sector;
	ctx->bio->bi_end_io = iomap_read_end_io;
	bio_add_folio_nofail(ctx->bio, folio, plen, poff);

out:
	/*
	 * Move the caller beyond our range so that it keeps making progress.
	 * That has to include any leading range that was skipped because it
	 * was already uptodate; trailing ranges are left for the next
	 * iteration.
	 */
	return pos - start + plen;
}
|
|
|
|
|
2024-05-07 08:55:43 +00:00
|
|
|
static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
|
|
|
|
struct iomap_readpage_ctx *ctx)
|
|
|
|
{
|
|
|
|
struct folio *folio = ctx->cur_folio;
|
|
|
|
size_t offset = offset_in_folio(folio, iter->pos);
|
|
|
|
loff_t length = min_t(loff_t, folio_size(folio) - offset,
|
|
|
|
iomap_length(iter));
|
|
|
|
loff_t done, ret;
|
|
|
|
|
|
|
|
for (done = 0; done < length; done += ret) {
|
|
|
|
ret = iomap_readpage_iter(iter, ctx, done);
|
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return done;
|
|
|
|
}
|
|
|
|
|
2022-04-29 12:54:32 +00:00
|
|
|
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2021-08-11 01:33:08 +00:00
|
|
|
struct iomap_iter iter = {
|
2021-04-28 13:39:51 +00:00
|
|
|
.inode = folio->mapping->host,
|
|
|
|
.pos = folio_pos(folio),
|
|
|
|
.len = folio_size(folio),
|
2021-08-11 01:33:08 +00:00
|
|
|
};
|
|
|
|
struct iomap_readpage_ctx ctx = {
|
2021-04-28 13:39:51 +00:00
|
|
|
.cur_folio = folio,
|
2021-08-11 01:33:08 +00:00
|
|
|
};
|
|
|
|
int ret;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-04-28 13:39:51 +00:00
|
|
|
trace_iomap_readpage(iter.inode, 1);
|
2019-10-17 20:12:13 +00:00
|
|
|
|
2021-08-11 01:33:08 +00:00
|
|
|
while ((ret = iomap_iter(&iter, ops)) > 0)
|
2024-05-07 08:55:43 +00:00
|
|
|
iter.processed = iomap_read_folio_iter(&iter, &ctx);
|
2021-08-11 01:33:08 +00:00
|
|
|
|
2019-07-15 15:50:59 +00:00
|
|
|
if (ctx.bio) {
|
|
|
|
submit_bio(ctx.bio);
|
2021-04-28 13:39:51 +00:00
|
|
|
WARN_ON_ONCE(!ctx.cur_folio_in_bio);
|
2019-07-15 15:50:59 +00:00
|
|
|
} else {
|
2021-04-28 13:39:51 +00:00
|
|
|
WARN_ON_ONCE(ctx.cur_folio_in_bio);
|
|
|
|
folio_unlock(folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2022-04-29 14:40:40 +00:00
|
|
|
* Just like mpage_readahead and block_read_full_folio, we always
|
2022-04-29 12:54:32 +00:00
|
|
|
* return 0 and just set the folio error flag on errors. This
|
2021-08-02 21:46:31 +00:00
|
|
|
* should be cleaned up throughout the stack eventually.
|
2019-07-15 15:50:59 +00:00
|
|
|
*/
|
|
|
|
return 0;
|
|
|
|
}
|
2022-04-29 12:54:32 +00:00
|
|
|
EXPORT_SYMBOL_GPL(iomap_read_folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-08-11 01:33:16 +00:00
|
|
|
static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
|
2021-08-11 01:33:08 +00:00
|
|
|
struct iomap_readpage_ctx *ctx)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2021-08-11 01:33:08 +00:00
|
|
|
loff_t length = iomap_length(iter);
|
2019-07-15 15:50:59 +00:00
|
|
|
loff_t done, ret;
|
|
|
|
|
|
|
|
for (done = 0; done < length; done += ret) {
|
2021-04-28 13:39:51 +00:00
|
|
|
if (ctx->cur_folio &&
|
|
|
|
offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
|
|
|
|
if (!ctx->cur_folio_in_bio)
|
|
|
|
folio_unlock(ctx->cur_folio);
|
|
|
|
ctx->cur_folio = NULL;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
2021-04-28 13:39:51 +00:00
|
|
|
if (!ctx->cur_folio) {
|
|
|
|
ctx->cur_folio = readahead_folio(ctx->rac);
|
|
|
|
ctx->cur_folio_in_bio = false;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
2021-08-11 01:33:08 +00:00
|
|
|
ret = iomap_readpage_iter(iter, ctx, done);
|
2021-11-18 01:59:01 +00:00
|
|
|
if (ret <= 0)
|
|
|
|
return ret;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return done;
|
|
|
|
}
|
|
|
|
|
2020-06-02 04:47:34 +00:00
|
|
|
/**
|
|
|
|
* iomap_readahead - Attempt to read pages from a file.
|
|
|
|
* @rac: Describes the pages to be read.
|
|
|
|
* @ops: The operations vector for the filesystem.
|
|
|
|
*
|
|
|
|
* This function is for filesystems to call to implement their readahead
|
|
|
|
* address_space operation.
|
|
|
|
*
|
|
|
|
* Context: The @ops callbacks may submit I/O (eg to read the addresses of
|
|
|
|
* blocks from disc), and may wait for it. The caller may be trying to
|
|
|
|
* access a different page, and so sleeping excessively should be avoided.
|
|
|
|
* It may allocate memory, but should avoid costly allocations. This
|
|
|
|
* function is called with memalloc_nofs set, so allocations will not cause
|
|
|
|
* the filesystem to be reentered.
|
|
|
|
*/
|
|
|
|
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2021-08-11 01:33:08 +00:00
|
|
|
struct iomap_iter iter = {
|
|
|
|
.inode = rac->mapping->host,
|
|
|
|
.pos = readahead_pos(rac),
|
|
|
|
.len = readahead_length(rac),
|
|
|
|
};
|
2019-07-15 15:50:59 +00:00
|
|
|
struct iomap_readpage_ctx ctx = {
|
2020-06-02 04:47:34 +00:00
|
|
|
.rac = rac,
|
2019-07-15 15:50:59 +00:00
|
|
|
};
|
|
|
|
|
2021-08-11 01:33:08 +00:00
|
|
|
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
|
2019-10-17 20:12:13 +00:00
|
|
|
|
2021-08-11 01:33:08 +00:00
|
|
|
while (iomap_iter(&iter, ops) > 0)
|
|
|
|
iter.processed = iomap_readahead_iter(&iter, &ctx);
|
2020-06-02 04:47:34 +00:00
|
|
|
|
2019-07-15 15:50:59 +00:00
|
|
|
if (ctx.bio)
|
|
|
|
submit_bio(ctx.bio);
|
2021-04-28 13:39:51 +00:00
|
|
|
if (ctx.cur_folio) {
|
|
|
|
if (!ctx.cur_folio_in_bio)
|
|
|
|
folio_unlock(ctx.cur_folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
}
|
2020-06-02 04:47:34 +00:00
|
|
|
EXPORT_SYMBOL_GPL(iomap_readahead);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
|
|
|
/*
|
2022-02-09 20:21:27 +00:00
|
|
|
* iomap_is_partially_uptodate checks whether blocks within a folio are
|
2019-07-15 15:50:59 +00:00
|
|
|
* uptodate or not.
|
|
|
|
*
|
2022-02-09 20:21:27 +00:00
|
|
|
* Returns true if all blocks which correspond to the specified part
|
|
|
|
* of the folio are uptodate.
|
2019-07-15 15:50:59 +00:00
|
|
|
*/
|
2022-02-09 20:21:27 +00:00
|
|
|
bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2023-07-10 21:11:19 +00:00
|
|
|
struct iomap_folio_state *ifs = folio->private;
|
2022-02-09 20:21:27 +00:00
|
|
|
struct inode *inode = folio->mapping->host;
|
|
|
|
unsigned first, last, i;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
if (!ifs)
|
2022-02-09 20:21:27 +00:00
|
|
|
return false;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2022-03-04 19:05:23 +00:00
|
|
|
/* Caller's range may extend past the end of this folio */
|
|
|
|
count = min(folio_size(folio) - from, count);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2022-03-04 19:05:23 +00:00
|
|
|
/* First and last blocks in range within folio */
|
2019-07-15 15:50:59 +00:00
|
|
|
first = from >> inode->i_blkbits;
|
2022-03-04 19:05:23 +00:00
|
|
|
last = (from + count - 1) >> inode->i_blkbits;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2022-02-09 20:21:27 +00:00
|
|
|
for (i = first; i <= last; i++)
|
2023-07-10 21:12:22 +00:00
|
|
|
if (!ifs_block_is_uptodate(ifs, i))
|
2022-02-09 20:21:27 +00:00
|
|
|
return false;
|
|
|
|
return true;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
|
|
|
|
|
2023-01-15 16:49:04 +00:00
|
|
|
/**
|
|
|
|
* iomap_get_folio - get a folio reference for writing
|
|
|
|
* @iter: iteration structure
|
|
|
|
* @pos: start offset of write
|
2023-05-19 20:18:05 +00:00
|
|
|
* @len: Suggested size of folio to create.
|
2023-01-15 16:49:04 +00:00
|
|
|
*
|
|
|
|
* Returns a locked reference to the folio at @pos, or an error pointer if the
|
|
|
|
* folio could not be obtained.
|
|
|
|
*/
|
2023-05-19 20:18:05 +00:00
|
|
|
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
|
2023-01-15 16:49:04 +00:00
|
|
|
{
|
2023-05-26 20:43:23 +00:00
|
|
|
fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;
|
2023-01-15 16:49:04 +00:00
|
|
|
|
|
|
|
if (iter->flags & IOMAP_NOWAIT)
|
|
|
|
fgp |= FGP_NOWAIT;
|
2023-05-19 20:18:05 +00:00
|
|
|
fgp |= fgf_set_order(len);
|
2023-01-15 16:49:04 +00:00
|
|
|
|
2023-03-07 14:34:10 +00:00
|
|
|
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
|
2023-01-15 16:49:04 +00:00
|
|
|
fgp, mapping_gfp_mask(iter->inode->i_mapping));
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_get_folio);
|
|
|
|
|
2022-05-01 03:01:08 +00:00
|
|
|
/*
 * Free the iomap state attached to @folio so that the folio can be released.
 * Returns false (leaving the state in place) for a dirty folio and true once
 * the state has been freed.  @gfp_flags is not used here.
 */
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	struct inode *inode = folio->mapping->host;

	trace_iomap_release_folio(inode, folio_pos(folio), folio_size(folio));

	/*
	 * A dirty folio keeps its metadata: it may be only partially dirty.
	 * Once per-block dirty state is tracked, the metadata could be
	 * released when every block is dirty.
	 */
	if (!folio_test_dirty(folio)) {
		ifs_free(folio);
		return true;
	}

	return false;
}
EXPORT_SYMBOL_GPL(iomap_release_folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-04-28 11:51:36 +00:00
|
|
|
/*
 * Invalidate @len bytes of @folio starting at @offset.  Only an invalidation
 * that covers the entire folio changes any state here; a partial one is
 * traced and otherwise ignored.
 */
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
	trace_iomap_invalidate_folio(folio->mapping->host,
					folio_pos(folio) + offset, len);

	/* Partial invalidation: nothing more to do. */
	if (offset != 0 || len != folio_size(folio))
		return;

	/*
	 * The whole folio is going away: drop its dirty state and release the
	 * iomap folio state to avoid unnecessary buildup of the LRU.
	 */
	WARN_ON_ONCE(folio_test_writeback(folio));
	folio_cancel_dirty(folio);
	ifs_free(folio);
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
|
|
|
|
|
2023-07-10 21:12:43 +00:00
|
|
|
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
|
|
|
|
{
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
size_t len = folio_size(folio);
|
|
|
|
|
|
|
|
ifs_alloc(inode, folio, 0);
|
|
|
|
iomap_set_range_dirty(folio, 0, len);
|
|
|
|
return filemap_dirty_folio(mapping, folio);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_dirty_folio);
|
|
|
|
|
2019-07-15 15:50:59 +00:00
|
|
|
static void
|
|
|
|
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
|
|
|
|
{
|
|
|
|
loff_t i_size = i_size_read(inode);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only truncate newly allocated pages beyoned EOF, even if the
|
|
|
|
* write started inside the existing inode size.
|
|
|
|
*/
|
|
|
|
if (pos + len > i_size)
|
2022-05-06 01:19:13 +00:00
|
|
|
truncate_pagecache_range(inode, max(pos, i_size),
|
|
|
|
pos + len - 1);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
2021-04-28 12:20:48 +00:00
|
|
|
/*
 * Synchronously read @plen bytes into @folio at offset @poff.  The data comes
 * from @iomap->bdev, at the sector that iomap_sector() reports for
 * @block_start.  Uses an on-stack bio with a single bio_vec and returns the
 * result of submit_bio_wait().
 */
static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
		size_t poff, size_t plen, const struct iomap *iomap)
{
	struct bio_vec vec;
	struct bio read_bio;

	bio_init(&read_bio, iomap->bdev, &vec, 1, REQ_OP_READ);
	read_bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
	bio_add_folio_nofail(&read_bio, folio, plen, poff);

	return submit_bio_wait(&read_bio);
}
|
|
|
|
|
2021-08-11 01:33:16 +00:00
|
|
|
/*
 * Get @folio ready for a buffered write (or zeroing/unshare) of @len bytes at
 * file offset @pos.
 *
 * The block-aligned range around the write is walked in the pieces handed
 * back by iomap_adjust_read_range().  Each piece is either skipped, zeroed,
 * or read synchronously from the source mapping; pieces that were zeroed or
 * read are then marked uptodate.
 *
 * Returns 0 on success, -EAGAIN when IOMAP_NOWAIT is set and either no folio
 * state could be obtained for a multi-block folio or a read would be needed,
 * -EIO when an IOMAP_UNSHARE request hits a block that needs zeroing, or the
 * error returned by the synchronous read.
 */
static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
		size_t len, struct folio *folio)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	struct iomap_folio_state *ifs;
	loff_t block_size = i_blocksize(iter->inode);
	loff_t block_start = round_down(pos, block_size);
	loff_t block_end = round_up(pos + len, block_size);
	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
	/* [from, to) is the byte span of the write, relative to the folio */
	size_t from = offset_in_folio(folio, pos), to = from + len;
	size_t poff, plen;

	/*
	 * If the write or zeroing completely overlaps the current folio, then
	 * entire folio will be dirtied so there is no need for
	 * per-block state tracking structures to be attached to this folio.
	 * For the unshare case, we must read in the ondisk contents because we
	 * are not changing pagecache contents.
	 */
	if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
	    pos + len >= folio_pos(folio) + folio_size(folio))
		return 0;

	/*
	 * A NULL result is only treated as a failure for a nowait request on
	 * a folio that spans more than one block.
	 */
	ifs = ifs_alloc(iter->inode, folio, iter->flags);
	if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
		return -EAGAIN;

	/* Everything in the folio is already valid: nothing to read or zero */
	if (folio_test_uptodate(folio))
		return 0;

	do {
		/* May move block_start; poff/plen describe the next piece */
		iomap_adjust_read_range(iter->inode, folio, &block_start,
				block_end - block_start, &poff, &plen);
		if (plen == 0)
			break;

		/*
		 * Unless unsharing, skip a piece when neither end of the
		 * write falls strictly inside it.  "continue" still runs the
		 * loop condition, so block_start advances, but the piece is
		 * not marked uptodate here.
		 */
		if (!(iter->flags & IOMAP_UNSHARE) &&
		    (from <= poff || from >= poff + plen) &&
		    (to <= poff || to >= poff + plen))
			continue;

		if (iomap_block_needs_zeroing(iter, block_start)) {
			/* Unshare must never find a block that needs zeroing */
			if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
				return -EIO;
			folio_zero_segments(folio, poff, from, to, poff + plen);
		} else {
			int status;

			/* The read below blocks, so a nowait caller bails out */
			if (iter->flags & IOMAP_NOWAIT)
				return -EAGAIN;

			status = iomap_read_folio_sync(block_start, folio,
					poff, plen, srcmap);
			if (status)
				return status;
		}
		iomap_set_range_uptodate(folio, poff, plen);
	} while ((block_start += plen) < block_end);

	return 0;
}
|
|
|
|
|
2023-01-15 16:50:02 +00:00
|
|
|
static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
|
|
|
|
size_t len)
|
|
|
|
{
|
2023-01-15 16:50:44 +00:00
|
|
|
const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
|
2023-01-15 16:50:02 +00:00
|
|
|
|
2023-01-15 16:50:44 +00:00
|
|
|
if (folio_ops && folio_ops->get_folio)
|
|
|
|
return folio_ops->get_folio(iter, pos, len);
|
2023-01-15 16:50:02 +00:00
|
|
|
else
|
2023-05-19 20:18:05 +00:00
|
|
|
return iomap_get_folio(iter, pos, len);
|
2023-01-15 16:50:02 +00:00
|
|
|
}
|
|
|
|
|
2023-01-15 16:45:50 +00:00
|
|
|
/*
 * Give back a folio obtained through __iomap_get_folio().  When the mapping's
 * folio_ops provide ->put_folio, that method is called with @ret and is the
 * only thing done here; otherwise the folio is unlocked and its reference
 * dropped.
 */
static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
		struct folio *folio)
{
	const struct iomap_folio_ops *ops = iter->iomap.folio_ops;

	if (ops && ops->put_folio) {
		ops->put_folio(iter->inode, pos, ret, folio);
		return;
	}

	folio_unlock(folio);
	folio_put(folio);
}
|
|
|
|
|
2021-08-11 01:33:16 +00:00
|
|
|
static int iomap_write_begin_inline(const struct iomap_iter *iter,
|
2021-05-02 15:33:08 +00:00
|
|
|
struct folio *folio)
|
2021-08-03 16:38:22 +00:00
|
|
|
{
|
|
|
|
/* needs more work for the tailpacking case; disable for now */
|
2021-08-11 01:33:14 +00:00
|
|
|
if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
|
2021-08-03 16:38:22 +00:00
|
|
|
return -EIO;
|
2021-07-24 03:24:50 +00:00
|
|
|
return iomap_read_inline_data(iter, folio);
|
2021-08-03 16:38:22 +00:00
|
|
|
}
|
|
|
|
|
2022-11-28 22:09:17 +00:00
|
|
|
/*
 * Look up and prepare the folio for a write of @len bytes at @pos.
 *
 * On success returns 0 and stores the locked folio in @foliop.  There is one
 * exception: if the filesystem's ->iomap_valid callback reports the cached
 * mapping as stale, IOMAP_F_STALE is set on the iomap, the folio is put back,
 * and 0 is returned without writing @foliop.  The caller must therefore check
 * that flag.
 *
 * On failure returns a negative errno with no folio held: -EINTR when a fatal
 * signal is pending, the lookup error from __iomap_get_folio(), or the error
 * from the type-specific write-begin helper.
 */
static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
		size_t len, struct folio **foliop)
{
	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	struct folio *folio;
	int status = 0;

	/* The write must lie inside the mapping(s) we were handed */
	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
	if (srcmap != &iter->iomap)
		BUG_ON(pos + len > srcmap->offset + srcmap->length);

	if (fatal_signal_pending(current))
		return -EINTR;

	/* Without large folio support, never cross a page boundary */
	if (!mapping_large_folio_support(iter->inode->i_mapping))
		len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));

	folio = __iomap_get_folio(iter, pos, len);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/*
	 * Now we have a locked folio, before we do anything with it we need to
	 * check that the iomap we have cached is not stale. The inode extent
	 * mapping can change due to concurrent IO in flight (e.g.
	 * IOMAP_UNWRITTEN state can change and memory reclaim could have
	 * reclaimed a previously partially written page at this index after IO
	 * completion before this write reaches this file offset) and hence we
	 * could do the wrong thing here (zero a page range incorrectly or fail
	 * to zero) and corrupt data.
	 */
	if (folio_ops && folio_ops->iomap_valid) {
		bool iomap_valid = folio_ops->iomap_valid(iter->inode,
							 &iter->iomap);
		if (!iomap_valid) {
			iter->iomap.flags |= IOMAP_F_STALE;
			status = 0;
			goto out_unlock;
		}
	}

	/* Trim the write so that it ends no later than the folio does */
	if (pos + len > folio_pos(folio) + folio_size(folio))
		len = folio_pos(folio) + folio_size(folio) - pos;

	/* Dispatch on the kind of source mapping */
	if (srcmap->type == IOMAP_INLINE)
		status = iomap_write_begin_inline(iter, folio);
	else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
		status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
	else
		status = __iomap_write_begin(iter, pos, len, folio);

	if (unlikely(status))
		goto out_unlock;

	*foliop = folio;
	return 0;

out_unlock:
	/* Nothing was written, hence the 0 passed as the byte count */
	__iomap_put_folio(iter, pos, 0, folio);

	return status;
}
|
|
|
|
|
2024-03-20 11:05:47 +00:00
|
|
|
/*
 * Finish a write of @copied bytes (out of @len requested) at @pos into
 * @folio.
 *
 * Returns false, changing no uptodate or dirty state, when the copy came up
 * short and the folio is not uptodate.  Otherwise marks the range uptodate,
 * marks the copied bytes dirty, dirties the folio and returns true.
 */
static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
		size_t copied, struct folio *folio)
{
	size_t off = offset_in_folio(folio, pos);
	bool short_copy = copied < len;

	flush_dcache_folio(folio);

	/*
	 * Fully written blocks become uptodate, so a later read_folio cannot
	 * overwrite them.  A block that was only partially written by a short
	 * copy would not be marked uptodate, and a read_folio could then
	 * destroy the partial write.
	 *
	 * Keep it simple: a short write into a folio that is not uptodate
	 * counts as a zero-length write, and the caller has to redo it.
	 */
	if (unlikely(short_copy && !folio_test_uptodate(folio)))
		return false;

	iomap_set_range_uptodate(folio, off, len);
	iomap_set_range_dirty(folio, off, copied);
	filemap_dirty_folio(inode->i_mapping, folio);

	return true;
}
|
|
|
|
|
2024-03-20 11:05:47 +00:00
|
|
|
/*
 * Finish a write to an inline-data mapping: copy the @copied bytes at @pos
 * from @folio back into the mapping's inline data area and mark the inode
 * dirty.  The folio is expected to be uptodate (warns once otherwise) and the
 * inline data must pass iomap_inline_data_valid().
 */
static void iomap_write_end_inline(const struct iomap_iter *iter,
		struct folio *folio, loff_t pos, size_t copied)
{
	const struct iomap *iomap = &iter->iomap;
	void *dst;
	void *src;

	WARN_ON_ONCE(!folio_test_uptodate(folio));
	BUG_ON(!iomap_inline_data_valid(iomap));

	flush_dcache_folio(folio);

	/* Map the folio only for the duration of the copy. */
	dst = iomap_inline_data(iomap, pos);
	src = kmap_local_folio(folio, pos);
	memcpy(dst, src, copied);
	kunmap_local(src);

	mark_inode_dirty(iter->inode);
}
|
|
|
|
|
2024-03-20 11:05:47 +00:00
|
|
|
/*
|
|
|
|
* Returns true if all copied bytes have been written to the pagecache,
|
|
|
|
* otherwise return false.
|
|
|
|
*/
|
|
|
|
static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
|
2021-05-02 15:33:08 +00:00
|
|
|
size_t copied, struct folio *folio)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2021-08-11 01:33:16 +00:00
|
|
|
const struct iomap *srcmap = iomap_iter_srcmap(iter);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2024-03-20 11:05:47 +00:00
|
|
|
if (srcmap->type == IOMAP_INLINE) {
|
|
|
|
iomap_write_end_inline(iter, folio, pos, copied);
|
2024-06-18 14:21:12 +00:00
|
|
|
return true;
|
2024-03-20 11:05:47 +00:00
|
|
|
}
|
|
|
|
|
2024-06-18 14:21:12 +00:00
|
|
|
if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
|
|
|
|
size_t bh_written;
|
2024-03-20 11:05:47 +00:00
|
|
|
|
2024-06-18 14:21:12 +00:00
|
|
|
bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
|
2024-07-10 18:51:11 +00:00
|
|
|
len, copied, folio, NULL);
|
2024-06-18 14:21:12 +00:00
|
|
|
WARN_ON_ONCE(bh_written != copied && bh_written != 0);
|
|
|
|
return bh_written == copied;
|
|
|
|
}
|
2024-06-03 11:22:22 +00:00
|
|
|
|
2024-06-18 14:21:12 +00:00
|
|
|
return __iomap_write_end(iter->inode, pos, len, copied, folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
2021-08-11 01:33:08 +00:00
|
|
|
/*
 * Copy data from @i into the page cache for the mapping currently held by
 * @iter, one folio at a time, until either the iov_iter or the mapping length
 * is exhausted, an error occurs, or the mapping is found to be stale.
 *
 * Returns the number of bytes written if that is non-zero, otherwise the last
 * status (0 or a negative errno).  -EAGAIN is special: the iov_iter is wound
 * back by everything written in this call and -EAGAIN is returned even if
 * some bytes had been written.
 */
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
	loff_t length = iomap_length(iter);
	loff_t pos = iter->pos;
	ssize_t total_written = 0;
	long status = 0;
	struct address_space *mapping = iter->inode->i_mapping;
	/* Upper bound for one copy; halved below after a rejected copy */
	size_t chunk = mapping_max_folio_size(mapping);
	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;

	do {
		struct folio *folio;
		loff_t old_size;
		size_t offset;		/* Offset into folio */
		size_t bytes;		/* Bytes to write to folio */
		size_t copied;		/* Bytes copied from user */
		size_t written;		/* Bytes have been written */

		bytes = iov_iter_count(i);
retry:
		/*
		 * Entered from above with the full remaining count, or from
		 * the short-copy path below with bytes already cut down to
		 * what the previous attempt managed to copy.
		 */
		offset = pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);
		status = balance_dirty_pages_ratelimited_flags(mapping,
							       bdp_flags);
		if (unlikely(status))
			break;

		/* Never write past the end of the current mapping */
		if (bytes > length)
			bytes = length;

		/*
		 * Bring in the user page that we'll copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * For async buffered writes the assumption is that the user
		 * page has already been faulted in. This can be optimized by
		 * faulting the user page.
		 */
		/*
		 * NOTE(review): the "== bytes" test implies the helper returns
		 * the amount it could NOT fault in -- confirm against its
		 * kernel-doc.
		 */
		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(iter, pos, bytes, &folio);
		if (unlikely(status)) {
			iomap_write_failed(iter->inode, pos, bytes);
			break;
		}
		/*
		 * iomap_write_begin() returns 0 without a folio when it
		 * flagged the mapping stale; stop so it can be remapped.
		 */
		if (iter->iomap.flags & IOMAP_F_STALE)
			break;

		/* Re-derive offset and length against the folio we got */
		offset = offset_in_folio(folio, pos);
		if (bytes > folio_size(folio) - offset)
			bytes = folio_size(folio) - offset;

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		/* A rejected write counts as zero bytes, whatever was copied */
		written = iomap_write_end(iter, pos, bytes, copied, folio) ?
			  copied : 0;

		/*
		 * Update the in-memory inode size after copying the data into
		 * the page cache.  It's up to the file system to write the
		 * updated size to disk, preferably after I/O completion so that
		 * no stale data is exposed.  Only once that's done can we
		 * unlock and release the folio.
		 */
		old_size = iter->inode->i_size;
		if (pos + written > old_size) {
			i_size_write(iter->inode, pos + written);
			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
		}
		__iomap_put_folio(iter, pos, written, folio);

		/* The write started beyond the old EOF */
		if (old_size < pos)
			pagecache_isize_extended(iter->inode, old_size, pos);

		cond_resched();
		if (unlikely(written == 0)) {
			/*
			 * A short copy made iomap_write_end() reject the
			 * thing entirely.  Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			iomap_write_failed(iter->inode, pos, bytes);
			/* Hand the consumed bytes back to the iov_iter */
			iov_iter_revert(i, copied);

			if (chunk > PAGE_SIZE)
				chunk /= 2;
			/*
			 * Retry with exactly what could be copied.  With
			 * nothing copied, fall through to the loop condition
			 * and start over from the full remaining count.
			 */
			if (copied) {
				bytes = copied;
				goto retry;
			}
		} else {
			pos += written;
			total_written += written;
			length -= written;
		}
	} while (iov_iter_count(i) && length);

	if (status == -EAGAIN) {
		iov_iter_revert(i, total_written);
		return -EAGAIN;
	}
	return total_written ? total_written : status;
}
|
|
|
|
|
|
|
|
ssize_t
|
2021-08-11 01:33:08 +00:00
|
|
|
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
|
2024-08-27 10:51:36 +00:00
|
|
|
const struct iomap_ops *ops, void *private)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2021-08-11 01:33:08 +00:00
|
|
|
struct iomap_iter iter = {
|
|
|
|
.inode = iocb->ki_filp->f_mapping->host,
|
|
|
|
.pos = iocb->ki_pos,
|
|
|
|
.len = iov_iter_count(i),
|
|
|
|
.flags = IOMAP_WRITE,
|
2024-08-27 10:51:36 +00:00
|
|
|
.private = private,
|
2021-08-11 01:33:08 +00:00
|
|
|
};
|
2023-06-01 14:58:59 +00:00
|
|
|
ssize_t ret;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2022-06-23 17:51:48 +00:00
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT)
|
|
|
|
iter.flags |= IOMAP_NOWAIT;
|
|
|
|
|
2021-08-11 01:33:08 +00:00
|
|
|
while ((ret = iomap_iter(&iter, ops)) > 0)
|
|
|
|
iter.processed = iomap_write_iter(&iter, i);
|
2023-06-01 14:58:59 +00:00
|
|
|
|
2023-07-17 15:49:57 +00:00
|
|
|
if (unlikely(iter.pos == iocb->ki_pos))
|
2021-08-11 01:33:08 +00:00
|
|
|
return ret;
|
2023-06-01 14:58:59 +00:00
|
|
|
ret = iter.pos - iocb->ki_pos;
|
2023-07-17 15:49:57 +00:00
|
|
|
iocb->ki_pos = iter.pos;
|
2023-06-01 14:58:59 +00:00
|
|
|
return ret;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
|
|
|
|
|
2024-09-10 04:39:07 +00:00
|
|
|
/*
 * Call @punch once for every block of @folio within
 * [@start_byte, @end_byte) that the iomap folio state does not record as
 * dirty.  The range is clipped to the end of the folio.  Does nothing when
 * the folio has no iomap folio state attached.
 */
static void iomap_write_delalloc_ifs_punch(struct inode *inode,
		struct folio *folio, loff_t start_byte, loff_t end_byte,
		struct iomap *iomap, iomap_punch_t punch)
{
	struct iomap_folio_state *ifs = folio->private;
	u8 blkbits = inode->i_blkbits;
	unsigned int first_blk, last_blk, i;
	loff_t last_byte;

	/*
	 * With per-block dirty tracking a folio can hold blocks that are
	 * uptodate but not dirty.  Those have to be punched out so that no
	 * delalloc blocks are leaked.
	 */
	if (!ifs)
		return;

	/* Last byte to look at: end of range or end of folio, if sooner. */
	last_byte = min_t(loff_t, end_byte - 1,
			folio_pos(folio) + folio_size(folio) - 1);
	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
	last_blk = offset_in_folio(folio, last_byte) >> blkbits;

	for (i = first_blk; i <= last_blk; i++) {
		if (ifs_block_is_dirty(folio, ifs, i))
			continue;

		punch(inode, folio_pos(folio) + (i << blkbits),
			    1 << blkbits, iomap);
	}
}
|
|
|
|
|
2024-09-10 04:39:07 +00:00
|
|
|
/*
 * Handle one folio while punching out a delalloc range.  A folio that is not
 * dirty is ignored.  For a dirty folio:
 *  - punch everything from *@punch_start_byte up to @start_byte,
 *  - punch the non-dirty blocks inside the folio,
 *  - advance *@punch_start_byte past the part of the range just handled.
 */
static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
		struct iomap *iomap, iomap_punch_t punch)
{
	loff_t gap_start = *punch_start_byte;
	loff_t folio_end;

	if (!folio_test_dirty(folio))
		return;

	/* Dirty folio: punch the gap that precedes it. */
	if (start_byte > gap_start)
		punch(inode, gap_start, start_byte - gap_start, iomap);

	/* Punch non-dirty blocks within folio */
	iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte,
			iomap, punch);

	/*
	 * The next punch must start at the end of this data range, which is
	 * not necessarily the end of the folio.
	 */
	folio_end = folio_pos(folio) + folio_size(folio);
	*punch_start_byte = min_t(loff_t, end_byte, folio_end);
}
|
|
|
|
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
/*
|
|
|
|
* Scan the data range passed to us for dirty page cache folios. If we find a
|
2023-09-28 16:26:58 +00:00
|
|
|
* dirty folio, punch out the preceding range and update the offset from which
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
* the next punch will start from.
|
|
|
|
*
|
|
|
|
* We can punch out storage reservations under clean pages because they either
|
|
|
|
* contain data that has been written back - in which case the delalloc punch
|
|
|
|
* over that range is a no-op - or they have been read faults in which case they
|
|
|
|
* contain zeroes and we can remove the delalloc backing range and any new
|
|
|
|
* writes to those pages will do the normal hole filling operation...
|
|
|
|
*
|
|
|
|
* This makes the logic simple: we only need to keep the delalloc extents only
|
|
|
|
* over the dirty ranges of the page cache.
|
|
|
|
*
|
|
|
|
* This function uses [start_byte, end_byte) intervals (i.e. open ended) to
|
|
|
|
* simplify range iterations.
|
|
|
|
*/
|
2024-09-10 04:39:07 +00:00
|
|
|
static void iomap_write_delalloc_scan(struct inode *inode,
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
|
2024-09-10 04:39:06 +00:00
|
|
|
struct iomap *iomap, iomap_punch_t punch)
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
{
|
|
|
|
while (start_byte < end_byte) {
|
|
|
|
struct folio *folio;
|
|
|
|
|
|
|
|
/* grab locked page */
|
|
|
|
folio = filemap_lock_folio(inode->i_mapping,
|
|
|
|
start_byte >> PAGE_SHIFT);
|
2023-03-07 14:34:10 +00:00
|
|
|
if (IS_ERR(folio)) {
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
|
|
|
|
PAGE_SIZE;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2024-09-10 04:39:07 +00:00
|
|
|
iomap_write_delalloc_punch(inode, folio, punch_start_byte,
|
2024-09-10 04:39:06 +00:00
|
|
|
start_byte, end_byte, iomap, punch);
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
|
|
|
|
/* move offset to start of next folio in range */
|
|
|
|
start_byte = folio_next_index(folio) << PAGE_SHIFT;
|
|
|
|
folio_unlock(folio);
|
|
|
|
folio_put(folio);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2024-10-08 08:59:13 +00:00
|
|
|
* When a short write occurs, the filesystem might need to use ->iomap_end
|
|
|
|
* to remove space reservations created in ->iomap_begin.
|
|
|
|
*
|
|
|
|
* For filesystems that use delayed allocation, there can be dirty pages over
|
|
|
|
* the delalloc extent outside the range of a short write but still within the
|
|
|
|
* delalloc extent allocated for this iomap if the write raced with page
|
|
|
|
* faults.
|
|
|
|
*
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
* Punch out all the delalloc blocks in the range given except for those that
|
|
|
|
* have dirty data still pending in the page cache - those are going to be
|
|
|
|
* written and so must still retain the delalloc backing for writeback.
|
|
|
|
*
|
2024-10-08 08:59:13 +00:00
|
|
|
* The punch() callback *must* only punch delalloc extents in the range passed
|
|
|
|
* to it. It must skip over all other types of extents in the range and leave
|
|
|
|
* them completely unchanged. It must do this punch atomically with respect to
|
|
|
|
* other extent modifications.
|
|
|
|
*
|
|
|
|
* The punch() callback may be called with a folio locked to prevent writeback
|
|
|
|
* extent allocation racing at the edge of the range we are currently punching.
|
|
|
|
* The locked folio may or may not cover the range being punched, so it is not
|
|
|
|
* safe for the punch() callback to lock folios itself.
|
|
|
|
*
|
|
|
|
* Lock order is:
|
|
|
|
*
|
|
|
|
* inode->i_rwsem (shared or exclusive)
|
|
|
|
* inode->i_mapping->invalidate_lock (exclusive)
|
|
|
|
* folio_lock()
|
|
|
|
* ->punch
|
|
|
|
* internal filesystem allocation lock
|
|
|
|
*
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
* As we are scanning the page cache for data, we don't need to reimplement the
|
|
|
|
* wheel - mapping_seek_hole_data() does exactly what we need to identify the
|
|
|
|
* start and end of data ranges correctly even for sub-folio block sizes. This
|
|
|
|
* byte range based iteration is especially convenient because it means we
|
|
|
|
* don't have to care about variable size folios, nor where the start or end of
|
|
|
|
* the data range lies within a folio, if they lie within the same folio or even
|
|
|
|
* if there are multiple discontiguous data ranges within the folio.
|
|
|
|
*
|
|
|
|
* It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
|
|
|
|
* can return data ranges that exist in the cache beyond EOF. e.g. a page fault
|
|
|
|
* spanning EOF will initialise the post-EOF data to zeroes and mark it up to
|
|
|
|
* date. A write page fault can then mark it dirty. If we then fail a write()
|
|
|
|
* beyond EOF into that up to date cached range, we allocate a delalloc block
|
|
|
|
* beyond EOF and then have to punch it out. Because the range is up to date,
|
|
|
|
* mapping_seek_hole_data() will return it, and we will skip the punch because
|
|
|
|
* the folio is dirty. This is incorrect - we always need to punch out delalloc
|
|
|
|
* beyond EOF in this case as writeback will never write back and convert that
|
|
|
|
* delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
|
|
|
|
* resulting in always punching out the range from the EOF to the end of the
|
|
|
|
* range the iomap spans.
|
|
|
|
*
|
|
|
|
* Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
|
|
|
|
* matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
|
|
|
|
* returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
|
|
|
|
* returns the end of the data range (data_end). Using closed intervals would
|
|
|
|
* require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
|
|
|
|
* the code to subtle off-by-one bugs....
|
|
|
|
*/
|
2024-10-08 08:59:13 +00:00
|
|
|
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
|
2024-09-10 04:39:06 +00:00
|
|
|
loff_t end_byte, unsigned flags, struct iomap *iomap,
|
|
|
|
iomap_punch_t punch)
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
{
|
|
|
|
loff_t punch_start_byte = start_byte;
|
|
|
|
loff_t scan_end_byte = min(i_size_read(inode), end_byte);
|
|
|
|
|
|
|
|
/*
|
2024-10-08 08:59:14 +00:00
|
|
|
* The caller must hold invalidate_lock to avoid races with page faults
|
|
|
|
* re-instantiating folios and dirtying them via ->page_mkwrite whilst
|
|
|
|
* we walk the cache and perform delalloc extent removal. Failing to do
|
|
|
|
* this can leave dirty pages with no space reservation in the cache.
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
*/
|
2024-10-08 08:59:14 +00:00
|
|
|
lockdep_assert_held_write(&inode->i_mapping->invalidate_lock);
|
|
|
|
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
while (start_byte < scan_end_byte) {
|
|
|
|
loff_t data_end;
|
|
|
|
|
|
|
|
start_byte = mapping_seek_hole_data(inode->i_mapping,
|
|
|
|
start_byte, scan_end_byte, SEEK_DATA);
|
|
|
|
/*
|
|
|
|
* If there is no more data to scan, all that is left is to
|
|
|
|
* punch out the remaining range.
|
2024-09-10 04:39:07 +00:00
|
|
|
*
|
|
|
|
* Note that mapping_seek_hole_data is only supposed to return
|
|
|
|
* either an offset or -ENXIO, so WARN on any other error as
|
|
|
|
* that would be an API change without updating the callers.
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
*/
|
|
|
|
if (start_byte == -ENXIO || start_byte == scan_end_byte)
|
|
|
|
break;
|
2024-09-10 04:39:07 +00:00
|
|
|
if (WARN_ON_ONCE(start_byte < 0))
|
2024-10-08 08:59:14 +00:00
|
|
|
return;
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
WARN_ON_ONCE(start_byte < punch_start_byte);
|
|
|
|
WARN_ON_ONCE(start_byte > scan_end_byte);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We find the end of this contiguous cached data range by
|
|
|
|
* seeking from start_byte to the beginning of the next hole.
|
|
|
|
*/
|
|
|
|
data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
|
|
|
|
scan_end_byte, SEEK_HOLE);
|
2024-09-10 04:39:07 +00:00
|
|
|
if (WARN_ON_ONCE(data_end < 0))
|
2024-10-08 08:59:14 +00:00
|
|
|
return;
|
2024-09-10 04:39:03 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we race with post-direct I/O invalidation of the page cache,
|
|
|
|
* there might be no data left at start_byte.
|
|
|
|
*/
|
|
|
|
if (data_end == start_byte)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON_ONCE(data_end < start_byte);
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
WARN_ON_ONCE(data_end > scan_end_byte);
|
|
|
|
|
2024-09-10 04:39:07 +00:00
|
|
|
iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte,
|
|
|
|
data_end, iomap, punch);
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
|
|
|
|
/* The next data search starts at the end of this one. */
|
|
|
|
start_byte = data_end;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (punch_start_byte < end_byte)
|
2024-09-10 04:39:07 +00:00
|
|
|
punch(inode, punch_start_byte, end_byte - punch_start_byte,
|
|
|
|
iomap);
|
iomap: buffered write failure should not truncate the page cache
iomap_file_buffered_write_punch_delalloc() currently invalidates the
page cache over the unused range of the delalloc extent that was
allocated. While the write allocated the delalloc extent, it does
not own it exclusively as the write does not hold any locks that
prevent either writeback or mmap page faults from changing the state
of either the page cache or the extent state backing this range.
Whilst xfs_bmap_punch_delalloc_range() already handles races in
extent conversion - it will only punch out delalloc extents and it
ignores any other type of extent - the page cache truncate does not
discriminate between data written by this write or some other task.
As a result, truncating the page cache can result in data corruption
if the write races with mmap modifications to the file over the same
range.
generic/346 exercises this workload, and if we randomly fail writes
(as will happen when iomap gets stale iomap detection later in the
patchset), it will randomly corrupt the file data because it removes
data written by mmap() in the same page as the write() that failed.
Hence we do not want to punch out the page cache over the range of
the extent we failed to write to - what we actually need to do is
detect the ranges that have dirty data in cache over them and *not
punch them out*.
To do this, we have to walk the page cache over the range of the
delalloc extent we want to remove. This is made complex by the fact
we have to handle partially up-to-date folios correctly and this can
happen even when the FSB size == PAGE_SIZE because we now support
multi-page folios in the page cache.
Because we are only interested in discovering the edges of data
ranges in the page cache (i.e. hole-data boundaries) we can make use
of mapping_seek_hole_data() to find those transitions in the page
cache. As we hold the invalidate_lock, we know that the boundaries
are not going to change while we walk the range. This interface is
also byte-based and is sub-page block aware, so we can find the data
ranges in the cache based on byte offsets rather than page, folio or
fs block sized chunks. This greatly simplifies the logic of finding
dirty cached ranges in the page cache.
Once we've identified a range that contains cached data, we can then
iterate the range folio by folio. This allows us to determine if the
data is dirty and hence perform the correct delalloc extent punching
operations. The seek interface we use to iterate data ranges will
give us sub-folio start/end granularity, so we may end up looking up
the same folio multiple times as the seek interface iterates across
each discontiguous data region in the folio.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-28 22:09:11 +00:00
|
|
|
}
|
2024-10-08 08:59:13 +00:00
|
|
|
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
|
2022-11-23 01:44:38 +00:00
|
|
|
|
2021-08-11 01:33:09 +00:00
|
|
|
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2021-08-11 01:33:09 +00:00
|
|
|
struct iomap *iomap = &iter->iomap;
|
|
|
|
loff_t pos = iter->pos;
|
|
|
|
loff_t length = iomap_length(iter);
|
2020-06-09 03:58:29 +00:00
|
|
|
loff_t written = 0;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2024-10-03 15:09:16 +00:00
|
|
|
if (!iomap_want_unshare_iter(iter))
|
2019-10-18 23:41:34 +00:00
|
|
|
return length;
|
|
|
|
|
2019-07-15 15:50:59 +00:00
|
|
|
do {
|
2021-05-02 15:33:08 +00:00
|
|
|
struct folio *folio;
|
2023-09-18 22:57:40 +00:00
|
|
|
int status;
|
|
|
|
size_t offset;
|
|
|
|
size_t bytes = min_t(u64, SIZE_MAX, length);
|
2024-03-20 11:05:47 +00:00
|
|
|
bool ret;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-05-02 15:33:08 +00:00
|
|
|
status = iomap_write_begin(iter, pos, bytes, &folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
if (unlikely(status))
|
|
|
|
return status;
|
2023-09-18 22:57:40 +00:00
|
|
|
if (iomap->flags & IOMAP_F_STALE)
|
2022-11-28 22:09:17 +00:00
|
|
|
break;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2023-09-18 22:57:40 +00:00
|
|
|
offset = offset_in_folio(folio, pos);
|
|
|
|
if (bytes > folio_size(folio) - offset)
|
|
|
|
bytes = folio_size(folio) - offset;
|
|
|
|
|
2024-03-20 11:05:47 +00:00
|
|
|
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
|
2024-06-18 14:21:12 +00:00
|
|
|
__iomap_put_folio(iter, pos, bytes, folio);
|
2024-03-20 11:05:47 +00:00
|
|
|
if (WARN_ON_ONCE(!ret))
|
2020-09-21 15:58:41 +00:00
|
|
|
return -EIO;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
|
2023-09-18 22:57:40 +00:00
|
|
|
pos += bytes;
|
|
|
|
written += bytes;
|
|
|
|
length -= bytes;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-08-11 01:33:09 +00:00
|
|
|
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
|
2023-09-18 22:57:40 +00:00
|
|
|
} while (length > 0);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
|
|
|
return written;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2019-10-18 23:41:34 +00:00
|
|
|
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
|
2019-07-15 15:50:59 +00:00
|
|
|
const struct iomap_ops *ops)
|
|
|
|
{
|
2021-08-11 01:33:09 +00:00
|
|
|
struct iomap_iter iter = {
|
|
|
|
.inode = inode,
|
|
|
|
.pos = pos,
|
2021-08-11 01:33:14 +00:00
|
|
|
.flags = IOMAP_WRITE | IOMAP_UNSHARE,
|
2021-08-11 01:33:09 +00:00
|
|
|
};
|
2024-10-02 15:02:13 +00:00
|
|
|
loff_t size = i_size_read(inode);
|
2021-08-11 01:33:09 +00:00
|
|
|
int ret;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2024-10-02 15:02:13 +00:00
|
|
|
if (pos < 0 || pos >= size)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
iter.len = min(len, size - pos);
|
2021-08-11 01:33:09 +00:00
|
|
|
while ((ret = iomap_iter(&iter, ops)) > 0)
|
|
|
|
iter.processed = iomap_unshare_iter(&iter);
|
|
|
|
return ret;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
2019-10-18 23:41:34 +00:00
|
|
|
EXPORT_SYMBOL_GPL(iomap_file_unshare);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2024-08-30 14:56:34 +00:00
|
|
|
/*
|
|
|
|
* Flush the remaining range of the iter and mark the current mapping stale.
|
|
|
|
* This is used when zero range sees an unwritten mapping that may have had
|
|
|
|
* dirty pagecache over it.
|
|
|
|
*/
|
|
|
|
static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
|
|
|
|
{
|
|
|
|
struct address_space *mapping = i->inode->i_mapping;
|
|
|
|
loff_t end = i->pos + i->len - 1;
|
|
|
|
|
|
|
|
i->iomap.flags |= IOMAP_F_STALE;
|
|
|
|
return filemap_write_and_wait_range(mapping, i->pos, end);
|
|
|
|
}
|
|
|
|
|
2024-11-15 20:01:54 +00:00
|
|
|
/*
 * Zero the range covered by the iter's current mapping through the page
 * cache, one folio at a time.
 *
 * Returns the number of bytes zeroed.  A failure from iomap_write_begin() is
 * returned as is; a rejected iomap_write_end() yields -EIO.  The loop stops
 * early, returning what has been zeroed so far, if the mapping is flagged
 * IOMAP_F_STALE after iomap_write_begin().
 *
 * If @did_zero is non-NULL it is set to true whenever the loop finishes
 * without returning an error.
 */
static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	loff_t pos = iter->pos;
	loff_t length = iomap_length(iter);
	loff_t written = 0;

	do {
		struct folio *folio;
		int status;
		size_t offset;
		size_t bytes = min_t(u64, SIZE_MAX, length);
		bool ret;

		status = iomap_write_begin(iter, pos, bytes, &folio);
		if (status)
			return status;
		if (iter->iomap.flags & IOMAP_F_STALE)
			break;

		/*
		 * Warn about zeroing folios beyond EOF that won't write back.
		 * Sample the size with i_size_read(), as the rest of this file
		 * does, so the check cannot act on a torn 64-bit value on
		 * 32-bit SMP/preemptible kernels.
		 */
		WARN_ON_ONCE(folio_pos(folio) > i_size_read(iter->inode));

		/* Never zero beyond the end of the folio we were given. */
		offset = offset_in_folio(folio, pos);
		if (bytes > folio_size(folio) - offset)
			bytes = folio_size(folio) - offset;

		folio_zero_range(folio, offset, bytes);
		folio_mark_accessed(folio);

		ret = iomap_write_end(iter, pos, bytes, bytes, folio);
		__iomap_put_folio(iter, pos, bytes, folio);
		if (WARN_ON_ONCE(!ret))
			return -EIO;

		pos += bytes;
		length -= bytes;
		written += bytes;
	} while (length > 0);

	if (did_zero)
		*did_zero = true;
	return written;
}
|
|
|
|
|
|
|
|
int
|
|
|
|
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
|
|
|
|
const struct iomap_ops *ops)
|
|
|
|
{
|
2021-08-11 01:33:09 +00:00
|
|
|
struct iomap_iter iter = {
|
|
|
|
.inode = inode,
|
|
|
|
.pos = pos,
|
|
|
|
.len = len,
|
|
|
|
.flags = IOMAP_ZERO,
|
|
|
|
};
|
2024-11-15 20:01:55 +00:00
|
|
|
struct address_space *mapping = inode->i_mapping;
|
|
|
|
unsigned int blocksize = i_blocksize(inode);
|
|
|
|
unsigned int off = pos & (blocksize - 1);
|
|
|
|
loff_t plen = min_t(loff_t, len, blocksize - off);
|
2021-08-11 01:33:09 +00:00
|
|
|
int ret;
|
2024-08-30 14:56:34 +00:00
|
|
|
bool range_dirty;
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2024-08-30 14:56:33 +00:00
|
|
|
/*
|
2024-11-15 20:01:54 +00:00
|
|
|
* Zero range can skip mappings that are zero on disk so long as
|
|
|
|
* pagecache is clean. If pagecache was dirty prior to zero range, the
|
|
|
|
* mapping converts on writeback completion and so must be zeroed.
|
2024-08-30 14:56:34 +00:00
|
|
|
*
|
2024-11-15 20:01:54 +00:00
|
|
|
* The simplest way to deal with this across a range is to flush
|
2024-11-15 20:01:55 +00:00
|
|
|
* pagecache and process the updated mappings. To avoid excessive
|
|
|
|
* flushing on partial eof zeroing, special case it to zero the
|
|
|
|
* unaligned start portion if already dirty in pagecache.
|
|
|
|
*/
|
|
|
|
if (off &&
|
|
|
|
filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
|
|
|
|
iter.len = plen;
|
|
|
|
while ((ret = iomap_iter(&iter, ops)) > 0)
|
|
|
|
iter.processed = iomap_zero_iter(&iter, did_zero);
|
|
|
|
|
|
|
|
iter.len = len - (iter.pos - pos);
|
|
|
|
if (ret || !iter.len)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* To avoid an unconditional flush, check pagecache state and only flush
|
|
|
|
* if dirty and the fs returns a mapping that might convert on
|
|
|
|
* writeback.
|
2024-08-30 14:56:33 +00:00
|
|
|
*/
|
2024-08-30 14:56:34 +00:00
|
|
|
range_dirty = filemap_range_needs_writeback(inode->i_mapping,
|
2024-11-15 20:01:55 +00:00
|
|
|
iter.pos, iter.pos + iter.len - 1);
|
2024-11-15 20:01:54 +00:00
|
|
|
while ((ret = iomap_iter(&iter, ops)) > 0) {
|
|
|
|
const struct iomap *srcmap = iomap_iter_srcmap(&iter);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2024-11-15 20:01:54 +00:00
|
|
|
if (srcmap->type == IOMAP_HOLE ||
|
|
|
|
srcmap->type == IOMAP_UNWRITTEN) {
|
|
|
|
loff_t proc = iomap_length(&iter);
|
|
|
|
|
|
|
|
if (range_dirty) {
|
|
|
|
range_dirty = false;
|
|
|
|
proc = iomap_zero_iter_flush_and_stale(&iter);
|
|
|
|
}
|
|
|
|
iter.processed = proc;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
iter.processed = iomap_zero_iter(&iter, did_zero);
|
|
|
|
}
|
2021-08-11 01:33:09 +00:00
|
|
|
return ret;
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_zero_range);
|
|
|
|
|
|
|
|
int
|
|
|
|
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
|
|
|
|
const struct iomap_ops *ops)
|
|
|
|
{
|
|
|
|
unsigned int blocksize = i_blocksize(inode);
|
|
|
|
unsigned int off = pos & (blocksize - 1);
|
|
|
|
|
|
|
|
/* Block boundary? Nothing to do */
|
|
|
|
if (!off)
|
|
|
|
return 0;
|
|
|
|
return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_truncate_page);
|
|
|
|
|
2021-04-29 02:32:02 +00:00
|
|
|
static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
|
|
|
|
struct folio *folio)
|
2019-07-15 15:50:59 +00:00
|
|
|
{
|
2021-08-11 01:33:09 +00:00
|
|
|
loff_t length = iomap_length(iter);
|
2019-07-15 15:50:59 +00:00
|
|
|
int ret;
|
|
|
|
|
2021-08-11 01:33:09 +00:00
|
|
|
if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) {
|
2021-11-03 18:05:47 +00:00
|
|
|
ret = __block_write_begin_int(folio, iter->pos, length, NULL,
|
2021-08-11 01:33:09 +00:00
|
|
|
&iter->iomap);
|
2019-07-15 15:50:59 +00:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2021-04-29 02:32:02 +00:00
|
|
|
block_commit_write(&folio->page, 0, length);
|
2019-07-15 15:50:59 +00:00
|
|
|
} else {
|
2021-04-29 02:32:02 +00:00
|
|
|
WARN_ON_ONCE(!folio_test_uptodate(folio));
|
|
|
|
folio_mark_dirty(folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return length;
|
|
|
|
}
|
|
|
|
|
|
|
|
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
|
|
|
|
{
|
2021-08-11 01:33:09 +00:00
|
|
|
struct iomap_iter iter = {
|
|
|
|
.inode = file_inode(vmf->vma->vm_file),
|
|
|
|
.flags = IOMAP_WRITE | IOMAP_FAULT,
|
|
|
|
};
|
2021-04-29 02:32:02 +00:00
|
|
|
struct folio *folio = page_folio(vmf->page);
|
2019-07-15 15:50:59 +00:00
|
|
|
ssize_t ret;
|
|
|
|
|
2021-04-29 02:32:02 +00:00
|
|
|
folio_lock(folio);
|
|
|
|
ret = folio_mkwrite_check_truncate(folio, iter.inode);
|
2020-01-06 16:58:23 +00:00
|
|
|
if (ret < 0)
|
2019-07-15 15:50:59 +00:00
|
|
|
goto out_unlock;
|
2021-04-29 02:32:02 +00:00
|
|
|
iter.pos = folio_pos(folio);
|
2021-08-11 01:33:09 +00:00
|
|
|
iter.len = ret;
|
|
|
|
while ((ret = iomap_iter(&iter, ops)) > 0)
|
2021-04-29 02:32:02 +00:00
|
|
|
iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
|
2021-08-11 01:33:09 +00:00
|
|
|
if (ret < 0)
|
|
|
|
goto out_unlock;
|
2021-04-29 02:32:02 +00:00
|
|
|
folio_wait_stable(folio);
|
2019-07-15 15:50:59 +00:00
|
|
|
return VM_FAULT_LOCKED;
|
|
|
|
out_unlock:
|
2021-04-29 02:32:02 +00:00
|
|
|
folio_unlock(folio);
|
2023-08-01 17:21:57 +00:00
|
|
|
return vmf_fs_error(ret);
|
2019-07-15 15:50:59 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2021-01-01 21:53:26 +00:00
|
|
|
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
|
2023-12-07 07:27:06 +00:00
|
|
|
size_t len)
|
2019-10-17 20:12:15 +00:00
|
|
|
{
|
2023-07-10 21:11:19 +00:00
|
|
|
struct iomap_folio_state *ifs = folio->private;
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
|
|
|
|
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
|
2021-01-01 21:53:26 +00:00
|
|
|
folio_end_writeback(folio);
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're now finished for good with this ioend structure. Update the page
|
|
|
|
* state, release holds on bios, and finally free up memory. Do not use the
|
|
|
|
* ioend after this.
|
|
|
|
*/
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
can result in bio and ioend chaining to optimise the io
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the fact that
we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
static u32
|
2019-10-17 20:12:15 +00:00
|
|
|
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
|
|
|
|
{
|
|
|
|
struct inode *inode = ioend->io_inode;
|
2023-12-07 07:27:05 +00:00
|
|
|
struct bio *bio = &ioend->io_bio;
|
|
|
|
struct folio_iter fi;
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
can will result in bio and ioend chaining to optimise the io
patterns. The pages iunder writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontigous ioends gains us nothing except for hiding the fact the
fact we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
u32 folio_count = 0;
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-12-07 07:27:06 +00:00
|
|
|
if (error) {
|
|
|
|
mapping_set_error(inode->i_mapping, error);
|
|
|
|
if (!bio_flagged(bio, BIO_QUIET)) {
|
|
|
|
pr_err_ratelimited(
|
|
|
|
"%s: writeback error on inode %lu, offset %lld, sector %llu",
|
|
|
|
inode->i_sb->s_id, inode->i_ino,
|
|
|
|
ioend->io_offset, ioend->io_sector);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:05 +00:00
|
|
|
/* walk all folios in bio, ending page IO on them */
|
|
|
|
bio_for_each_folio_all(fi, bio) {
|
2023-12-07 07:27:06 +00:00
|
|
|
iomap_finish_folio_write(inode, fi.folio, fi.length);
|
2023-12-07 07:27:05 +00:00
|
|
|
folio_count++;
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:05 +00:00
|
|
|
bio_put(bio); /* frees the ioend */
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
can will result in bio and ioend chaining to optimise the io
patterns. The pages iunder writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontigous ioends gains us nothing except for hiding the fact the
fact we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
return folio_count;
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
can result in bio and ioend chaining to optimise the io
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the fact that
we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
/*
|
|
|
|
* Ioend completion routine for merged bios. This can only be called from task
|
|
|
|
* contexts as merged ioends can be of unbound length. Hence we have to break up
|
|
|
|
* the writeback completions into manageable chunks to avoid long scheduler
|
|
|
|
* holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
|
|
|
|
* good batch processing throughput without creating adverse scheduler latency
|
|
|
|
* conditions.
|
|
|
|
*/
|
2019-10-17 20:12:15 +00:00
|
|
|
void
|
|
|
|
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
|
|
|
|
{
|
|
|
|
struct list_head tmp;
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
will result in bio and ioend chaining to optimise the io
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the
fact we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
u32 completions;
|
|
|
|
|
|
|
|
might_sleep();
|
2019-10-17 20:12:15 +00:00
|
|
|
|
|
|
|
list_replace_init(&ioend->io_list, &tmp);
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
will result in bio and ioend chaining to optimise the io
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the
fact we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
completions = iomap_finish_ioend(ioend, error);
|
2019-10-17 20:12:15 +00:00
|
|
|
|
|
|
|
while (!list_empty(&tmp)) {
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
will result in bio and ioend chaining to optimise the io
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the
fact we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
if (completions > IOEND_BATCH_SIZE * 8) {
|
|
|
|
cond_resched();
|
|
|
|
completions = 0;
|
|
|
|
}
|
2019-10-17 20:12:15 +00:00
|
|
|
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
|
|
|
|
list_del_init(&ioend->io_list);
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
will result in bio and ioend chaining to optimise the io
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the
fact we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
completions += iomap_finish_ioend(ioend, error);
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can merge two adjacent ioends if they have the same set of work to do.
|
|
|
|
*/
|
|
|
|
static bool
|
|
|
|
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
|
|
|
|
{
|
2023-12-07 07:27:05 +00:00
|
|
|
if (ioend->io_bio.bi_status != next->io_bio.bi_status)
|
2019-10-17 20:12:15 +00:00
|
|
|
return false;
|
2024-11-04 04:19:16 +00:00
|
|
|
if (next->io_flags & IOMAP_F_BOUNDARY)
|
|
|
|
return false;
|
2019-10-17 20:12:15 +00:00
|
|
|
if ((ioend->io_flags & IOMAP_F_SHARED) ^
|
|
|
|
(next->io_flags & IOMAP_F_SHARED))
|
|
|
|
return false;
|
|
|
|
if ((ioend->io_type == IOMAP_UNWRITTEN) ^
|
|
|
|
(next->io_type == IOMAP_UNWRITTEN))
|
|
|
|
return false;
|
|
|
|
if (ioend->io_offset + ioend->io_size != next->io_offset)
|
|
|
|
return false;
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as both the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbound.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
will result in bio and ioend chaining to optimise the io
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the
fact we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
/*
|
|
|
|
* Do not merge physically discontiguous ioends. The filesystem
|
|
|
|
* completion functions will have to iterate the physical
|
|
|
|
* discontiguities even if we merge the ioends at a logical level, so
|
|
|
|
* we don't gain anything by merging physical discontiguities here.
|
|
|
|
*
|
|
|
|
* We cannot use bio->bi_iter.bi_sector here as it is modified during
|
|
|
|
* submission so does not point to the start sector of the bio at
|
|
|
|
* completion.
|
|
|
|
*/
|
|
|
|
if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
|
|
|
|
return false;
|
2019-10-17 20:12:15 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2021-05-04 15:54:29 +00:00
|
|
|
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
|
2019-10-17 20:12:15 +00:00
|
|
|
{
|
|
|
|
struct iomap_ioend *next;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&ioend->io_list);
|
|
|
|
|
|
|
|
while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
|
|
|
|
io_list))) {
|
|
|
|
if (!iomap_ioend_can_merge(ioend, next))
|
|
|
|
break;
|
|
|
|
list_move_tail(&next->io_list, &ioend->io_list);
|
|
|
|
ioend->io_size += next->io_size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
|
|
|
|
|
|
|
|
static int
|
2021-04-08 18:28:34 +00:00
|
|
|
iomap_ioend_compare(void *priv, const struct list_head *a,
|
|
|
|
const struct list_head *b)
|
2019-10-17 20:12:15 +00:00
|
|
|
{
|
2019-10-17 20:12:20 +00:00
|
|
|
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
|
|
|
|
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
|
2019-10-17 20:12:15 +00:00
|
|
|
|
|
|
|
if (ia->io_offset < ib->io_offset)
|
|
|
|
return -1;
|
2019-10-17 20:12:20 +00:00
|
|
|
if (ia->io_offset > ib->io_offset)
|
2019-10-17 20:12:15 +00:00
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iomap_sort_ioends(struct list_head *ioend_list)
|
|
|
|
{
|
|
|
|
list_sort(NULL, ioend_list, iomap_ioend_compare);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
|
|
|
|
|
|
|
|
static void iomap_writepage_end_bio(struct bio *bio)
|
|
|
|
{
|
2023-12-07 07:27:05 +00:00
|
|
|
iomap_finish_ioend(iomap_ioend_from_bio(bio),
|
|
|
|
blk_status_to_errno(bio->bi_status));
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Submit the final bio for an ioend.
|
|
|
|
*
|
|
|
|
* If @error is non-zero, it means that we have a situation where some part of
|
2023-12-07 07:27:08 +00:00
|
|
|
* the submission process has failed after we've marked pages for writeback.
|
|
|
|
* We cannot cancel ioend directly in that case, so call the bio end I/O handler
|
|
|
|
* with the error status here to run the normal I/O completion handler to clear
|
|
|
|
* the writeback bit and let the file system process the errors.
|
2019-10-17 20:12:15 +00:00
|
|
|
*/
|
2023-12-07 07:27:08 +00:00
|
|
|
static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
|
2019-10-17 20:12:15 +00:00
|
|
|
{
|
2023-12-07 07:27:08 +00:00
|
|
|
if (!wpc->ioend)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Let the file systems prepare the I/O submission and hook in an I/O
|
|
|
|
* completion handler. This also needs to happen even after a
|
|
|
|
* failure happened so that the file system end I/O handler gets called
|
|
|
|
* to clean up.
|
|
|
|
*/
|
2019-10-17 20:12:15 +00:00
|
|
|
if (wpc->ops->prepare_ioend)
|
2023-12-07 07:27:08 +00:00
|
|
|
error = wpc->ops->prepare_ioend(wpc->ioend, error);
|
|
|
|
|
2019-10-17 20:12:15 +00:00
|
|
|
if (error) {
|
2023-12-07 07:27:08 +00:00
|
|
|
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
|
|
|
|
bio_endio(&wpc->ioend->io_bio);
|
|
|
|
} else {
|
|
|
|
submit_bio(&wpc->ioend->io_bio);
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:08 +00:00
|
|
|
wpc->ioend = NULL;
|
|
|
|
return error;
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:04 +00:00
|
|
|
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
|
|
|
|
struct writeback_control *wbc, struct inode *inode, loff_t pos)
|
2019-10-17 20:12:15 +00:00
|
|
|
{
|
|
|
|
struct iomap_ioend *ioend;
|
|
|
|
struct bio *bio;
|
|
|
|
|
2022-01-24 09:11:03 +00:00
|
|
|
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
|
|
|
|
REQ_OP_WRITE | wbc_to_write_flags(wbc),
|
|
|
|
GFP_NOFS, &iomap_ioend_bioset);
|
2023-12-07 07:27:04 +00:00
|
|
|
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
|
2023-12-07 07:27:05 +00:00
|
|
|
bio->bi_end_io = iomap_writepage_end_bio;
|
2019-10-17 20:12:15 +00:00
|
|
|
wbc_init_bio(wbc, bio);
|
2024-03-04 17:35:21 +00:00
|
|
|
bio->bi_write_hint = inode->i_write_hint;
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-12-07 07:27:05 +00:00
|
|
|
ioend = iomap_ioend_from_bio(bio);
|
2019-10-17 20:12:15 +00:00
|
|
|
INIT_LIST_HEAD(&ioend->io_list);
|
|
|
|
ioend->io_type = wpc->iomap.type;
|
|
|
|
ioend->io_flags = wpc->iomap.flags;
|
2024-11-04 04:19:16 +00:00
|
|
|
if (pos > wpc->iomap.offset)
|
|
|
|
wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
|
2019-10-17 20:12:15 +00:00
|
|
|
ioend->io_inode = inode;
|
|
|
|
ioend->io_size = 0;
|
2023-12-07 07:27:04 +00:00
|
|
|
ioend->io_offset = pos;
|
|
|
|
ioend->io_sector = bio->bi_iter.bi_sector;
|
2023-12-07 07:26:59 +00:00
|
|
|
|
|
|
|
wpc->nr_folios = 0;
|
2019-10-17 20:12:15 +00:00
|
|
|
return ioend;
|
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:04 +00:00
|
|
|
/*
 * Decide whether the range starting at @pos may be appended to the ioend
 * currently cached in @wpc.  wpc->ioend is dereferenced unconditionally, so
 * the caller must have checked that it is non-NULL.
 *
 * Returns true only if the mapping in @wpc is compatible with the ioend
 * (same IOMAP_F_SHARED state and same type), @pos is contiguous with the
 * ioend both in file offset and in disk sector, no IOMAP_F_BOUNDARY split is
 * requested at @pos, and the ioend has not reached IOEND_BATCH_SIZE folios.
 */
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
{
	/* A boundary mapping must not be merged with what precedes it. */
	if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
		return false;
	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
		return false;
	if (wpc->iomap.type != wpc->ioend->io_type)
		return false;
	/* Must be logically contiguous with the end of the ioend ... */
	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
		return false;
	/* ... and physically contiguous with the end of its bio. */
	if (iomap_sector(&wpc->iomap, pos) !=
	    bio_end_sector(&wpc->ioend->io_bio))
		return false;
	/*
	 * Limit ioend bio chain lengths to minimise IO completion latency. This
	 * also prevents long tight loops ending page writeback on all the
	 * folios in the ioend.
	 */
	if (wpc->nr_folios >= IOEND_BATCH_SIZE)
		return false;
	return true;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Test to see if we have an existing ioend structure that we could append to
|
2021-08-02 21:46:31 +00:00
|
|
|
* first; otherwise finish off the current ioend and start another.
|
2023-12-07 07:27:08 +00:00
|
|
|
*
|
|
|
|
* If a new ioend is created and cached, the old ioend is submitted to the block
|
|
|
|
* layer instantly. Batching optimisations are provided by higher level block
|
|
|
|
* plugging.
|
|
|
|
*
|
|
|
|
* At the end of a writeback pass, there will be a cached ioend remaining on the
|
|
|
|
* writepage context that the caller will need to submit.
|
2019-10-17 20:12:15 +00:00
|
|
|
*/
|
2023-12-07 07:27:08 +00:00
|
|
|
static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
|
2023-12-07 07:27:03 +00:00
|
|
|
struct writeback_control *wbc, struct folio *folio,
|
iomap: pass byte granular end position to iomap_add_to_ioend
This is a preparatory patch for fixing zero padding issues in concurrent
append write scenarios. In the following patches, we need to obtain
byte-granular writeback end position for io_size trimming after EOF
handling.
Due to concurrent writeback and truncate operations, inode size may
shrink. Resampling inode size would force writeback code to handle the
newly appeared post-EOF blocks, which is undesirable. As Dave
explained in [1]:
"Really, the issue is that writeback mappings have to be able to
handle the range being mapped suddenly appear to be beyond EOF.
This behaviour is a longstanding writeback constraint, and is what
iomap_writepage_handle_eof() is attempting to handle.
We handle this by only sampling i_size_read() whilst we have the
folio locked and can determine the action we should take with that
folio (i.e. nothing, partial zeroing, or skip altogether). Once
we've made the decision that the folio is within EOF and taken
action on it (i.e. moved the folio to writeback state), we cannot
then resample the inode size because a truncate may have started
and changed the inode size."
To avoid resampling inode size after EOF handling, we convert end_pos
to byte-granular writeback position and return it from EOF handling
function.
Since iomap_set_range_dirty() can handle unaligned lengths, this
conversion has no impact on it. However, iomap_find_dirty_range()
requires aligned start and end range to find dirty blocks within the
given range, so the end position needs to be rounded up when passed
to it.
LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/
Signed-off-by: Long Li <leo.lilong@huawei.com>
Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-09 11:42:39 +00:00
|
|
|
struct inode *inode, loff_t pos, loff_t end_pos,
|
|
|
|
unsigned len)
|
2019-10-17 20:12:15 +00:00
|
|
|
{
|
2023-12-07 07:27:03 +00:00
|
|
|
struct iomap_folio_state *ifs = folio->private;
|
2021-11-02 16:45:12 +00:00
|
|
|
size_t poff = offset_in_folio(folio, pos);
|
2023-12-07 07:27:08 +00:00
|
|
|
int error;
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-12-07 07:27:04 +00:00
|
|
|
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
|
2023-12-07 07:27:05 +00:00
|
|
|
new_ioend:
|
2023-12-07 07:27:08 +00:00
|
|
|
error = iomap_submit_ioend(wpc, 0);
|
|
|
|
if (error)
|
|
|
|
return error;
|
2023-12-07 07:27:04 +00:00
|
|
|
wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:05 +00:00
|
|
|
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
|
|
|
|
goto new_ioend;
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-07-10 21:11:19 +00:00
|
|
|
if (ifs)
|
|
|
|
atomic_add(len, &ifs->write_bytes_pending);
|
iomap: fix zero padding data issue in concurrent append writes
During concurrent append writes to XFS filesystem, zero padding data
may appear in the file after power failure. This happens due to imprecise
disk size updates when handling write completion.
Consider this scenario with concurrent append writes same file:
Thread 1: Thread 2:
------------ -----------
write [A, A+B]
update inode size to A+B
submit I/O [A, A+BS]
write [A+B, A+B+C]
update inode size to A+B+C
<I/O completes, updates disk size to min(A+B+C, A+BS)>
<power failure>
After reboot:
1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C]
|< Block Size (BS) >|
|DDDDDDDDDDDDDDDD0000000000000000|
^ ^ ^
A A+B A+B+C
(EOF)
2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS]
|< Block Size (BS) >|< Block Size (BS) >|
|DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000|
^ ^ ^ ^
A A+B A+BS A+B+C
(EOF)
D = Valid Data
0 = Zero Padding
The issue stems from disk size being set to min(io_offset + io_size,
inode->i_size) at I/O completion. Since io_offset+io_size is block
size granularity, it may exceed the actual valid file data size. In
the case of concurrent append writes, inode->i_size may be larger
than the actual range of valid file data written to disk, leading to
inaccurate disk size updates.
This patch modifies the meaning of io_size to represent the size of
valid data within EOF in an ioend. If the ioend spans beyond i_size,
io_size will be trimmed to provide the file with more accurate size
information. This is particularly useful for on-disk size updates
at completion time.
After this change, ioends that span i_size will not grow or merge with
other ioends in concurrent scenarios. However, these cases that need
growth/merging rarely occur and it seems no noticeable performance impact.
Although rounding up io_size could enable ioend growth/merging in these
scenarios, we decided to keep the code simple after discussion [1].
Another benefit is that it makes the xfs_ioend_is_append() check more
accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio()
in certain scenarios, such as repeated writes at the file tail without
extending the file size.
Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com
Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this
Signed-off-by: Long Li <leo.lilong@huawei.com>
Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-09 11:42:40 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Clamp io_offset and io_size to the incore EOF so that ondisk
|
|
|
|
* file size updates in the ioend completion are byte-accurate.
|
|
|
|
* This avoids recovering files with zeroed tail regions when
|
|
|
|
* writeback races with appending writes:
|
|
|
|
*
|
|
|
|
* Thread 1: Thread 2:
|
|
|
|
* ------------ -----------
|
|
|
|
* write [A, A+B]
|
|
|
|
* update inode size to A+B
|
|
|
|
* submit I/O [A, A+BS]
|
|
|
|
* write [A+B, A+B+C]
|
|
|
|
* update inode size to A+B+C
|
|
|
|
* <I/O completes, updates disk size to min(A+B+C, A+BS)>
|
|
|
|
* <power failure>
|
|
|
|
*
|
|
|
|
* After reboot:
|
|
|
|
* 1) with A+B+C < A+BS, the file has zero padding in range
|
|
|
|
* [A+B, A+B+C]
|
|
|
|
*
|
|
|
|
* |< Block Size (BS) >|
|
|
|
|
* |DDDDDDDDDDDD0000000000000|
|
|
|
|
* ^ ^ ^
|
|
|
|
* A A+B A+B+C
|
|
|
|
* (EOF)
|
|
|
|
*
|
|
|
|
* 2) with A+B+C > A+BS, the file has zero padding in range
|
|
|
|
* [A+B, A+BS]
|
|
|
|
*
|
|
|
|
* |< Block Size (BS) >|< Block Size (BS) >|
|
|
|
|
* |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
|
|
|
|
* ^ ^ ^ ^
|
|
|
|
* A A+B A+BS A+B+C
|
|
|
|
* (EOF)
|
|
|
|
*
|
|
|
|
* D = Valid Data
|
|
|
|
* 0 = Zero Padding
|
|
|
|
*
|
|
|
|
* Note that this defeats the ability to chain the ioends of
|
|
|
|
* appending writes.
|
|
|
|
*/
|
2019-10-17 20:12:15 +00:00
|
|
|
wpc->ioend->io_size += len;
|
iomap: fix zero padding data issue in concurrent append writes
During concurrent append writes to XFS filesystem, zero padding data
may appear in the file after power failure. This happens due to imprecise
disk size updates when handling write completion.
Consider this scenario with concurrent append writes same file:
Thread 1: Thread 2:
------------ -----------
write [A, A+B]
update inode size to A+B
submit I/O [A, A+BS]
write [A+B, A+B+C]
update inode size to A+B+C
<I/O completes, updates disk size to min(A+B+C, A+BS)>
<power failure>
After reboot:
1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C]
|< Block Size (BS) >|
|DDDDDDDDDDDDDDDD0000000000000000|
^ ^ ^
A A+B A+B+C
(EOF)
2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS]
|< Block Size (BS) >|< Block Size (BS) >|
|DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000|
^ ^ ^ ^
A A+B A+BS A+B+C
(EOF)
D = Valid Data
0 = Zero Padding
The issue stems from disk size being set to min(io_offset + io_size,
inode->i_size) at I/O completion. Since io_offset+io_size is block
size granularity, it may exceed the actual valid file data size. In
the case of concurrent append writes, inode->i_size may be larger
than the actual range of valid file data written to disk, leading to
inaccurate disk size updates.
This patch modifies the meaning of io_size to represent the size of
valid data within EOF in an ioend. If the ioend spans beyond i_size,
io_size will be trimmed to provide the file with more accurate size
information. This is particularly useful for on-disk size updates
at completion time.
After this change, ioends that span i_size will not grow or merge with
other ioends in concurrent scenarios. However, these cases that need
growth/merging rarely occur and it seems no noticeable performance impact.
Although rounding up io_size could enable ioend growth/merging in these
scenarios, we decided to keep the code simple after discussion [1].
Another benefit is that it makes the xfs_ioend_is_append() check more
accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio()
in certain scenarios, such as repeated writes at the file tail without
extending the file size.
Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com
Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this
Signed-off-by: Long Li <leo.lilong@huawei.com>
Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-09 11:42:40 +00:00
|
|
|
if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
|
|
|
|
wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
|
|
|
|
|
2024-09-26 14:01:21 +00:00
|
|
|
wbc_account_cgroup_owner(wbc, folio, len);
|
2023-12-07 07:27:08 +00:00
|
|
|
return 0;
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:07 +00:00
|
|
|
/*
 * Map the dirty range [@pos, @pos + @dirty_len) of @folio and queue it for
 * writeback.
 *
 * The file system's ->map_blocks method is called repeatedly, since a single
 * mapping may cover only part of the range.  Each mapped piece that is neither
 * a hole nor inline data is handed to iomap_add_to_ioend(), and *@count is
 * incremented for every piece that was added successfully.  Holes are skipped;
 * an inline mapping is unexpected here and fails with -EIO.
 *
 * @end_pos is passed through unchanged to iomap_add_to_ioend().
 *
 * Returns 0 on success or a negative errno.  On error the file system's
 * ->discard_folio method, if provided, is called with the current position.
 */
static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, u64 pos, u64 end_pos,
		unsigned dirty_len, unsigned *count)
{
	int error;

	do {
		unsigned map_len;

		error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
		if (error)
			break;
		trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);

		/* Only process the part of the range this mapping covers. */
		map_len = min_t(u64, dirty_len,
			wpc->iomap.offset + wpc->iomap.length - pos);
		WARN_ON_ONCE(!folio->private && map_len < dirty_len);

		switch (wpc->iomap.type) {
		case IOMAP_INLINE:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		case IOMAP_HOLE:
			break;
		default:
			error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
					end_pos, map_len);
			if (!error)
				(*count)++;
			break;
		}
		dirty_len -= map_len;
		pos += map_len;
	} while (dirty_len && !error);

	/*
	 * We cannot cancel the ioend directly here on error.  We may have
	 * already set other pages under writeback and hence we have to run I/O
	 * completion to mark the error state of the pages under writeback
	 * appropriately.
	 *
	 * Just let the file system know what portion of the folio failed to
	 * map.
	 */
	if (error && wpc->ops->discard_folio)
		wpc->ops->discard_folio(folio, pos);
	return error;
}
|
|
|
|
|
2023-12-07 07:27:01 +00:00
|
|
|
/*
|
|
|
|
* Check interaction of the folio with the file end.
|
|
|
|
*
|
|
|
|
* If the folio is entirely beyond i_size, return false. If it straddles
|
|
|
|
* i_size, adjust end_pos and zero all data beyond i_size.
|
|
|
|
*/
|
|
|
|
static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
|
|
|
|
u64 *end_pos)
|
|
|
|
{
|
|
|
|
u64 isize = i_size_read(inode);
|
|
|
|
|
|
|
|
if (*end_pos > isize) {
|
|
|
|
size_t poff = offset_in_folio(folio, isize);
|
|
|
|
pgoff_t end_index = isize >> PAGE_SHIFT;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the folio is entirely ouside of i_size, skip it.
|
|
|
|
*
|
|
|
|
* This can happen due to a truncate operation that is in
|
|
|
|
* progress and in that case truncate will finish it off once
|
|
|
|
* we've dropped the folio lock.
|
|
|
|
*
|
|
|
|
* Note that the pgoff_t used for end_index is an unsigned long.
|
|
|
|
* If the given offset is greater than 16TB on a 32-bit system,
|
|
|
|
* then if we checked if the folio is fully outside i_size with
|
|
|
|
* "if (folio->index >= end_index + 1)", "end_index + 1" would
|
|
|
|
* overflow and evaluate to 0. Hence this folio would be
|
|
|
|
* redirtied and written out repeatedly, which would result in
|
|
|
|
* an infinite loop; the user program performing this operation
|
|
|
|
* would hang. Instead, we can detect this situation by
|
|
|
|
* checking if the folio is totally beyond i_size or if its
|
|
|
|
* offset is just equal to the EOF.
|
|
|
|
*/
|
|
|
|
if (folio->index > end_index ||
|
|
|
|
(folio->index == end_index && poff == 0))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The folio straddles i_size.
|
|
|
|
*
|
|
|
|
* It must be zeroed out on each and every writepage invocation
|
|
|
|
* because it may be mmapped:
|
|
|
|
*
|
|
|
|
* A file is mapped in multiples of the page size. For a
|
|
|
|
* file that is not a multiple of the page size, the
|
|
|
|
* remaining memory is zeroed when mapped, and writes to that
|
|
|
|
* region are not written out to the file.
|
|
|
|
*
|
iomap: pass byte granular end position to iomap_add_to_ioend
This is a preparatory patch for fixing zero padding issues in concurrent
append write scenarios. In the following patches, we need to obtain
byte-granular writeback end position for io_size trimming after EOF
handling.
Due to concurrent writeback and truncate operations, inode size may
shrink. Resampling inode size would force writeback code to handle the
newly appeared post-EOF blocks, which is undesirable. As Dave
explained in [1]:
"Really, the issue is that writeback mappings have to be able to
handle the range being mapped suddenly appear to be beyond EOF.
This behaviour is a longstanding writeback constraint, and is what
iomap_writepage_handle_eof() is attempting to handle.
We handle this by only sampling i_size_read() whilst we have the
folio locked and can determine the action we should take with that
folio (i.e. nothing, partial zeroing, or skip altogether). Once
we've made the decision that the folio is within EOF and taken
action on it (i.e. moved the folio to writeback state), we cannot
then resample the inode size because a truncate may have started
and changed the inode size."
To avoid resampling inode size after EOF handling, we convert end_pos
to byte-granular writeback position and return it from EOF handling
function.
Since iomap_set_range_dirty() can handle unaligned lengths, this
conversion has no impact on it. However, iomap_find_dirty_range()
requires aligned start and end range to find dirty blocks within the
given range, so the end position needs to be rounded up when passed
to it.
LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/
Signed-off-by: Long Li <leo.lilong@huawei.com>
Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-09 11:42:39 +00:00
|
|
|
* Also adjust the end_pos to the end of file and skip writeback
|
|
|
|
* for all blocks entirely beyond i_size.
|
2023-12-07 07:27:01 +00:00
|
|
|
*/
|
|
|
|
folio_zero_segment(folio, poff, folio_size(folio));
|
iomap: pass byte granular end position to iomap_add_to_ioend
This is a preparatory patch for fixing zero padding issues in concurrent
append write scenarios. In the following patches, we need to obtain
byte-granular writeback end position for io_size trimming after EOF
handling.
Due to concurrent writeback and truncate operations, inode size may
shrink. Resampling inode size would force writeback code to handle the
newly appeared post-EOF blocks, which is undesirable. As Dave
explained in [1]:
"Really, the issue is that writeback mappings have to be able to
handle the range being mapped suddenly appear to be beyond EOF.
This behaviour is a longstanding writeback constraint, and is what
iomap_writepage_handle_eof() is attempting to handle.
We handle this by only sampling i_size_read() whilst we have the
folio locked and can determine the action we should take with that
folio (i.e. nothing, partial zeroing, or skip altogether). Once
we've made the decision that the folio is within EOF and taken
action on it (i.e. moved the folio to writeback state), we cannot
then resample the inode size because a truncate may have started
and changed the inode size."
To avoid resampling inode size after EOF handling, we convert end_pos
to byte-granular writeback position and return it from EOF handling
function.
Since iomap_set_range_dirty() can handle unaligned lengths, this
conversion has no impact on it. However, iomap_find_dirty_range()
requires aligned start and end range to find dirty blocks within the
given range, so the end position needs to be rounded up when passed
to it.
LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/
Signed-off-by: Long Li <leo.lilong@huawei.com>
Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-09 11:42:39 +00:00
|
|
|
*end_pos = isize;
|
2023-12-07 07:27:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:02 +00:00
|
|
|
static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
|
|
|
|
struct writeback_control *wbc, struct folio *folio)
|
2019-10-17 20:12:15 +00:00
|
|
|
{
|
2023-07-10 21:12:43 +00:00
|
|
|
struct iomap_folio_state *ifs = folio->private;
|
2023-12-07 07:27:02 +00:00
|
|
|
struct inode *inode = folio->mapping->host;
|
2021-11-02 14:51:55 +00:00
|
|
|
u64 pos = folio_pos(folio);
|
2023-12-07 07:27:02 +00:00
|
|
|
u64 end_pos = pos + folio_size(folio);
|
iomap: pass byte granular end position to iomap_add_to_ioend
This is a preparatory patch for fixing zero padding issues in concurrent
append write scenarios. In the following patches, we need to obtain
byte-granular writeback end position for io_size trimming after EOF
handling.
Due to concurrent writeback and truncate operations, inode size may
shrink. Resampling inode size would force writeback code to handle the
newly appeared post-EOF blocks, which is undesirable. As Dave
explained in [1]:
"Really, the issue is that writeback mappings have to be able to
handle the range being mapped suddenly appear to be beyond EOF.
This behaviour is a longstanding writeback constraint, and is what
iomap_writepage_handle_eof() is attempting to handle.
We handle this by only sampling i_size_read() whilst we have the
folio locked and can determine the action we should take with that
folio (i.e. nothing, partial zeroing, or skip altogether). Once
we've made the decision that the folio is within EOF and taken
action on it (i.e. moved the folio to writeback state), we cannot
then resample the inode size because a truncate may have started
and changed the inode size."
To avoid resampling the inode size after EOF handling, we convert end_pos
to a byte-granular writeback position and return it from the EOF handling
function.
Since iomap_set_range_dirty() can handle unaligned lengths, this
conversion has no impact on it. However, iomap_find_dirty_range()
requires aligned start and end range to find dirty blocks within the
given range, so the end position needs to be rounded up when passed
to it.
LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/
Signed-off-by: Long Li <leo.lilong@huawei.com>
Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-09 11:42:39 +00:00
|
|
|
u64 end_aligned = 0;
|
2023-12-07 07:27:07 +00:00
|
|
|
unsigned count = 0;
|
2023-12-07 07:27:09 +00:00
|
|
|
int error = 0;
|
|
|
|
u32 rlen;
|
2023-12-07 07:27:08 +00:00
|
|
|
|
|
|
|
WARN_ON_ONCE(!folio_test_locked(folio));
|
|
|
|
WARN_ON_ONCE(folio_test_dirty(folio));
|
|
|
|
WARN_ON_ONCE(folio_test_writeback(folio));
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-12-07 07:27:02 +00:00
|
|
|
trace_iomap_writepage(inode, pos, folio_size(folio));
|
|
|
|
|
|
|
|
if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
|
|
|
|
folio_unlock(folio);
|
|
|
|
return 0;
|
|
|
|
}
|
2023-07-10 21:12:43 +00:00
|
|
|
WARN_ON_ONCE(end_pos <= pos);
|
|
|
|
|
2023-12-07 07:27:09 +00:00
|
|
|
if (i_blocks_per_folio(inode, folio) > 1) {
|
2023-12-07 07:27:08 +00:00
|
|
|
if (!ifs) {
|
|
|
|
ifs = ifs_alloc(inode, folio, 0);
|
|
|
|
iomap_set_range_dirty(folio, 0, end_pos - pos);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Keep the I/O completion handler from clearing the writeback
|
|
|
|
* bit until we have submitted all blocks by adding a bias to
|
|
|
|
* ifs->write_bytes_pending, which is dropped after submitting
|
|
|
|
* all blocks.
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
|
|
|
|
atomic_inc(&ifs->write_bytes_pending);
|
2023-07-10 21:12:43 +00:00
|
|
|
}
|
|
|
|
|
2023-12-07 07:27:08 +00:00
|
|
|
/*
|
|
|
|
* Set the writeback bit ASAP, as the I/O completion for the single
|
|
|
|
* block per folio case can happen as soon as we're submitting the bio.
|
|
|
|
*/
|
|
|
|
folio_start_writeback(folio);
|
2019-10-17 20:12:15 +00:00
|
|
|
|
|
|
|
/*
|
2023-12-07 07:27:09 +00:00
|
|
|
* Walk through the folio to find dirty areas to write back.
|
2019-10-17 20:12:15 +00:00
|
|
|
*/
|
iomap: pass byte granular end position to iomap_add_to_ioend
This is a preparatory patch for fixing zero padding issues in concurrent
append write scenarios. In the following patches, we need to obtain
byte-granular writeback end position for io_size trimming after EOF
handling.
Due to concurrent writeback and truncate operations, inode size may
shrink. Resampling inode size would force writeback code to handle the
newly appeared post-EOF blocks, which is undesirable. As Dave
explained in [1]:
"Really, the issue is that writeback mappings have to be able to
handle the range being mapped suddenly appear to be beyond EOF.
This behaviour is a longstanding writeback constraint, and is what
iomap_writepage_handle_eof() is attempting to handle.
We handle this by only sampling i_size_read() whilst we have the
folio locked and can determine the action we should take with that
folio (i.e. nothing, partial zeroing, or skip altogether). Once
we've made the decision that the folio is within EOF and taken
action on it (i.e. moved the folio to writeback state), we cannot
then resample the inode size because a truncate may have started
and changed the inode size."
To avoid resampling the inode size after EOF handling, we convert end_pos
to a byte-granular writeback position and return it from the EOF handling
function.
Since iomap_set_range_dirty() can handle unaligned lengths, this
conversion has no impact on it. However, iomap_find_dirty_range()
requires aligned start and end range to find dirty blocks within the
given range, so the end position needs to be rounded up when passed
to it.
LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/
Signed-off-by: Long Li <leo.lilong@huawei.com>
Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-09 11:42:39 +00:00
|
|
|
end_aligned = round_up(end_pos, i_blocksize(inode));
|
|
|
|
while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
|
2023-12-07 07:27:09 +00:00
|
|
|
error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
|
iomap: pass byte granular end position to iomap_add_to_ioend
This is a preparatory patch for fixing zero padding issues in concurrent
append write scenarios. In the following patches, we need to obtain
byte-granular writeback end position for io_size trimming after EOF
handling.
Due to concurrent writeback and truncate operations, inode size may
shrink. Resampling inode size would force writeback code to handle the
newly appeared post-EOF blocks, which is undesirable. As Dave
explained in [1]:
"Really, the issue is that writeback mappings have to be able to
handle the range being mapped suddenly appear to be beyond EOF.
This behaviour is a longstanding writeback constraint, and is what
iomap_writepage_handle_eof() is attempting to handle.
We handle this by only sampling i_size_read() whilst we have the
folio locked and can determine the action we should take with that
folio (i.e. nothing, partial zeroing, or skip altogether). Once
we've made the decision that the folio is within EOF and taken
action on it (i.e. moved the folio to writeback state), we cannot
then resample the inode size because a truncate may have started
and changed the inode size."
To avoid resampling the inode size after EOF handling, we convert end_pos
to a byte-granular writeback position and return it from the EOF handling
function.
Since iomap_set_range_dirty() can handle unaligned lengths, this
conversion has no impact on it. However, iomap_find_dirty_range()
requires aligned start and end range to find dirty blocks within the
given range, so the end position needs to be rounded up when passed
to it.
LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/
Signed-off-by: Long Li <leo.lilong@huawei.com>
Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-09 11:42:39 +00:00
|
|
|
pos, end_pos, rlen, &count);
|
2019-10-17 20:12:15 +00:00
|
|
|
if (error)
|
|
|
|
break;
|
2023-12-07 07:27:09 +00:00
|
|
|
pos += rlen;
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
2023-12-07 07:27:09 +00:00
|
|
|
|
xfs, iomap: limit individual ioend chain lengths in writeback
Trond Myklebust reported soft lockups in XFS IO completion such as
this:
watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [kworker/12:1:3106]
CPU: 12 PID: 3106 Comm: kworker/12:1 Not tainted 4.18.0-305.10.2.el8_4.x86_64 #1
Workqueue: xfs-conv/md127 xfs_end_io [xfs]
RIP: 0010:_raw_spin_unlock_irqrestore+0x11/0x20
Call Trace:
wake_up_page_bit+0x8a/0x110
iomap_finish_ioend+0xd7/0x1c0
iomap_finish_ioends+0x7f/0xb0
xfs_end_ioend+0x6b/0x100 [xfs]
xfs_end_io+0xb9/0xe0 [xfs]
process_one_work+0x1a7/0x360
worker_thread+0x1fa/0x390
kthread+0x116/0x130
ret_from_fork+0x35/0x40
Ioends are processed as an atomic completion unit when all the
chained bios in the ioend have completed their IO. Logically
contiguous ioends can also be merged and completed as a single,
larger unit. Both of these things can be problematic as the
bio chains per ioend and the size of the merged ioends processed as
a single completion are both unbounded.
If we have a large sequential dirty region in the page cache,
write_cache_pages() will keep feeding us sequential pages and we
will keep mapping them into ioends and bios until we get a dirty
page at a non-sequential file offset. These large sequential runs
will result in bio and ioend chaining to optimise the io
patterns. The pages under writeback are pinned within these chains
until the submission chaining is broken, allowing the entire chain
to be completed. This can result in huge chains being processed
in IO completion context.
We get deep bio chaining if we have large contiguous physical
extents. We will keep adding pages to the current bio until it is
full, then we'll chain a new bio to keep adding pages for writeback.
Hence we can build bio chains that map millions of pages and tens of
gigabytes of RAM if the page cache contains big enough contiguous
dirty file regions. This long bio chain pins those pages until the
final bio in the chain completes and the ioend can iterate all the
chained bios and complete them.
OTOH, if we have a physically fragmented file, we end up submitting
one ioend per physical fragment that each have a small bio or bio
chain attached to them. We do not chain these at IO submission time,
but instead we chain them at completion time based on file
offset via iomap_ioend_try_merge(). Hence we can end up with unbound
ioend chains being built via completion merging.
XFS can then do COW remapping or unwritten extent conversion on that
merged chain, which involves walking an extent fragment at a time
and running a transaction to modify the physical extent information.
IOWs, we merge all the discontiguous ioends together into a
contiguous file range, only to then process them individually as
discontiguous extents.
This extent manipulation is computationally expensive and can run in
a tight loop, so merging logically contiguous but physically
discontiguous ioends gains us nothing except for hiding the
fact that we broke the ioends up into individual physical extents at
submission and then need to loop over those individual physical
extents at completion.
Hence we need to have mechanisms to limit ioend sizes and
to break up completion processing of large merged ioend chains:
1. bio chains per ioend need to be bound in length. Pure overwrites
go straight to iomap_finish_ioend() in softirq context with the
exact bio chain attached to the ioend by submission. Hence the only
way to prevent long holdoffs here is to bound ioend submission
sizes because we can't reschedule in softirq context.
2. iomap_finish_ioends() has to handle unbound merged ioend chains
correctly. This relies on any one call to iomap_finish_ioend() being
bound in runtime so that cond_resched() can be issued regularly as
the long ioend chain is processed. i.e. this relies on mechanism #1
to limit individual ioend sizes to work correctly.
3. filesystems have to loop over the merged ioends to process
physical extent manipulations. This means they can loop internally,
and so we break merging at physical extent boundaries so the
filesystem can easily insert reschedule points between individual
extent manipulations.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reported-and-tested-by: Trond Myklebust <trondmy@hammerspace.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-01-26 17:19:20 +00:00
|
|
|
if (count)
|
2023-12-07 07:26:59 +00:00
|
|
|
wpc->nr_folios++;
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-07-10 21:12:43 +00:00
|
|
|
/*
|
|
|
|
* We can have dirty bits set past end of file in page_mkwrite path
|
|
|
|
* while mapping the last partial folio. Hence it's better to clear
|
|
|
|
* all the dirty bits in the folio here.
|
|
|
|
*/
|
|
|
|
iomap_clear_range_dirty(folio, 0, folio_size(folio));
|
2023-12-07 07:26:57 +00:00
|
|
|
|
|
|
|
/*
|
2023-12-07 07:27:08 +00:00
|
|
|
* Usually the writeback bit is cleared by the I/O completion handler.
|
|
|
|
* But we may end up either not actually writing any blocks, or (when
|
|
|
|
* there are multiple blocks in a folio) all I/O might have finished
|
|
|
|
* already at this point. In that case we need to clear the writeback
|
|
|
|
* bit ourselves right after unlocking the page.
|
2023-12-07 07:26:57 +00:00
|
|
|
*/
|
2021-11-02 16:45:12 +00:00
|
|
|
folio_unlock(folio);
|
2023-12-07 07:27:08 +00:00
|
|
|
if (ifs) {
|
|
|
|
if (atomic_dec_and_test(&ifs->write_bytes_pending))
|
|
|
|
folio_end_writeback(folio);
|
|
|
|
} else {
|
|
|
|
if (!count)
|
|
|
|
folio_end_writeback(folio);
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
iomap: iomap: fix memory corruption when recording errors during writeback
Every now and then I see this crash on arm64:
Unable to handle kernel NULL pointer dereference at virtual address 00000000000000f8
Buffer I/O error on dev dm-0, logical block 8733687, async page read
Mem abort info:
ESR = 0x0000000096000006
EC = 0x25: DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
FSC = 0x06: level 2 translation fault
Data abort info:
ISV = 0, ISS = 0x00000006
CM = 0, WnR = 0
user pgtable: 64k pages, 42-bit VAs, pgdp=0000000139750000
[00000000000000f8] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000, pmd=0000000000000000
Internal error: Oops: 96000006 [#1] PREEMPT SMP
Buffer I/O error on dev dm-0, logical block 8733688, async page read
Dumping ftrace buffer:
Buffer I/O error on dev dm-0, logical block 8733689, async page read
(ftrace buffer empty)
XFS (dm-0): log I/O error -5
Modules linked in: dm_thin_pool dm_persistent_data
XFS (dm-0): Metadata I/O Error (0x1) detected at xfs_trans_read_buf_map+0x1ec/0x590 [xfs] (fs/xfs/xfs_trans_buf.c:296).
dm_bio_prison
XFS (dm-0): Please unmount the filesystem and rectify the problem(s)
XFS (dm-0): xfs_imap_lookup: xfs_ialloc_read_agi() returned error -5, agno 0
dm_bufio dm_log_writes xfs nft_chain_nat xt_REDIRECT nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip6t_REJECT
potentially unexpected fatal signal 6.
nf_reject_ipv6
potentially unexpected fatal signal 6.
ipt_REJECT nf_reject_ipv4
CPU: 1 PID: 122166 Comm: fsstress Tainted: G W 6.0.0-rc5-djwa #rc5 3004c9f1de887ebae86015f2677638ce51ee7
rpcsec_gss_krb5 auth_rpcgss xt_tcpudp ip_set_hash_ip ip_set_hash_net xt_set nft_compat ip_set_hash_mac ip_set nf_tables
Hardware name: QEMU KVM Virtual Machine, BIOS 1.5.1 06/16/2021
pstate: 60001000 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--)
ip_tables
pc : 000003fd6d7df200
x_tables
lr : 000003fd6d7df1ec
overlay nfsv4
CPU: 0 PID: 54031 Comm: u4:3 Tainted: G W 6.0.0-rc5-djwa #rc5 3004c9f1de887ebae86015f2677638ce51ee7405
Hardware name: QEMU KVM Virtual Machine, BIOS 1.5.1 06/16/2021
Workqueue: writeback wb_workfn
sp : 000003ffd9522fd0
(flush-253:0)
pstate: 60401005 (nZCv daif +PAN -UAO -TCO -DIT +SSBS BTYPE=--)
pc : errseq_set+0x1c/0x100
x29: 000003ffd9522fd0 x28: 0000000000000023 x27: 000002acefeb6780
x26: 0000000000000005 x25: 0000000000000001 x24: 0000000000000000
x23: 00000000ffffffff x22: 0000000000000005
lr : __filemap_set_wb_err+0x24/0xe0
x21: 0000000000000006
sp : fffffe000f80f760
x29: fffffe000f80f760 x28: 0000000000000003 x27: fffffe000f80f9f8
x26: 0000000002523000 x25: 00000000fffffffb x24: fffffe000f80f868
x23: fffffe000f80fbb0 x22: fffffc0180c26a78 x21: 0000000002530000
x20: 0000000000000000 x19: 0000000000000000 x18: 0000000000000000
x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
x14: 0000000000000001 x13: 0000000000470af3 x12: fffffc0058f70000
x11: 0000000000000040 x10: 0000000000001b20 x9 : fffffe000836b288
x8 : fffffc00eb9fd480 x7 : 0000000000f83659 x6 : 0000000000000000
x5 : 0000000000000869 x4 : 0000000000000005 x3 : 00000000000000f8
x20: 000003fd6d740020 x19: 000000000001dd36 x18: 0000000000000001
x17: 000003fd6d78704c x16: 0000000000000001 x15: 000002acfac87668
x2 : 0000000000000ffa x1 : 00000000fffffffb x0 : 00000000000000f8
Call trace:
errseq_set+0x1c/0x100
__filemap_set_wb_err+0x24/0xe0
iomap_do_writepage+0x5e4/0xd5c
write_cache_pages+0x208/0x674
iomap_writepages+0x34/0x60
xfs_vm_writepages+0x8c/0xcc [xfs 7a861f39c43631f15d3a5884246ba5035d4ca78b]
x14: 0000000000000000 x13: 2064656e72757465 x12: 0000000000002180
x11: 000003fd6d8a82d0 x10: 0000000000000000 x9 : 000003fd6d8ae288
x8 : 0000000000000083 x7 : 00000000ffffffff x6 : 00000000ffffffee
x5 : 00000000fbad2887 x4 : 000003fd6d9abb58 x3 : 000003fd6d740020
x2 : 0000000000000006 x1 : 000000000001dd36 x0 : 0000000000000000
CPU: 1 PID: 122167 Comm: fsstress Tainted: G W 6.0.0-rc5-djwa #rc5 3004c9f1de887ebae86015f2677638ce51ee7
do_writepages+0x90/0x1c4
__writeback_single_inode+0x4c/0x4ac
Hardware name: QEMU KVM Virtual Machine, BIOS 1.5.1 06/16/2021
writeback_sb_inodes+0x214/0x4ac
wb_writeback+0xf4/0x3b0
pstate: 60001000 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--)
wb_workfn+0xfc/0x580
process_one_work+0x1e8/0x480
pc : 000003fd6d7df200
worker_thread+0x78/0x430
This crash is a result of iomap_writepage_map encountering some sort of
error during writeback and wanting to set that error code in the file
mapping so that fsync will report it. Unfortunately, the code
dereferences folio->mapping after unlocking the folio, which means that
another thread could have removed the page from the page cache
(writeback doesn't hold the invalidation lock) and given it to somebody
else.
At best we crash the system like above; at worst, we corrupt memory or
set an error on some other unsuspecting file while failing to record the
problems with *this* file. Regardless, fix the problem by reporting the
error to the inode mapping.
NOTE: Commit 598ecfbaa742 lifted the XFS writeback code to iomap, so
this fix should be backported to XFS in the 4.6-5.4 kernels in addition
to iomap in the 5.5-5.19 kernels.
Fixes: e735c0079465 ("iomap: Convert iomap_add_to_ioend() to take a folio") # 5.17 onward
Fixes: 598ecfbaa742 ("iomap: lift the xfs writeback code to iomap") # 5.5-5.16, needs backporting
Fixes: 150d5be09ce4 ("xfs: remove xfs_cancel_ioend") # 4.6-5.4, needs backporting
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
2022-10-01 00:02:32 +00:00
|
|
|
mapping_set_error(inode->i_mapping, error);
|
2019-10-17 20:12:15 +00:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
|
|
|
|
struct iomap_writepage_ctx *wpc,
|
|
|
|
const struct iomap_writeback_ops *ops)
|
|
|
|
{
|
2024-04-12 06:16:14 +00:00
|
|
|
struct folio *folio = NULL;
|
|
|
|
int error;
|
2019-10-17 20:12:15 +00:00
|
|
|
|
2023-12-07 07:27:00 +00:00
|
|
|
/*
|
|
|
|
* Writeback from reclaim context should never happen except in the case
|
|
|
|
* of a VM regression so warn about it and refuse to write the data.
|
|
|
|
*/
|
|
|
|
if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
|
|
|
|
PF_MEMALLOC))
|
|
|
|
return -EIO;
|
|
|
|
|
2019-10-17 20:12:15 +00:00
|
|
|
wpc->ops = ops;
|
2024-04-12 06:16:14 +00:00
|
|
|
while ((folio = writeback_iter(mapping, wbc, folio, &error)))
|
|
|
|
error = iomap_writepage_map(wpc, wbc, folio);
|
|
|
|
return iomap_submit_ioend(wpc, error);
|
2019-10-17 20:12:15 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(iomap_writepages);
|
|
|
|
|
2024-08-22 13:50:14 +00:00
|
|
|
static int __init iomap_buffered_init(void)
|
2019-10-17 20:12:15 +00:00
|
|
|
{
|
|
|
|
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
|
2023-12-07 07:27:05 +00:00
|
|
|
offsetof(struct iomap_ioend, io_bio),
|
2019-10-17 20:12:15 +00:00
|
|
|
BIOSET_NEED_BVECS);
|
|
|
|
}
|
2024-08-22 13:50:14 +00:00
|
|
|
fs_initcall(iomap_buffered_init);
|