diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 91ee0b308e13..356193e44cf0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); } -static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, +static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; @@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } + + /* + * Now we have a locked folio, before we do anything with it we need to + * check that the iomap we have cached is not stale. The inode extent + * mapping can change due to concurrent IO in flight (e.g. + * IOMAP_UNWRITTEN state can change and memory reclaim could have + * reclaimed a previously partially written page at this index after IO + * completion before this write reaches this file offset) and hence we + * could do the wrong thing here (zero a page range incorrectly or fail + * to zero) and corrupt data. + */ + if (page_ops && page_ops->iomap_valid) { + bool iomap_valid = page_ops->iomap_valid(iter->inode, + &iter->iomap); + if (!iomap_valid) { + iter->iomap.flags |= IOMAP_F_STALE; + status = 0; + goto out_unlock; + } + } + if (pos + len > folio_pos(folio) + folio_size(folio)) len = folio_pos(folio) + folio_size(folio) - pos; @@ -773,6 +794,8 @@ again: status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) break; + if (iter->iomap.flags & IOMAP_F_STALE) + break; page = folio_file_page(folio, pos >> PAGE_SHIFT); if (mapping_writably_mapped(mapping)) @@ -832,6 +855,231 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +/* + * Scan the data range passed to us for dirty page cache folios. If we find a + * dirty folio, punch out the preceeding range and update the offset from which + * the next punch will start from. + * + * We can punch out storage reservations under clean pages because they either + * contain data that has been written back - in which case the delalloc punch + * over that range is a no-op - or they have been read faults in which case they + * contain zeroes and we can remove the delalloc backing range and any new + * writes to those pages will do the normal hole filling operation... + * + * This makes the logic simple: we only need to keep the delalloc extents only + * over the dirty ranges of the page cache. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations. + */ +static int iomap_write_delalloc_scan(struct inode *inode, + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t offset, loff_t length)) +{ + while (start_byte < end_byte) { + struct folio *folio; + + /* grab locked page */ + folio = filemap_lock_folio(inode->i_mapping, + start_byte >> PAGE_SHIFT); + if (!folio) { + start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + + PAGE_SIZE; + continue; + } + + /* if dirty, punch up to offset */ + if (folio_test_dirty(folio)) { + if (start_byte > *punch_start_byte) { + int error; + + error = punch(inode, *punch_start_byte, + start_byte - *punch_start_byte); + if (error) { + folio_unlock(folio); + folio_put(folio); + return error; + } + } + + /* + * Make sure the next punch start is correctly bound to + * the end of this data range, not the end of the folio. + */ + *punch_start_byte = min_t(loff_t, end_byte, + folio_next_index(folio) << PAGE_SHIFT); + } + + /* move offset to start of next folio in range */ + start_byte = folio_next_index(folio) << PAGE_SHIFT; + folio_unlock(folio); + folio_put(folio); + } + return 0; +} + +/* + * Punch out all the delalloc blocks in the range given except for those that + * have dirty data still pending in the page cache - those are going to be + * written and so must still retain the delalloc backing for writeback. + * + * As we are scanning the page cache for data, we don't need to reimplement the + * wheel - mapping_seek_hole_data() does exactly what we need to identify the + * start and end of data ranges correctly even for sub-folio block sizes. This + * byte range based iteration is especially convenient because it means we + * don't have to care about variable size folios, nor where the start or end of + * the data range lies within a folio, if they lie within the same folio or even + * if there are multiple discontiguous data ranges within the folio. + * + * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so + * can return data ranges that exist in the cache beyond EOF. e.g. a page fault + * spanning EOF will initialise the post-EOF data to zeroes and mark it up to + * date. A write page fault can then mark it dirty. If we then fail a write() + * beyond EOF into that up to date cached range, we allocate a delalloc block + * beyond EOF and then have to punch it out. Because the range is up to date, + * mapping_seek_hole_data() will return it, and we will skip the punch because + * the folio is dirty. THis is incorrect - we always need to punch out delalloc + * beyond EOF in this case as writeback will never write back and covert that + * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, + * resulting in always punching out the range from the EOF to the end of the + * range the iomap spans. + * + * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it + * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA + * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) + * returns the end of the data range (data_end). Using closed intervals would + * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose + * the code to subtle off-by-one bugs.... + */ +static int iomap_write_delalloc_release(struct inode *inode, + loff_t start_byte, loff_t end_byte, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t punch_start_byte = start_byte; + loff_t scan_end_byte = min(i_size_read(inode), end_byte); + int error = 0; + + /* + * Lock the mapping to avoid races with page faults re-instantiating + * folios and dirtying them via ->page_mkwrite whilst we walk the + * cache and perform delalloc extent removal. Failing to do this can + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); + while (start_byte < scan_end_byte) { + loff_t data_end; + + start_byte = mapping_seek_hole_data(inode->i_mapping, + start_byte, scan_end_byte, SEEK_DATA); + /* + * If there is no more data to scan, all that is left is to + * punch out the remaining range. + */ + if (start_byte == -ENXIO || start_byte == scan_end_byte) + break; + if (start_byte < 0) { + error = start_byte; + goto out_unlock; + } + WARN_ON_ONCE(start_byte < punch_start_byte); + WARN_ON_ONCE(start_byte > scan_end_byte); + + /* + * We find the end of this contiguous cached data range by + * seeking from start_byte to the beginning of the next hole. + */ + data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, + scan_end_byte, SEEK_HOLE); + if (data_end < 0) { + error = data_end; + goto out_unlock; + } + WARN_ON_ONCE(data_end <= start_byte); + WARN_ON_ONCE(data_end > scan_end_byte); + + error = iomap_write_delalloc_scan(inode, &punch_start_byte, + start_byte, data_end, punch); + if (error) + goto out_unlock; + + /* The next data search starts at the end of this one. */ + start_byte = data_end; + } + + if (punch_start_byte < end_byte) + error = punch(inode, punch_start_byte, + end_byte - punch_start_byte); +out_unlock: + filemap_invalidate_unlock(inode->i_mapping); + return error; +} + +/* + * When a short write occurs, the filesystem may need to remove reserved space + * that was allocated in ->iomap_begin from it's ->iomap_end method. For + * filesystems that use delayed allocation, we need to punch out delalloc + * extents from the range that are not dirty in the page cache. As the write can + * race with page faults, there can be dirty pages over the delalloc extent + * outside the range of a short write but still within the delalloc extent + * allocated for this iomap. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to + * simplify range iterations. + * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. + * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock + */ +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, + ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)) +{ + loff_t start_byte; + loff_t end_byte; + int blocksize = i_blocksize(inode); + + if (iomap->type != IOMAP_DELALLOC) + return 0; + + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (!(iomap->flags & IOMAP_F_NEW)) + return 0; + + /* + * start_byte refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in + * the range. + */ + if (unlikely(!written)) + start_byte = round_down(pos, blocksize); + else + start_byte = round_up(pos + written, blocksize); + end_byte = round_up(pos + length, blocksize); + + /* Nothing to do if we've written the entire delalloc extent */ + if (start_byte >= end_byte) + return 0; + + return iomap_write_delalloc_release(inode, start_byte, end_byte, + punch); +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); + static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; @@ -856,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; status = iomap_write_end(iter, pos, bytes, bytes, folio); if (WARN_ON_ONCE(status == 0)) @@ -911,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; + if (iter->iomap.flags & IOMAP_F_STALE) + break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index a1c7592d2ade..79a0614eaab7 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,12 +7,28 @@ #include #include "trace.h" +/* + * Advance to the next range we need to map. + * + * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully + * processed - it was aborted because the extent the iomap spanned may have been + * changed during the operation. In this case, the iteration behaviour is to + * remap the unprocessed range of the iter, and that means we may need to remap + * even when we've made no progress (i.e. iter->processed = 0). Hence the + * "finished iterating" case needs to distinguish between + * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we + * need to remap the entire remaining range. + */ static inline int iomap_iter_advance(struct iomap_iter *iter) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + /* handle the previous iteration (if any) */ if (iter->iomap.length) { - if (iter->processed <= 0) + if (iter->processed < 0) return iter->processed; + if (!iter->processed && !stale) + return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; @@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 49d0d4ea63fc..56b9b7db38bb 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 5362908164b0..580ccbd5aadc 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -40,13 +40,12 @@ #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 #define XFS_ERRTAG_BMAP_FINISH_ONE 26 #define XFS_ERRTAG_AG_RESV_CRITICAL 27 + /* - * DEBUG mode instrumentation to test and/or trigger delayed allocation - * block killing in the event of failed writes. When enabled, all - * buffered writes are silenty dropped and handled as if they failed. - * All delalloc blocks in the range of the write (including pre-existing - * delalloc blocks!) are tossed as part of the write failure error - * handling sequence. + * Drop-writes support removed because write error handling cannot trash + * pre-existing delalloc extents in any useful way anymore. We retain the + * definition so that we can reject it as an invalid value in + * xfs_errortag_valid(). */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 @@ -95,7 +94,6 @@ #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 #define XFS_RANDOM_BMAP_FINISH_ONE 1 #define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5d1a995b15f8..a22d90af40c8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -114,9 +114,8 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, - XFS_B_TO_FSBT(mp, offset), - XFS_B_TO_FSB(mp, size)); + xfs_bmap_punch_delalloc_range(ip, offset, + offset + size); } goto done; } @@ -373,7 +372,7 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: @@ -455,12 +454,8 @@ xfs_discard_folio( struct folio *folio, loff_t pos) { - struct inode *inode = folio->mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(folio->mapping->host); struct xfs_mount *mp = ip->i_mount; - size_t offset = offset_in_folio(folio, pos); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); - xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); int error; if (xfs_is_shutdown(mp)) @@ -470,8 +465,9 @@ xfs_discard_folio( "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", folio, ip->i_ino, pos); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_folio(inode, folio) - pageoff_fsb); + error = xfs_bmap_punch_delalloc_range(ip, pos, + round_up(pos, folio_size(folio))); + if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 04d0c2bff67c..867645b74d88 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -590,11 +590,13 @@ out_unlock_iolock: int xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length) + xfs_off_t start_byte, + xfs_off_t end_byte) { + struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = &ip->i_df; - xfs_fileoff_t end_fsb = start_fsb + length; + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range( while (got.br_startoff + got.br_blockcount > start_fsb) { del = got; - xfs_trim_extent(&del, start_fsb, length); + xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); /* * A delete can push the cursor forward. Step back to the diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 24b37d211f1d..6888078f5c31 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t length); + xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c6b2aabd6f18..dea3c0649d2f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_REFCOUNT_FINISH_ONE, XFS_RANDOM_BMAP_FINISH_ONE, XFS_RANDOM_AG_RESV_CRITICAL, - XFS_RANDOM_DROP_WRITES, + 0, /* XFS_RANDOM_DROP_WRITES has been removed */ XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, @@ -162,7 +162,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); @@ -206,7 +205,6 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), @@ -256,6 +254,19 @@ xfs_errortag_del( kmem_free(mp->m_errortag); } +static bool +xfs_errortag_valid( + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return false; + + /* Error out removed injection types */ + if (error_tag == XFS_ERRTAG_DROP_WRITES) + return false; + return true; +} + bool xfs_errortag_test( struct xfs_mount *mp, @@ -277,7 +288,9 @@ xfs_errortag_test( if (!mp->m_errortag) return false; - ASSERT(error_tag < XFS_ERRTAG_MAX); + if (!xfs_errortag_valid(error_tag)) + return false; + randfactor = mp->m_errortag[error_tag]; if (!randfactor || prandom_u32_max(randfactor)) return false; @@ -293,7 +306,7 @@ xfs_errortag_get( struct xfs_mount *mp, unsigned int error_tag) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return mp->m_errortag[error_tag]; @@ -305,7 +318,7 @@ xfs_errortag_set( unsigned int error_tag, unsigned int tag_value) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; mp->m_errortag[error_tag] = tag_value; @@ -319,7 +332,7 @@ xfs_errortag_add( { BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL; return xfs_errortag_set(mp, error_tag, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e462d39c840e..595a5bcf46b9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1325,7 +1325,7 @@ __xfs_filemap_fault( if (write_fault) { xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, - &xfs_buffered_write_iomap_ops); + &xfs_page_mkwrite_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { ret = filemap_fault(vmf); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..1bdd7afc1010 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -48,13 +48,45 @@ xfs_alert_fsblock_zero( return -EFSCORRUPTED; } +u64 +xfs_iomap_inode_sequence( + struct xfs_inode *ip, + u16 iomap_flags) +{ + u64 cookie = 0; + + if (iomap_flags & IOMAP_F_XATTR) + return READ_ONCE(ip->i_af.if_seq); + if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) + cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; + return cookie | READ_ONCE(ip->i_df.if_seq); +} + +/* + * Check that the iomap passed to us is still valid for the given offset and + * length. + */ +static bool +xfs_iomap_valid( + struct inode *inode, + const struct iomap *iomap) +{ + return iomap->validity_cookie == + xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); +} + +const struct iomap_page_ops xfs_iomap_page_ops = { + .iomap_valid = xfs_iomap_valid, +}; + int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags) + u16 iomap_flags, + u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); @@ -91,6 +123,9 @@ xfs_bmbt_to_iomap( if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; + + iomap->validity_cookie = sequence_cookie; + iomap->page_ops = &xfs_iomap_page_ops; return 0; } @@ -195,7 +230,8 @@ xfs_iomap_write_direct( xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + u64 *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -285,6 +321,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: + *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -743,6 +780,7 @@ xfs_direct_write_iomap_begin( bool shared = false; u16 iomap_flags = 0; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -811,9 +849,10 @@ xfs_direct_write_iomap_begin( goto out_unlock; } + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); allocate_blocks: error = -EAGAIN; @@ -839,24 +878,26 @@ allocate_blocks: xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, - flags, &imap); + flags, &imap, &seq); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - iomap_flags | IOMAP_F_NEW); + iomap_flags | IOMAP_F_NEW, seq); out_found_cow: - xfs_iunlock(ip, lockmode); length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + seq = xfs_iomap_inode_sequence(ip, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); out_unlock: if (lockmode) @@ -915,6 +956,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1094,32 +1136,47 @@ retry: * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) - return error; + goto out_unlock; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } +static int +xfs_buffered_write_delalloc_punch( + struct inode *inode, + loff_t offset, + loff_t length) +{ + return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, + offset + length); +} + static int xfs_buffered_write_iomap_end( struct inode *inode, @@ -1129,56 +1186,17 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error = 0; - if (iomap->type != IOMAP_DELALLOC) - return 0; + struct xfs_mount *mp = XFS_M(inode->i_sb); + int error; - /* - * Behave as if the write failed if drop writes is enabled. Set the NEW - * flag to force delalloc cleanup. - */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { - iomap->flags |= IOMAP_F_NEW; - written = 0; + error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, + length, written, &xfs_buffered_write_delalloc_punch); + if (error && !xfs_is_shutdown(mp)) { + xfs_alert(mp, "%s: unable to clean up ino 0x%llx", + __func__, XFS_I(inode)->i_ino); + return error; } - - /* - * start_fsb refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. - */ - if (unlikely(!written)) - start_fsb = XFS_B_TO_FSBT(mp, offset); - else - start_fsb = XFS_B_TO_FSB(mp, offset + written); - end_fsb = XFS_B_TO_FSB(mp, offset + length); - - /* - * Trim delalloc blocks if they were allocated by this write and we - * didn't manage to write the whole range. - * - * We don't need to care about racing delalloc as we hold i_mutex - * across the reserve/allocate/unreserve calls. If there are delalloc - * blocks in the range, they are ours. - */ - if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { - truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), - XFS_FSB_TO_B(mp, end_fsb) - 1); - - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error && !xfs_is_shutdown(mp)) { - xfs_alert(mp, "%s: unable to clean up ino %lld", - __func__, ip->i_ino); - return error; - } - } - return 0; } @@ -1187,6 +1205,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, }; +/* + * iomap_page_mkwrite() will never fail in a way that requires delalloc extents + * that it allocated to be revoked. Hence we do not need an .iomap_end method + * for this operation. + */ +const struct iomap_ops xfs_page_mkwrite_iomap_ops = { + .iomap_begin = xfs_buffered_write_iomap_begin, +}; + static int xfs_read_iomap_begin( struct inode *inode, @@ -1204,6 +1231,7 @@ xfs_read_iomap_begin( int nimaps = 1, error = 0; bool shared = false; unsigned int lockmode = XFS_ILOCK_SHARED; + u64 seq; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); @@ -1217,13 +1245,14 @@ xfs_read_iomap_begin( &nimaps, 0); if (!error && (flags & IOMAP_REPORT)) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); xfs_iunlock(ip, lockmode); if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, - shared ? IOMAP_F_SHARED : 0); + shared ? IOMAP_F_SHARED : 0, seq); } const struct iomap_ops xfs_read_iomap_ops = { @@ -1248,6 +1277,7 @@ xfs_seek_iomap_begin( struct xfs_bmbt_irec imap, cmap; int error = 0; unsigned lockmode; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1282,8 +1312,9 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED); + IOMAP_F_SHARED, seq); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1304,8 +1335,9 @@ xfs_seek_iomap_begin( imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; done: + seq = xfs_iomap_inode_sequence(ip, 0); xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1331,6 +1363,7 @@ xfs_xattr_iomap_begin( struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; + int seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -1347,12 +1380,14 @@ xfs_xattr_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: + + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); xfs_iunlock(ip, lockmode); if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c782e8c0479c..4da13440bae9 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,14 +13,15 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, - struct xfs_bmbt_irec *imap); + struct xfs_bmbt_irec *imap, u64 *sequence); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, xfs_fileoff_t end_fsb); +u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, - u16 iomap_flags); + u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, bool *did_zero); @@ -47,6 +48,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_buffered_write_iomap_ops; +extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 37a24f0f7cd4..38d23f0e703a 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -125,6 +125,7 @@ xfs_fs_map_blocks( int nimaps = 1; uint lock_flags; int error = 0; + u64 seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -176,6 +177,7 @@ xfs_fs_map_blocks( lock_flags = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, bmapi_flags); + seq = xfs_iomap_inode_sequence(ip, 0); ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); @@ -189,7 +191,7 @@ xfs_fs_map_blocks( xfs_iunlock(ip, lock_flags); error = xfs_iomap_write_direct(ip, offset_fsb, - end_fsb - offset_fsb, 0, &imap); + end_fsb - offset_fsb, 0, &imap, &seq); if (error) goto out_unlock; @@ -209,7 +211,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 238a03087e17..0983dfc9a203 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -49,26 +49,35 @@ struct vm_fault; * * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of * buffer heads for this mapping. + * + * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent + * rather than a file data extent. */ -#define IOMAP_F_NEW 0x01 -#define IOMAP_F_DIRTY 0x02 -#define IOMAP_F_SHARED 0x04 -#define IOMAP_F_MERGED 0x08 -#define IOMAP_F_BUFFER_HEAD 0x10 -#define IOMAP_F_ZONE_APPEND 0x20 +#define IOMAP_F_NEW (1U << 0) +#define IOMAP_F_DIRTY (1U << 1) +#define IOMAP_F_SHARED (1U << 2) +#define IOMAP_F_MERGED (1U << 3) +#define IOMAP_F_BUFFER_HEAD (1U << 4) +#define IOMAP_F_ZONE_APPEND (1U << 5) +#define IOMAP_F_XATTR (1U << 6) /* * Flags set by the core iomap code during operations: * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. + * + * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file + * range it covers needs to be remapped by the high level before the operation + * can proceed. */ -#define IOMAP_F_SIZE_CHANGED 0x100 +#define IOMAP_F_SIZE_CHANGED (1U << 8) +#define IOMAP_F_STALE (1U << 9) /* * Flags from 0x1000 up are for file system specific usage: */ -#define IOMAP_F_PRIVATE 0x1000 +#define IOMAP_F_PRIVATE (1U << 12) /* @@ -89,6 +98,7 @@ struct iomap { void *inline_data; void *private; /* filesystem private */ const struct iomap_page_ops *page_ops; + u64 validity_cookie; /* used with .iomap_valid() */ }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) @@ -128,6 +138,23 @@ struct iomap_page_ops { int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, struct page *page); + + /* + * Check that the cached iomap still maps correctly to the filesystem's + * internal extent map. FS internal extent maps can change while iomap + * is iterating a cached iomap, so this hook allows iomap to detect that + * the iomap needs to be refreshed during a long running write + * operation. + * + * The filesystem can store internal state (e.g. a sequence number) in + * iomap->validity_cookie when the iomap is first mapped to be able to + * detect changes between mapping time and whenever .iomap_valid() is + * called. + * + * This is called with the folio over the specified file position held + * locked by the iomap code. + */ + bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); }; /* @@ -226,6 +253,10 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, + int (*punch)(struct inode *inode, loff_t pos, loff_t length)); + int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);