mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-01 02:36:02 +00:00
v6.6-rc4.vfs.fixes
-----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZRKHuAAKCRCRxhvAZXjc ohOLAQDU9Fxq5UdqCdmsyi/b24XJFZlQhcVIZy2Hrhcor9TiVQEAjuECGlxFPSgj atVOWLdugDJquiHextqTEMgIecJpNw4= =uINF -----END PGP SIGNATURE----- Merge tag 'v6.6-rc4.vfs.fixes' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs Pull vfs fixes from Christian Brauner: "This contains the usual miscellaneous fixes and cleanups for vfs and individual fses: Fixes: - Revert ki_pos on error from buffered writes for direct io fallback - Add missing documentation for block device and superblock handling for changes merged this cycle - Fix reiserfs flexible array usage - Ensure that overlayfs sets ctime when setting mtime and atime - Disable deferred caller completions with overlayfs writes until proper support exists Cleanups: - Remove duplicate initialization in pipe code - Annotate aio kioctx_table with __counted_by" * tag 'v6.6-rc4.vfs.fixes' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs: overlayfs: set ctime when setting mtime and atime ntfs3: put resources during ntfs_fill_super() ovl: disable IOCB_DIO_CALLER_COMP porting: document superblock as block device holder porting: document new block device opening order fs/pipe: remove duplicate "offset" initializer fs-writeback: do not requeue a clean inode having skipped pages aio: Annotate struct kioctx_table with __counted_by direct_write_fallback(): on error revert the ->ki_pos update from buffered write reiserfs: Replace 1-element array with C99 style flex-array
This commit is contained in:
commit
84422aee15
@ -949,3 +949,99 @@ mmap_lock held. All in-tree users have been audited and do not seem to
|
||||
depend on the mmap_lock being held, but out of tree users should verify
|
||||
for themselves. If they do need it, they can return VM_FAULT_RETRY to
|
||||
be called with the mmap_lock held.
|
||||
|
||||
---
|
||||
|
||||
**mandatory**
|
||||
|
||||
The order of opening block devices and matching or creating superblocks has
|
||||
changed.
|
||||
|
||||
The old logic opened block devices first and then tried to find a
|
||||
suitable superblock to reuse based on the block device pointer.
|
||||
|
||||
The new logic tries to find a suitable superblock first based on the device
|
||||
number, and opens the block device afterwards.
|
||||
|
||||
Since opening block devices cannot happen under s_umount because of lock
|
||||
ordering requirements, s_umount is now dropped while opening block devices and
|
||||
reacquired before calling fill_super().
|
||||
|
||||
In the old logic concurrent mounters would find the superblock on the list of
|
||||
superblocks for the filesystem type. Since the first opener of the block device
|
||||
would hold s_umount they would wait until the superblock became either born or
|
||||
was discarded due to initialization failure.
|
||||
|
||||
Since the new logic drops s_umount, concurrent mounters could grab s_umount and
|
||||
would spin. Instead they are now made to wait using an explicit wait-wake
|
||||
mechanism without having to hold s_umount.
|
||||
|
||||
---
|
||||
|
||||
**mandatory**
|
||||
|
||||
The holder of a block device is now the superblock.
|
||||
|
||||
The holder of a block device used to be the file_system_type which wasn't
|
||||
particularly useful. It wasn't possible to go from block device to owning
|
||||
superblock without matching on the device pointer stored in the superblock.
|
||||
This mechanism would only work for a single device so the block layer couldn't
|
||||
find the owning superblock of any additional devices.
|
||||
|
||||
In the old mechanism reusing or creating a superblock for a racing mount(2) and
|
||||
umount(2) relied on the file_system_type as the holder. This was severely
|
||||
underdocumented however:
|
||||
|
||||
(1) Any concurrent mounter that managed to grab an active reference on an
|
||||
existing superblock was made to wait until the superblock either became
|
||||
ready or until the superblock was removed from the list of superblocks of
|
||||
the filesystem type. If the superblock is ready the caller would simply
|
||||
reuse it.
|
||||
|
||||
(2) If the mounter came after deactivate_locked_super() but before
|
||||
the superblock had been removed from the list of superblocks of the
|
||||
filesystem type the mounter would wait until the superblock was shut down,
|
||||
reuse the block device and allocate a new superblock.
|
||||
|
||||
(3) If the mounter came after deactivate_locked_super() and after
|
||||
the superblock had been removed from the list of superblocks of the
|
||||
filesystem type the mounter would reuse the block device and allocate a new
|
||||
superblock (the bd_holder pointer may still be set to the filesystem type).
|
||||
|
||||
Because the holder of the block device was the file_system_type any concurrent
|
||||
mounter could open the block devices of any superblock of the same
|
||||
file_system_type without risking seeing EBUSY because the block device was
|
||||
still in use by another superblock.
|
||||
|
||||
Making the superblock the owner of the block device changes this as the holder
|
||||
is now a unique superblock and thus block devices associated with it cannot be
|
||||
reused by concurrent mounters. So a concurrent mounter in (2) could suddenly
|
||||
see EBUSY when trying to open a block device whose holder was a different
|
||||
superblock.
|
||||
|
||||
The new logic thus waits until the superblock and the devices are shut down in
|
||||
->kill_sb(). Removal of the superblock from the list of superblocks of the
|
||||
filesystem type is now moved to a later point when the devices are closed:
|
||||
|
||||
(1) Any concurrent mounter managing to grab an active reference on an existing
|
||||
superblock is made to wait until the superblock is either ready or until
|
||||
the superblock and all devices are shut down in ->kill_sb(). If the
|
||||
superblock is ready the caller will simply reuse it.
|
||||
|
||||
(2) If the mounter comes after deactivate_locked_super() but before
|
||||
the superblock has been removed from the list of superblocks of the
|
||||
filesystem type the mounter is made to wait until the superblock and the
|
||||
devices are shut down in ->kill_sb() and the superblock is removed from the
|
||||
list of superblocks of the filesystem type. The mounter will allocate a new
|
||||
superblock and grab ownership of the block device (the bd_holder pointer of
|
||||
the block device will be set to the newly allocated superblock).
|
||||
|
||||
(3) This case is now collapsed into (2) as the superblock is left on the list
|
||||
of superblocks of the filesystem type until all devices are shut down in
|
||||
->kill_sb(). In other words, if the superblock isn't on the list of
|
||||
superblocks of the filesystem type anymore then it has given up ownership of
|
||||
all associated block devices (the bd_holder pointer is NULL).
|
||||
|
||||
As this is a VFS level change it has no practical consequences for filesystems
|
||||
other than that all of them must use one of the provided kill_litter_super(),
|
||||
kill_anon_super(), or kill_block_super() helpers.
|
||||
|
2
fs/aio.c
2
fs/aio.c
@ -80,7 +80,7 @@ struct aio_ring {
|
||||
struct kioctx_table {
|
||||
struct rcu_head rcu;
|
||||
unsigned nr;
|
||||
struct kioctx __rcu *table[];
|
||||
struct kioctx __rcu *table[] __counted_by(nr);
|
||||
};
|
||||
|
||||
struct kioctx_cpu {
|
||||
|
@ -1535,10 +1535,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
|
||||
|
||||
if (wbc->pages_skipped) {
|
||||
/*
|
||||
* writeback is not making progress due to locked
|
||||
* buffers. Skip this inode for now.
|
||||
* Writeback is not making progress due to locked buffers.
|
||||
* Skip this inode for now. Although having skipped pages
|
||||
* is odd for clean inodes, it can happen for some
|
||||
* filesystems so handle that gracefully.
|
||||
*/
|
||||
redirty_tail_locked(inode, wb);
|
||||
if (inode->i_state & I_DIRTY_ALL)
|
||||
redirty_tail_locked(inode, wb);
|
||||
else
|
||||
inode_cgwb_move_to_attached(inode, wb);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1903,6 +1903,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
|
||||
* We don't know how much we wrote, so just return the number of
|
||||
* bytes which were direct-written
|
||||
*/
|
||||
iocb->ki_pos -= buffered_written;
|
||||
if (direct_written)
|
||||
return direct_written;
|
||||
return err;
|
||||
|
@ -1562,6 +1562,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
|
||||
put_inode_out:
|
||||
iput(inode);
|
||||
out:
|
||||
ntfs3_put_sbi(sbi);
|
||||
kfree(boot2);
|
||||
return err;
|
||||
}
|
||||
|
@ -337,7 +337,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
|
||||
{
|
||||
struct iattr attr = {
|
||||
.ia_valid =
|
||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME,
|
||||
.ia_atime = stat->atime,
|
||||
.ia_mtime = stat->mtime,
|
||||
};
|
||||
|
@ -391,6 +391,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
|
||||
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
|
||||
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
|
||||
|
||||
/*
|
||||
* Overlayfs doesn't support deferred completions, don't copy
|
||||
* this property in case it is set by the issuer.
|
||||
*/
|
||||
ifl &= ~IOCB_DIO_CALLER_COMP;
|
||||
|
||||
old_cred = ovl_override_creds(file_inode(file)->i_sb);
|
||||
if (is_sync_kiocb(iocb)) {
|
||||
file_start_write(real.file);
|
||||
|
@ -537,7 +537,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
|
||||
break;
|
||||
}
|
||||
ret += copied;
|
||||
buf->offset = 0;
|
||||
buf->len = copied;
|
||||
|
||||
if (!iov_iter_count(from))
|
||||
|
@ -2699,7 +2699,7 @@ struct reiserfs_iget_args {
|
||||
#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
|
||||
|
||||
#define journal_trans_half(blocksize) \
|
||||
((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32))
|
||||
((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
|
||||
|
||||
/* journal.c see journal.c for all the comments here */
|
||||
|
||||
@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc {
|
||||
__le32 j_len;
|
||||
|
||||
__le32 j_mount_id; /* mount id of this trans */
|
||||
__le32 j_realblock[1]; /* real locations for each block */
|
||||
__le32 j_realblock[]; /* real locations for each block */
|
||||
};
|
||||
|
||||
#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id)
|
||||
@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc {
|
||||
struct reiserfs_journal_commit {
|
||||
__le32 j_trans_id; /* must match j_trans_id from the desc block */
|
||||
__le32 j_len; /* ditto */
|
||||
__le32 j_realblock[1]; /* real locations for each block */
|
||||
__le32 j_realblock[]; /* real locations for each block */
|
||||
};
|
||||
|
||||
#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
|
||||
|
Loading…
Reference in New Issue
Block a user