for-6.0-rc6-tag

-----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmMpskIACgkQxWXV+ddt WDtxGA//Z4Z9e0p9CTwBGla9eqflpfPQLya93ANEBqhV/S1wxgvQtj+Q2XpGIqhj AVR4ZqEmnFPmAOay5s/mGQ+wZ3dyR+n/XLZ8XsViXY5yBLnRpZJi8p5ozqYuSm59 1A4FF0ZciD73jql8hPodsd1VFkKqtOTmPFyCxHk2lt/Z36FFYKCUm4P8ALdMxlct 6uEp67PI9Pb6PANq4mj8lpNTnsD2wTKDHqQ3WkHBwuHkEOCVkPbRsBlUkUqpYi0h Lc0XhjcnPX0alfiLFwwNdPZ8vrLE4egktzWA6PqEg1YzBPQQNnuQTHmO25KOqrm1 bW20PGOIF7WFg85w1P20G4I8UdT2CWBEloPSjYTDlD2KTdqBOp95oo7MUQlrDFNm lxns3npylswlvia8nH39iOlwUPL75cDe4U8LkOV+rSHmTmt7B6XK/MfI6sYgmveH V4DUI7BnbfEALbJMsJesHAR/3tnsAPqnLtv+lEF9hM70YXdN2o5iN/D0G/vms3Sr RGVpEFJyJPnzvAg6y3PNTdMEpDtouQHQhHBtPKnfOzRJsgtzk5CTpEBkWPSRLiqm DQj25JdcT8j8Xa8nWppEvogC0hfctqs1ROuZux7KajkxUHEDfXs2l0RR1dEpMvs7 v+Bhw3zLPS0e/b+9HqBSwCo0JAkIWzm6TE00LlKCYsnzNwLZT9k= =4Hu8 -----END PGP SIGNATURE----- Merge tag 'for-6.0-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux Pull btrfs fixes from David Sterba: - two fixes for hangs in the umount sequence where threads depend on each other and the work must be finished in the right order - in zoned mode, wait for flushing all block group metadata IO before finishing the zone * tag 'for-6.0-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: zoned: wait for extent buffer IOs before finishing a zone btrfs: fix hang during unmount when stopping a space reclaim worker btrfs: fix hang during unmount when stopping block group reclaim worker
2024-12-29 09:12:07 +00:00 · 2022-09-20 10:23:24 -07:00 · 2022-09-20 10:23:24 -07:00 · 60891ec99e
commit 60891ec99e
parent 84a3193883 2dd7e7bc02
2 changed files with 74 additions and 8 deletions
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4474,6 +4474,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)

 	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);

+	/*
+	 * If we had UNFINISHED_DROPS we could still be processing them, so
+	 * clear that bit and wake up relocation so it can stop.
+	 * We must do this before stopping the block group reclaim task, because
+	 * at btrfs_relocate_block_group() we wait for this bit, and after the
+	 * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
+	 * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
+	 * return 1.
+	 */
+	btrfs_wake_unfinished_drop(fs_info);
+
 	/*
 	 * We may have the reclaim task running and relocating a data block group,
 	 * in which case it may create delayed iputs. So stop it before we park
@@ -4492,12 +4503,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	 */
 	kthread_park(fs_info->cleaner_kthread);

-	/*
-	 * If we had UNFINISHED_DROPS we could still be processing them, so
-	 * clear that bit and wake up relocation so it can stop.
-	 */
-	btrfs_wake_unfinished_drop(fs_info);
-
 	/* wait for the qgroup rescan worker to stop */
 	btrfs_qgroup_wait_for_completion(fs_info, false);

@@ -4520,6 +4525,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	/* clear out the rbtree of defraggable inodes */
 	btrfs_cleanup_defrag_inodes(fs_info);

+	/*
+	 * After we parked the cleaner kthread, ordered extents may have
+	 * completed and created new delayed iputs. If one of the async reclaim
+	 * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+	 * can hang forever trying to stop it, because if a delayed iput is
+	 * added after it ran btrfs_run_delayed_iputs() and before it called
+	 * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+	 * no one else to run iputs.
+	 *
+	 * So wait for all ongoing ordered extents to complete and then run
+	 * delayed iputs. This works because once we reach this point no one
+	 * can either create new ordered extents nor create delayed iputs
+	 * through some other means.
+	 *
+	 * Also note that btrfs_wait_ordered_roots() is not safe here, because
+	 * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+	 * but the delayed iput for the respective inode is made only when doing
+	 * the final btrfs_put_ordered_extent() (which must happen at
+	 * btrfs_finish_ordered_io() when we are unmounting).
+	 */
+	btrfs_flush_workqueue(fs_info->endio_write_workers);
+	/* Ordered extents for free space inodes. */
+	btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+	btrfs_run_delayed_iputs(fs_info);
+
 	cancel_work_sync(&fs_info->async_reclaim_work);
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
 	cancel_work_sync(&fs_info->preempt_reclaim_work);
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1918,10 +1918,44 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 	return ret;
 }

+static void wait_eb_writebacks(struct btrfs_block_group *block_group)
+{
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	const u64 end = block_group->start + block_group->length;
+	struct radix_tree_iter iter;
+	struct extent_buffer *eb;
+	void __rcu **slot;
+
+	rcu_read_lock();
+	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
+				 block_group->start >> fs_info->sectorsize_bits) {
+		eb = radix_tree_deref_slot(slot);
+		if (!eb)
+			continue;
+		if (radix_tree_deref_retry(eb)) {
+			slot = radix_tree_iter_retry(&iter);
+			continue;
+		}
+
+		if (eb->start < block_group->start)
+			continue;
+		if (eb->start >= end)
+			break;
+
+		slot = radix_tree_iter_resume(slot, &iter);
+		rcu_read_unlock();
+		wait_on_extent_buffer_writeback(eb);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct map_lookup *map;
+	const bool is_metadata = (block_group->flags &
+			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
 	int ret = 0;
 	int i;

@@ -1932,8 +1966,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
 	}

 	/* Check if we have unwritten allocated space */
-	if ((block_group->flags &
-	     (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+	if (is_metadata &&
 	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
 		spin_unlock(&block_group->lock);
 		return -EAGAIN;
@@ -1958,6 +1991,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
 		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
 		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
 					 block_group->length);
+		/* Wait for extent buffers to be written. */
+		if (is_metadata)
+			wait_eb_writebacks(block_group);

 		spin_lock(&block_group->lock);