linux-stable/fs/btrfs/extent-io-tree.h
Filipe Manana 2766ff6176 btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that does not match neither what we had before the operation nor
what we get after the operation completes.

In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:

  https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html

The cases where this can happen are the following:

-> Case 1

If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).

This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.

If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).

Example reproducer:

  $ cat reproducer-1.sh
  #!/bin/bash

  MNT=/mnt/sdi
  DEV=/dev/sdi

  stat_loop()
  {
      trap "wait; exit" SIGTERM
      local filepath=$1
      local expected=$2
      local got

      while :; do
          got=$(stat -c %b $filepath)
          if [ $got -ne $expected ]; then
             echo -n "ERROR: unexpected used blocks"
             echo " (got: $got expected: $expected)"
          fi
      done
  }

  mkfs.btrfs -f $DEV > /dev/null
  # mkfs.xfs -f $DEV > /dev/null
  # mkfs.ext4 -F $DEV > /dev/null
  # mkfs.f2fs -f $DEV > /dev/null
  # mkfs.reiserfs -f $DEV > /dev/null
  mount $DEV $MNT

  xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
  expected=$(stat -c %b $MNT/foobar)

  # Create a process to keep calling stat(2) on the file and see if the
  # reported number of blocks used (disk space used) changes, it should
  # not because we are not increasing the file size nor punching holes.
  stat_loop $MNT/foobar $expected &
  loop_pid=$!

  for ((i = 0; i < 50000; i++)); do
      xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
  done

  kill $loop_pid &> /dev/null
  wait

  umount $DEV

  $ ./reproducer-1.sh
  ERROR: unexpected used blocks (got: 0 expected: 128)
  ERROR: unexpected used blocks (got: 0 expected: 128)
  (...)

Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.

-> Case 2

If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.

This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.

Example reproducer:

  $ cat reproducer-2.sh
  #!/bin/bash

  MNT=/mnt/sdi
  DEV=/dev/sdi

  stat_loop()
  {
      trap "wait; exit" SIGTERM
      local filepath=$1
      local expected=$2
      local got

      while :; do
          got=$(stat -c %b $filepath)
          if [ $got -ne $expected ]; then
              echo -n "ERROR: unexpected used blocks"
              echo " (got: $got expected: $expected)"
          fi
      done
  }

  mkfs.btrfs -f $DEV > /dev/null
  # mkfs.xfs -f $DEV > /dev/null
  # mkfs.ext4 -F $DEV > /dev/null
  # mkfs.f2fs -f $DEV > /dev/null
  # mkfs.reiserfs -f $DEV > /dev/null
  mount $DEV $MNT

  touch $MNT/foobar
  write_size=$((64 * 1024))
  for ((i = 0; i < 16384; i++)); do
     offset=$(($i * $write_size))
     xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
     blocks_used=$(stat -c %b $MNT/foobar)

     # Fsync the file to trigger writeback and keep calling stat(2) on it
     # to see if the number of blocks used changes.
     stat_loop $MNT/foobar $blocks_used &
     loop_pid=$!
     xfs_io -c "fsync" $MNT/foobar

     kill $loop_pid &> /dev/null
     wait $loop_pid
  done

  umount $DEV

  $ ./reproducer-2.sh
  ERROR: unexpected used blocks (got: 265472 expected: 265344)
  ERROR: unexpected used blocks (got: 284032 expected: 283904)
  (...)

Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.

-> Case 3

Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.

The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.

Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.

Example reproducer:

  $ cat reproducer-3.sh
  #!/bin/bash

  MNT=/mnt/sdi
  DEV=/dev/sdi

  mkfs.btrfs -f $DEV > /dev/null
  # mkfs.xfs -f -m reflink=1 $DEV > /dev/null
  mount $DEV $MNT

  extent_size=$((64 * 1024))
  num_extents=16384
  file_size=$(($extent_size * $num_extents))

  # File foo has many small extents.
  xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
      > /dev/null
  # File bar has much less extents and has exactly the same data as foo.
  xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null

  expected=$(stat -c %b $MNT/foo)

  # Now deduplicate bar into foo. While the deduplication is in progres,
  # the number of used blocks/file size reported by stat should not change
  xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null  &
  dedupe_pid=$!
  while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
      used=$(stat -c %b $MNT/foo)
      if [ $used -ne $expected ]; then
          echo "Unexpected blocks used: $used (expected: $expected)"
      fi
  done

  umount $DEV

  $ ./reproducer-3.sh
  Unexpected blocks used: 2076800 (expected: 2097152)
  Unexpected blocks used: 2097024 (expected: 2097152)
  Unexpected blocks used: 2079872 (expected: 2097152)
  (...)

Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.

So fix this by:

1) Making btrfs_drop_extents() not decrement the VFS inode's number of
   bytes, and instead return the number of bytes;

2) Making any code that drops extents and adds new extents update the
   inode's number of bytes atomically, while holding the btrfs inode's
   spinlock, which is also used by the stat(2) callback to get the inode's
   number of bytes;

3) For ranges in the inode's iotree that are marked as 'delalloc new',
   corresponding to previously unallocated ranges, increment the inode's
   number of bytes when clearing the 'delalloc new' bit from the range,
   in the same critical section that decrements the inode's
   'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.

An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
it computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.

CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-12-08 15:54:08 +01:00

264 lines
8.6 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_EXTENT_IO_TREE_H
#define BTRFS_EXTENT_IO_TREE_H
struct extent_changeset;
struct io_failure_record;
/* Bits for the extent state */
#define EXTENT_DIRTY (1U << 0)
#define EXTENT_UPTODATE (1U << 1)
#define EXTENT_LOCKED (1U << 2)
#define EXTENT_NEW (1U << 3)
#define EXTENT_DELALLOC (1U << 4)
#define EXTENT_DEFRAG (1U << 5)
#define EXTENT_BOUNDARY (1U << 6)
#define EXTENT_NODATASUM (1U << 7)
#define EXTENT_CLEAR_META_RESV (1U << 8)
#define EXTENT_NEED_WAIT (1U << 9)
#define EXTENT_DAMAGED (1U << 10)
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
/*
* Must be cleared only during ordered extent completion or on error paths if we
* did not manage to submit bios and create the ordered extents for the range.
* Should not be cleared during page release and page invalidation (if there is
* an ordered extent in flight), that is left for the ordered extent completion.
*/
#define EXTENT_DELALLOC_NEW (1U << 14)
/*
* When an ordered extent successfully completes for a region marked as a new
* delalloc range, use this flag when clearing a new delalloc range to indicate
* that the VFS' inode number of bytes should be incremented and the inode's new
* delalloc bytes decremented, in an atomic way to prevent races with stat(2).
*/
#define EXTENT_ADD_INODE_BYTES (1U << 15)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
EXTENT_ADD_INODE_BYTES)
/*
* Redefined bits above which are used only in the device allocation tree,
* shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
* / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
* manipulation functions
*/
#define CHUNK_ALLOCATED EXTENT_DIRTY
#define CHUNK_TRIMMED EXTENT_DEFRAG
#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
CHUNK_TRIMMED)
enum {
IO_TREE_FS_PINNED_EXTENTS,
IO_TREE_FS_EXCLUDED_EXTENTS,
IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
IO_TREE_INODE_FILE_EXTENT,
IO_TREE_LOG_CSUM_RANGE,
IO_TREE_SELFTEST,
IO_TREE_DEVICE_ALLOC_STATE,
};
struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
void *private_data;
u64 dirty_bytes;
bool track_uptodate;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
spinlock_t lock;
};
struct extent_state {
u64 start;
u64 end; /* inclusive */
struct rb_node rb_node;
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
refcount_t refs;
unsigned state;
struct io_failure_record *failrec;
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
int __init extent_state_cache_init(void);
void __cold extent_state_cache_exit(void);
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data);
void extent_io_tree_release(struct extent_io_tree *tree);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
return lock_extent_bits(tree, start, end, NULL);
}
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int __init extent_io_init(void);
void __cold extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
u64 max_bytes, unsigned bits, int contig);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int filled,
struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask,
struct extent_changeset *changeset);
static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
}
static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
GFP_NOFS, NULL);
}
static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
u64 start, u64 end, struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, unsigned bits)
{
int wake = 0;
if (bits & EXTENT_LOCKED)
wake = 1;
return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, struct extent_state **cached_state, gfp_t mask);
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits);
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, unsigned bits)
{
return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING, 0, 0, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
struct extent_state **cached_state);
static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
u64 end, unsigned int extra_bits,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
cached_state, mask);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state);
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
/* This should be reworked in the future and put elsewhere. */
struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
int set_state_failrec(struct extent_io_tree *tree, u64 start,
struct io_failure_record *failrec);
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
u64 end);
int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
int clean_io_failure(struct btrfs_fs_info *fs_info,
struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree, u64 start,
struct page *page, u64 ino, unsigned int pg_offset);
#endif /* BTRFS_EXTENT_IO_TREE_H */