mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-09 14:43:16 +00:00
a8b3a677e7
This adds support for nocow mode, where we do writes in-place when possible. Patch components: - New boolean filesystem and inode option, nocow: note that when nocow is enabled, data checksumming and compression are implicitly disabled - To prevent in-place writes from racing with data moves (data_update.c) or bucket reuse (i.e. a bucket being reused and re-allocated while a nocow write is in flight), we have a new locking mechanism. Buckets can be locked for either data update or data move, using a fixed size hash table of two_state_shared locks. We don't have any chaining, meaning updates and moves to different buckets that hash to the same lock will wait unnecessarily - we'll want to watch for this becoming an issue. - The allocator path also needs to check for in-place writes in flight to a given bucket before giving it out: thus we add another counter to bucket_alloc_state so we can track this. - Fsync now may need to issue cache flushes to block devices instead of flushing the journal. We add a device bitmask to bch_inode_info, ei_devs_need_flush, which tracks devices that need to have flushes issued - note that this will lead to unnecessary flushes when other codepaths have already issued flushes, we may want to replace this with a sequence number. - New nocow write path: look up extents, and if they're writable write to them - otherwise fall back to the normal COW write path. XXX: switch to sequence numbers instead of bitmask for devs needing journal flush XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to run in process context - see if we can improve this Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
267 lines
6.7 KiB
C
267 lines
6.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _BCACHEFS_SUPER_H
|
|
#define _BCACHEFS_SUPER_H
|
|
|
|
#include "extents.h"
|
|
|
|
#include "bcachefs_ioctl.h"
|
|
|
|
#include <linux/math64.h>
|
|
|
|
/*
 * Convert sector @s on device @ca to a bucket number: the quotient of @s
 * divided by the device's bucket size (ca->mi.bucket_size).
 */
static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
{
	u64 bucket = div_u64(s, ca->mi.bucket_size);

	return bucket;
}
|
|
|
|
/*
 * Convert bucket number @b on device @ca to the sector at which that bucket
 * starts: @b multiplied by ca->mi.bucket_size. @b is widened to sector_t
 * before the multiplication.
 */
static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
{
	sector_t bucket = b;

	return bucket * ca->mi.bucket_size;
}
|
|
|
|
/*
 * Return the offset of sector @s within its bucket on device @ca, i.e. the
 * remainder of @s divided by ca->mi.bucket_size. The quotient is discarded.
 */
static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
{
	u32 offset_in_bucket;

	div_u64_rem(s, ca->mi.bucket_size, &offset_in_bucket);

	return offset_in_bucket;
}
|
|
|
|
static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
|
|
u32 *offset)
|
|
{
|
|
return div_u64_rem(s, ca->mi.bucket_size, offset);
|
|
}
|
|
|
|
static inline bool bch2_dev_is_online(struct bch_dev *ca)
|
|
{
|
|
return !percpu_ref_is_zero(&ca->io_ref);
|
|
}
|
|
|
|
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
|
|
{
|
|
return bch2_dev_is_online(ca) &&
|
|
ca->mi.state != BCH_MEMBER_STATE_failed;
|
|
}
|
|
|
|
static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
|
|
{
|
|
if (!percpu_ref_tryget(&ca->io_ref))
|
|
return false;
|
|
|
|
if (ca->mi.state == BCH_MEMBER_STATE_rw ||
|
|
(ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
|
|
return true;
|
|
|
|
percpu_ref_put(&ca->io_ref);
|
|
return false;
|
|
}
|
|
|
|
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
|
|
{
|
|
return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
|
|
}
|
|
|
|
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
|
|
unsigned dev)
|
|
{
|
|
unsigned i;
|
|
|
|
for (i = 0; i < devs.nr; i++)
|
|
if (devs.devs[i] == dev)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
|
|
unsigned dev)
|
|
{
|
|
unsigned i;
|
|
|
|
for (i = 0; i < devs->nr; i++)
|
|
if (devs->devs[i] == dev) {
|
|
array_remove_item(devs->devs, devs->nr, i);
|
|
return;
|
|
}
|
|
}
|
|
|
|
static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
|
|
unsigned dev)
|
|
{
|
|
if (!bch2_dev_list_has_dev(*devs, dev)) {
|
|
BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
|
|
devs->devs[devs->nr++] = dev;
|
|
}
|
|
}
|
|
|
|
static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
|
|
{
|
|
return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
|
|
}
|
|
|
|
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
|
|
const struct bch_devs_mask *mask)
|
|
{
|
|
struct bch_dev *ca = NULL;
|
|
|
|
while ((*iter = mask
|
|
? find_next_bit(mask->d, c->sb.nr_devices, *iter)
|
|
: *iter) < c->sb.nr_devices &&
|
|
!(ca = rcu_dereference_check(c->devs[*iter],
|
|
lockdep_is_held(&c->state_lock))))
|
|
(*iter)++;
|
|
|
|
return ca;
|
|
}
|
|
|
|
/*
 * Iterate @ca over the member devices of @c, using @iter as the index
 * variable and @mask (may be NULL) as the filter handed to __bch2_next_dev().
 * No reference is taken on the devices visited.
 *
 * NOTE(review): as the name suggests, callers are presumably expected to be
 * inside rcu_read_lock() (or hold c->state_lock) for the whole loop - confirm.
 */
#define for_each_member_device_rcu(ca, c, iter, mask)			\
	for ((iter) = 0;						\
	     ((ca) = __bch2_next_dev((c), &(iter), (mask)));		\
	     (iter)++)
|
|
|
|
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
|
|
{
|
|
struct bch_dev *ca;
|
|
|
|
rcu_read_lock();
|
|
if ((ca = __bch2_next_dev(c, iter, NULL)))
|
|
percpu_ref_get(&ca->ref);
|
|
rcu_read_unlock();
|
|
|
|
return ca;
|
|
}
|
|
|
|
/*
 * Iterate @ca over every member device of @c, using @iter as the index
 * variable. Each device is handed to the loop body with a reference on
 * ca->ref held (taken by bch2_get_next_dev()); the loop's increment step
 * drops that reference before moving on.
 *
 * If you break early, you must drop your ref on the current device
 *
 * All macro arguments are parenthesized so that non-trivial expressions
 * (e.g. a dereferenced pointer-to-pointer for @ca) expand correctly.
 */
#define for_each_member_device(ca, c, iter)				\
	for ((iter) = 0;						\
	     ((ca) = bch2_get_next_dev((c), &(iter)));			\
	     percpu_ref_put(&(ca)->ref), (iter)++)
|
|
|
|
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
|
|
unsigned *iter,
|
|
int state_mask)
|
|
{
|
|
struct bch_dev *ca;
|
|
|
|
rcu_read_lock();
|
|
while ((ca = __bch2_next_dev(c, iter, NULL)) &&
|
|
(!((1 << ca->mi.state) & state_mask) ||
|
|
!percpu_ref_tryget(&ca->io_ref)))
|
|
(*iter)++;
|
|
rcu_read_unlock();
|
|
|
|
return ca;
|
|
}
|
|
|
|
/*
 * Iterate @ca over the member devices of @c whose state is selected by
 * @state_mask and on which an io_ref reference could be taken (see
 * bch2_get_next_online_dev()). @iter is the index variable.
 *
 * The loop's increment step drops the io_ref reference on the current
 * device, so a loop body that breaks out early still holds that reference.
 *
 * All macro arguments are parenthesized so that non-trivial expressions
 * expand correctly.
 */
#define __for_each_online_member(ca, c, iter, state_mask)		\
	for ((iter) = 0;						\
	     ((ca) = bch2_get_next_online_dev((c), &(iter), (state_mask)));\
	     percpu_ref_put(&(ca)->io_ref), (iter)++)
|
|
|
|
/*
 * __for_each_online_member() with every bit of the state mask set, i.e. no
 * filtering on member state.
 */
#define for_each_online_member(ca, c, iter)				\
	__for_each_online_member(ca, c, iter, ~0)
|
|
|
|
/*
 * __for_each_online_member() restricted to devices whose member state is
 * BCH_MEMBER_STATE_rw.
 */
#define for_each_rw_member(ca, c, iter)					\
	__for_each_online_member(ca, c, iter,				\
				 (1 << BCH_MEMBER_STATE_rw))
|
|
|
|
/*
 * __for_each_online_member() restricted to devices whose member state is
 * BCH_MEMBER_STATE_rw or BCH_MEMBER_STATE_ro.
 */
#define for_each_readable_member(ca, c, iter)				\
	__for_each_online_member(ca, c, iter,				\
				 (1 << BCH_MEMBER_STATE_rw)|		\
				 (1 << BCH_MEMBER_STATE_ro))
|
|
|
|
/*
|
|
* If a key exists that references a device, the device won't be going away and
|
|
* we can omit rcu_read_lock():
|
|
*/
|
|
static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
|
|
{
|
|
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
|
|
|
|
return rcu_dereference_check(c->devs[idx], 1);
|
|
}
|
|
|
|
static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
|
|
{
|
|
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
|
|
|
|
return rcu_dereference_protected(c->devs[idx],
|
|
lockdep_is_held(&c->sb_lock) ||
|
|
lockdep_is_held(&c->state_lock));
|
|
}
|
|
|
|
/* XXX kill, move to struct bch_fs */
|
|
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
|
|
{
|
|
struct bch_devs_mask devs;
|
|
struct bch_dev *ca;
|
|
unsigned i;
|
|
|
|
memset(&devs, 0, sizeof(devs));
|
|
for_each_online_member(ca, c, i)
|
|
__set_bit(ca->dev_idx, devs.d);
|
|
return devs;
|
|
}
|
|
|
|
/*
 * Does bucket @b on device @ca overlap any superblock described by the
 * device's superblock layout?
 *
 * Bucket 0 is always reported as a superblock bucket. Otherwise the sector
 * range covered by @b, [b_offset, b_end), is tested against each of the
 * layout->nr_superblocks ranges
 * [sb_offset[i], sb_offset[i] + (1 << sb_max_size_bits)).
 *
 * Returns true on any overlap, false otherwise.
 */
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	u64 b_offset	= bucket_to_sector(ca, b);
	u64 b_end	= bucket_to_sector(ca, b + 1);
	unsigned i;

	if (!b)
		return true;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);
		/*
		 * Shift in 64 bits: a plain int "1 << bits" overflows (and is
		 * undefined) once sb_max_size_bits reaches 31.
		 */
		u64 end = offset + (1ULL << layout->sb_max_size_bits);

		/* Ranges overlap unless one ends before the other begins */
		if (!(offset >= b_end || end <= b_offset))
			return true;
	}

	return false;
}
|
|
|
|
struct bch_fs *bch2_dev_to_fs(dev_t);
|
|
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
|
|
|
|
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
|
|
enum bch_member_state, int);
|
|
int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
|
|
enum bch_member_state, int);
|
|
int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
|
|
enum bch_member_state, int);
|
|
|
|
int bch2_dev_fail(struct bch_dev *, int);
|
|
int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
|
|
int bch2_dev_add(struct bch_fs *, const char *);
|
|
int bch2_dev_online(struct bch_fs *, const char *);
|
|
int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
|
|
int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
|
|
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
|
|
|
|
bool bch2_fs_emergency_read_only(struct bch_fs *);
|
|
void bch2_fs_read_only(struct bch_fs *);
|
|
|
|
int bch2_fs_read_write(struct bch_fs *);
|
|
int bch2_fs_read_write_early(struct bch_fs *);
|
|
|
|
/*
|
|
* Only for use in the recovery/fsck path:
|
|
*/
|
|
static inline void bch2_fs_lazy_rw(struct bch_fs *c)
|
|
{
|
|
if (!test_bit(BCH_FS_RW, &c->flags) &&
|
|
!test_bit(BCH_FS_WAS_RW, &c->flags))
|
|
bch2_fs_read_write_early(c);
|
|
}
|
|
|
|
/*
 * Filesystem lifecycle entry points, defined outside this header.
 * NOTE(review): grouping inferred from the function names only - confirm
 * against the definitions.
 */

/* Shutdown / teardown */
void __bch2_fs_stop(struct bch_fs *);
void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);

/* Startup / open */
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
|
|
|
|
#endif /* _BCACHEFS_SUPER_H */
|