// SPDX-License-Identifier: GPL-2.0 /* * Moving/copying garbage collector * * Copyright 2012 Google, Inc. */ #include "bcachefs.h" #include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" #include "disk_groups.h" #include "extents.h" #include "eytzinger.h" #include "io.h" #include "keylist.h" #include "move.h" #include "movinggc.h" #include "super-io.h" #include "trace.h" #include #include #include #include #include #include /* * We can't use the entire copygc reserve in one iteration of copygc: we may * need the buckets we're freeing up to go back into the copygc reserve to make * forward progress, but if the copygc reserve is full they'll be available for * any allocation - and it's possible that in a given iteration, we free up most * of the buckets we're going to free before we allocate most of the buckets * we're going to allocate. * * If we only use half of the reserve per iteration, then in steady state we'll * always have room in the reserve for the buckets we're going to need in the * next iteration: */ #define COPYGC_BUCKETS_PER_ITER(ca) \ ((ca)->free[RESERVE_MOVINGGC].size / 2) /* * Max sectors to move per iteration: Have to take into account internal * fragmentation from the multiple write points for each generation: */ #define COPYGC_SECTORS_PER_ITER(ca) \ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) static inline int sectors_used_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) { return cmp_int(l.sectors, r.sectors); } static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) { const struct copygc_heap_entry *l = _l; const struct copygc_heap_entry *r = _r; return cmp_int(l->offset, r->offset); } static bool __copygc_pred(struct bch_dev *ca, struct bkey_s_c k) { copygc_heap *h = &ca->copygc_heap; switch (k.k->type) { case KEY_TYPE_extent: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr = bch2_extent_has_device(e, ca->dev_idx); if (ptr) { struct copygc_heap_entry search = { .offset = ptr->offset }; ssize_t i = eytzinger0_find_le(h->data, h->used, sizeof(h->data[0]), bucket_offset_cmp, &search); return (i >= 0 && ptr->offset < h->data[i].offset + ca->mi.bucket_size && ptr->gen == h->data[i].gen); } break; } } return false; } static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { struct bch_dev *ca = arg; if (!__copygc_pred(ca, k)) return DATA_SKIP; data_opts->target = dev_to_target(ca->dev_idx); data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; data_opts->rewrite_dev = ca->dev_idx; return DATA_REWRITE; } static bool have_copygc_reserve(struct bch_dev *ca) { bool ret; spin_lock(&ca->freelist_lock); ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || ca->allocator_state != ALLOCATOR_RUNNING; spin_unlock(&ca->freelist_lock); return ret; } static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) { copygc_heap *h = &ca->copygc_heap; struct copygc_heap_entry e, *i; struct bucket_array *buckets; struct bch_move_stats move_stats; u64 sectors_to_move = 0, sectors_not_moved = 0; u64 buckets_to_move, buckets_not_moved = 0; size_t b; int ret; memset(&move_stats, 0, sizeof(move_stats)); closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); /* * Find buckets with lowest sector counts, skipping completely * empty buckets, by building a maxheap sorted by sector count, * and repeatedly replacing the maximum element until all * buckets have been visited. */ h->used = 0; /* * We need bucket marks to be up to date - gc can't be recalculating * them: */ down_read(&c->gc_lock); down_read(&ca->bucket_lock); buckets = bucket_array(ca); for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { struct bucket_mark m = READ_ONCE(buckets->b[b].mark); struct copygc_heap_entry e; if (m.owned_by_allocator || m.data_type != BCH_DATA_USER || !bucket_sectors_used(m) || bucket_sectors_used(m) >= ca->mi.bucket_size) continue; e = (struct copygc_heap_entry) { .gen = m.gen, .sectors = bucket_sectors_used(m), .offset = bucket_to_sector(ca, b), }; heap_add_or_replace(h, e, -sectors_used_cmp, NULL); } up_read(&ca->bucket_lock); up_read(&c->gc_lock); for (i = h->data; i < h->data + h->used; i++) sectors_to_move += i->sectors; while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); sectors_to_move -= e.sectors; } buckets_to_move = h->used; if (!buckets_to_move) return; eytzinger0_sort(h->data, h->used, sizeof(h->data[0]), bucket_offset_cmp, NULL); ret = bch2_move_data(c, &ca->copygc_pd.rate, writepoint_ptr(&ca->copygc_write_point), POS_MIN, POS_MAX, copygc_pred, ca, &move_stats); down_read(&ca->bucket_lock); buckets = bucket_array(ca); for (i = h->data; i < h->data + h->used; i++) { size_t b = sector_to_bucket(ca, i->offset); struct bucket_mark m = READ_ONCE(buckets->b[b].mark); if (i->gen == m.gen && bucket_sectors_used(m)) { sectors_not_moved += bucket_sectors_used(m); buckets_not_moved++; } } up_read(&ca->bucket_lock); if (sectors_not_moved && !ret) bch_warn_ratelimited(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", sectors_not_moved, sectors_to_move, buckets_not_moved, buckets_to_move); trace_copygc(ca, atomic64_read(&move_stats.sectors_moved), sectors_not_moved, buckets_to_move, buckets_not_moved); } static int bch2_copygc_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; struct io_clock *clock = &c->io_clock[WRITE]; struct bch_dev_usage usage; unsigned long last; u64 available, fragmented, reserve, next; set_freezable(); while (!kthread_should_stop()) { if (kthread_wait_freezable(c->copy_gc_enabled)) break; last = atomic_long_read(&clock->now); reserve = ca->copygc_threshold; usage = bch2_dev_usage_read(c, ca); available = __dev_buckets_available(ca, usage) * ca->mi.bucket_size; if (available > reserve) { next = last + available - reserve; bch2_kthread_io_clock_wait(clock, next, MAX_SCHEDULE_TIMEOUT); continue; } /* * don't start copygc until there's more than half the copygc * reserve of fragmented space: */ fragmented = usage.sectors_fragmented; if (fragmented < reserve) { next = last + reserve - fragmented; bch2_kthread_io_clock_wait(clock, next, MAX_SCHEDULE_TIMEOUT); continue; } bch2_copygc(c, ca); } return 0; } void bch2_copygc_stop(struct bch_dev *ca) { ca->copygc_pd.rate.rate = UINT_MAX; bch2_ratelimit_reset(&ca->copygc_pd.rate); if (ca->copygc_thread) { kthread_stop(ca->copygc_thread); put_task_struct(ca->copygc_thread); } ca->copygc_thread = NULL; } int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) { struct task_struct *t; BUG_ON(ca->copygc_thread); if (c->opts.nochanges) return 0; if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; t = kthread_create(bch2_copygc_thread, ca, "bch_copygc[%s]", ca->name); if (IS_ERR(t)) return PTR_ERR(t); get_task_struct(t); ca->copygc_thread = t; wake_up_process(ca->copygc_thread); return 0; } void bch2_dev_copygc_init(struct bch_dev *ca) { bch2_pd_controller_init(&ca->copygc_pd); ca->copygc_pd.d_term = 0; }