Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-10 15:10:38 +00:00
Merge branch 'bcache' (bcache fixes from Kent Overstreet)
Merge bcache fixes from Kent Overstreet:

 "There are fixes for _three_ different data corruption bugs, all of which
  were found by users hitting them in the wild.

  The first one isn't bcache specific - in 3.11 bcache was switched to
  bio_copy_data() in fs/bio.c, and that's when the bug in that code was
  discovered, but it's also used by raid1 and pktcdvd.  (That was my code
  too, so the bug's doubly embarrassing given that it was or should've been
  just a cut and paste from bcache code.  Dunno what happened there.)

  Most of these (all the non-data-corruption bugs, actually) were ready
  before the merge window and have been sitting in Jens' tree, but I don't
  know what's been up with him lately..."

* emailed patches from Kent Overstreet <kmo@daterainc.com>:
  bcache: Fix flushes in writeback mode
  bcache: Fix for handling overlapping extents when reading in a btree node
  bcache: Fix a shrinker deadlock
  bcache: Fix a dumb CPU spinning bug in writeback
  bcache: Fix a flush/fua performance bug
  bcache: Fix a writeback performance regression
  bcache: Correct printf()-style format length modifier
  bcache: Fix for when no journal entries are found
  bcache: Strip endline when writing the label through sysfs
  bcache: Fix a dumb journal discard bug
  block: Fix bio_copy_data()
commit e288e931c1
drivers/md/bcache/bcache.h

@@ -498,7 +498,7 @@ struct cached_dev {
         */
        atomic_t                has_dirty;

-       struct ratelimit        writeback_rate;
+       struct bch_ratelimit    writeback_rate;
        struct delayed_work     writeback_rate_update;

        /*
@@ -507,10 +507,9 @@ struct cached_dev {
         */
        sector_t                last_read;

-       /* Number of writeback bios in flight */
-       atomic_t                in_flight;
+       /* Limit number of writeback bios in flight */
+       struct semaphore        in_flight;
        struct closure_with_timer writeback;
-       struct closure_waitlist writeback_wait;

        struct keybuf           writeback_keys;

drivers/md/bcache/bset.c

@@ -926,28 +926,45 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)

 /* Mergesort */

+static void sort_key_next(struct btree_iter *iter,
+                          struct btree_iter_set *i)
+{
+        i->k = bkey_next(i->k);
+
+        if (i->k == i->end)
+                *i = iter->data[--iter->used];
+}
+
 static void btree_sort_fixup(struct btree_iter *iter)
 {
         while (iter->used > 1) {
                 struct btree_iter_set *top = iter->data, *i = top + 1;
-                struct bkey *k;

                 if (iter->used > 2 &&
                     btree_iter_cmp(i[0], i[1]))
                         i++;

-                for (k = i->k;
-                     k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
-                     k = bkey_next(k))
-                        if (top->k > i->k)
-                                __bch_cut_front(top->k, k);
-                        else if (KEY_SIZE(k))
-                                bch_cut_back(&START_KEY(k), top->k);
-
-                if (top->k < i->k || k == i->k)
+                if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
                         break;

-                heap_sift(iter, i - top, btree_iter_cmp);
+                if (!KEY_SIZE(i->k)) {
+                        sort_key_next(iter, i);
+                        heap_sift(iter, i - top, btree_iter_cmp);
+                        continue;
+                }
+
+                if (top->k > i->k) {
+                        if (bkey_cmp(top->k, i->k) >= 0)
+                                sort_key_next(iter, i);
+                        else
+                                bch_cut_front(top->k, i->k);
+
+                        heap_sift(iter, i - top, btree_iter_cmp);
+                } else {
+                        /* can't happen because of comparison func */
+                        BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+                        bch_cut_back(&START_KEY(i->k), top->k);
+                }
         }
 }

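The rewritten fixup only ever trims or skips the key at the top of the adjacent iterator, instead of the old loop that tried to cut pieces out of keys deeper in the node. As a rough illustration of the invariant being enforced (the key sorting first must not overlap the start of the next one), here is a small, self-contained userspace sketch; the extent type, field names and helper are invented for the illustration and are not the bcache data structures, which encode keys as (end offset, size) rather than [start, end).

/*
 * Minimal userspace sketch of the overlap fixup idea above, NOT bcache code:
 * extents are modelled as half-open [start, end) ranges.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct extent {
        uint64_t start;
        uint64_t end;   /* exclusive */
};

/*
 * Make "next" consistent with "top", which sorts first: either they do not
 * overlap, "next" is swallowed whole (return 0 to drop it), or the front of
 * "next" is trimmed so it begins where "top" ends.
 */
static int fixup_next(const struct extent *top, struct extent *next)
{
        if (top->end <= next->start)
                return 1;               /* no overlap, keep as is */
        if (top->end >= next->end)
                return 0;               /* fully covered, drop it */
        next->start = top->end;         /* cut the overlapping front */
        return 1;
}

int main(void)
{
        struct extent top     = { .start = 0,  .end = 100 };
        struct extent next    = { .start = 50, .end = 150 };
        struct extent covered = { .start = 20, .end = 80 };

        assert(fixup_next(&top, &next) == 1);
        printf("next trimmed to [%llu, %llu)\n",
               (unsigned long long)next.start,
               (unsigned long long)next.end);

        assert(fixup_next(&top, &covered) == 0);        /* would be skipped */
        return 0;
}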
drivers/md/bcache/btree.c

@@ -255,7 +255,7 @@ void bch_btree_node_read(struct btree *b)

        return;
err:
-       bch_cache_set_error(b->c, "io error reading bucket %lu",
+       bch_cache_set_error(b->c, "io error reading bucket %zu",
                            PTR_BUCKET_NR(b->c, &b->key, 0));
 }

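The only change here is the printf()-style length modifier: the bucket number evaluates to a size_t in this tree, and size_t must be printed with %zu rather than %lu to stay correct on every architecture. A trivial userspace reminder of the rule (plain printf rather than the kernel's printk, but the format rules are the same):

#include <stddef.h>
#include <stdio.h>

int main(void)
{
        size_t bucket = 42;

        /* %zu is the portable conversion for size_t; %lu only happens to
         * work where size_t and unsigned long share a size. */
        printf("io error reading bucket %zu\n", bucket);
        return 0;
}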
@@ -612,7 +612,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
                return SHRINK_STOP;

        /* Return -1 if we can't do anything right now */
-       if (sc->gfp_mask & __GFP_WAIT)
+       if (sc->gfp_mask & __GFP_IO)
                mutex_lock(&c->bucket_lock);
        else if (!mutex_trylock(&c->bucket_lock))
                return -1;
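The fix tightens the condition under which the shrinker is allowed to sleep on bucket_lock: __GFP_WAIT is still set for allocations that may be issued while bucket_lock is held, so blocking there could recurse into reclaim and deadlock, and __GFP_IO is the safer gate. The general pattern, "block only if the caller can tolerate blocking, otherwise trylock and bail out", can be sketched with plain pthreads; the flag name and helper below are invented for the illustration, not kernel API.

/* Userspace sketch of "lock if we may block, trylock otherwise". */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAY_BLOCK 0x1   /* stand-in for a gfp-style "blocking allowed" bit */

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns true if the lock was taken, false if the caller should give up. */
static bool shrink_lock(unsigned flags)
{
        if (flags & MAY_BLOCK) {
                pthread_mutex_lock(&bucket_lock);
                return true;
        }
        return pthread_mutex_trylock(&bucket_lock) == 0;
}

int main(void)
{
        if (shrink_lock(MAY_BLOCK)) {
                /* ... free some cached objects ... */
                pthread_mutex_unlock(&bucket_lock);
        }

        if (shrink_lock(0))             /* non-blocking path */
                pthread_mutex_unlock(&bucket_lock);
        else
                printf("can't do anything right now\n");
        return 0;
}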
drivers/md/bcache/journal.c

@@ -153,7 +153,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
                bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
                pr_debug("%u journal buckets", ca->sb.njournal_buckets);

-               /* Read journal buckets ordered by golden ratio hash to quickly
+               /*
+                * Read journal buckets ordered by golden ratio hash to quickly
                 * find a sequence of buckets with valid journal entries
                 */
                for (i = 0; i < ca->sb.njournal_buckets; i++) {
@@ -166,18 +167,20 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
                                goto bsearch;
                }

-               /* If that fails, check all the buckets we haven't checked
+               /*
+                * If that fails, check all the buckets we haven't checked
                 * already
                 */
                pr_debug("falling back to linear search");

-               for (l = 0; l < ca->sb.njournal_buckets; l++) {
-                       if (test_bit(l, bitmap))
-                               continue;
-
+               for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
+                    l < ca->sb.njournal_buckets;
+                    l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
                        if (read_bucket(l))
                                goto bsearch;
-               }
+
+               if (list_empty(list))
+                       continue;
bsearch:
                /* Binary search */
                m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
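The rewritten loop walks only the journal buckets that have not been read yet by jumping from one clear bit to the next instead of testing every index. A self-contained userspace approximation of that bit iteration (the helper below is a simplified stand-in for the kernel's find_first_zero_bit()/find_next_zero_bit(), operating on a single word for brevity):

#include <stdio.h>

#define NBITS 16

/* Simplified stand-in for the kernel bitmap helpers, one word only. */
static unsigned find_next_zero(unsigned long map, unsigned size, unsigned from)
{
        for (unsigned i = from; i < size; i++)
                if (!(map & (1UL << i)))
                        return i;
        return size;            /* no zero bit found */
}

int main(void)
{
        unsigned long read_bitmap = 0xadcb;     /* set bit = already read */

        for (unsigned l = find_next_zero(read_bitmap, NBITS, 0);
             l < NBITS;
             l = find_next_zero(read_bitmap, NBITS, l + 1))
                printf("would read journal bucket %u\n", l);

        return 0;
}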
@@ -197,10 +200,12 @@ bsearch:
                        r = m;
                }

-               /* Read buckets in reverse order until we stop finding more
+               /*
+                * Read buckets in reverse order until we stop finding more
                 * journal entries
                 */
-               pr_debug("finishing up");
+               pr_debug("finishing up: m %u njournal_buckets %u",
+                        m, ca->sb.njournal_buckets);
                l = m;

                while (1) {
@@ -228,9 +233,10 @@ bsearch:
                }
        }

-       c->journal.seq = list_entry(list->prev,
-                                   struct journal_replay,
-                                   list)->j.seq;
+       if (!list_empty(list))
+               c->journal.seq = list_entry(list->prev,
+                                           struct journal_replay,
+                                           list)->j.seq;

        return 0;
#undef read_bucket
@@ -428,7 +434,7 @@ static void do_journal_discard(struct cache *ca)
                return;
        }

-       switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) {
+       switch (atomic_read(&ja->discard_in_flight)) {
        case DISCARD_IN_FLIGHT:
                return;

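The bug here is subtle: the old code switched on the result of a comparison, so the controlling expression was always 0 or 1, and any case label whose value is not 0 or 1 could never match. A tiny standalone example of why `switch (x == CONST)` is almost never what is wanted (the enum values below are invented for the demo):

#include <stdio.h>

enum discard_state { DISCARD_READY = 0, DISCARD_IN_FLIGHT = 1, DISCARD_DONE = 2 };

static const char *classify_buggy(enum discard_state state)
{
        switch (state == DISCARD_IN_FLIGHT) {   /* always 0 or 1 */
        case DISCARD_IN_FLIGHT:                 /* matches the value 1 */
                return "in flight";
        case DISCARD_DONE:                      /* value 2: unreachable */
                return "done";
        default:
                return "ready?";
        }
}

static const char *classify_fixed(enum discard_state state)
{
        switch (state) {                        /* switch on the value itself */
        case DISCARD_IN_FLIGHT:
                return "in flight";
        case DISCARD_DONE:
                return "done";
        default:
                return "ready";
        }
}

int main(void)
{
        /* The buggy version can never report "done". */
        printf("buggy: %s\n", classify_buggy(DISCARD_DONE));
        printf("fixed: %s\n", classify_fixed(DISCARD_DONE));
        return 0;
}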
@@ -689,6 +695,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl)
                if (cl)
                        BUG_ON(!closure_wait(&w->wait, cl));

+               closure_flush(&c->journal.io);
                __journal_try_write(c, true);
        }
 }
drivers/md/bcache/request.c

@@ -997,14 +997,17 @@ static void request_write(struct cached_dev *dc, struct search *s)
        } else {
                bch_writeback_add(dc);

-               if (s->op.flush_journal) {
+               if (bio->bi_rw & REQ_FLUSH) {
                        /* Also need to send a flush to the backing device */
-                       s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
-                                                          dc->disk.bio_split);
+                       struct bio *flush = bio_alloc_bioset(0, GFP_NOIO,
+                                                            dc->disk.bio_split);

-                       bio->bi_size = 0;
-                       bio->bi_vcnt = 0;
-                       closure_bio_submit(bio, cl, s->d);
+                       flush->bi_rw      = WRITE_FLUSH;
+                       flush->bi_bdev    = bio->bi_bdev;
+                       flush->bi_end_io  = request_endio;
+                       flush->bi_private = cl;
+
+                       closure_bio_submit(flush, cl, s->d);
                } else {
                        s->op.cache_bio = bio;
                }
drivers/md/bcache/sysfs.c

@@ -223,8 +223,13 @@ STORE(__cached_dev)
        }

        if (attr == &sysfs_label) {
-               /* note: endlines are preserved */
-               memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
+               if (size > SB_LABEL_SIZE)
+                       return -EINVAL;
+               memcpy(dc->sb.label, buf, size);
+               if (size < SB_LABEL_SIZE)
+                       dc->sb.label[size] = '\0';
+               if (size && dc->sb.label[size - 1] == '\n')
+                       dc->sb.label[size - 1] = '\0';
                bch_write_bdev_super(dc, NULL);
                if (dc->disk.c) {
                        memcpy(dc->disk.c->uuids[dc->disk.id].label,
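The new sysfs store path bounds-checks the incoming size, NUL-terminates when there is room, and strips a single trailing newline (echo appends one). A standalone userspace equivalent of that sequence, with an invented function name and a plain char buffer standing in for dc->sb.label:

#include <stdio.h>
#include <string.h>

#define SB_LABEL_SIZE 32

/* Userspace sketch of the sysfs label store logic above. */
static int set_label(char label[SB_LABEL_SIZE], const char *buf, size_t size)
{
        if (size > SB_LABEL_SIZE)
                return -1;                      /* -EINVAL in the kernel */

        memcpy(label, buf, size);
        if (size < SB_LABEL_SIZE)
                label[size] = '\0';
        if (size && label[size - 1] == '\n')
                label[size - 1] = '\0';         /* strip echo's newline */
        return 0;
}

int main(void)
{
        char label[SB_LABEL_SIZE];

        set_label(label, "mylabel\n", strlen("mylabel\n"));
        printf("label = \"%s\"\n", label);      /* prints without the \n */
        return 0;
}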
drivers/md/bcache/util.c

@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
        stats->last = now ?: 1;
 }

-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
        uint64_t now = local_clock();

drivers/md/bcache/util.h

@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units)
        (ewma) >> factor;                                               \
 })

-struct ratelimit {
+struct bch_ratelimit {
+       /* Next time we want to do some work, in nanoseconds */
        uint64_t                next;
+
+       /*
+        * Rate at which we want to do work, in units per nanosecond
+        * The units here correspond to the units passed to bch_next_delay()
+        */
        unsigned                rate;
 };

-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
 {
        d->next = local_clock();
 }

-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);

 #define __DIV_SAFE(n, d, zero)                                         \
 ({                                                                     \
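Per the new comments, `next` is an absolute time in nanoseconds before which no more work should be issued and `rate` is the amount of work allowed per nanosecond, so bch_next_delay() can push `next` forward by done/rate and report how far in the future it now lies. A rough userspace model of that arithmetic follows; it is a sketch with invented names that returns nanoseconds rather than jiffies, and the real kernel function additionally has to cope with clock jumps and with falling far behind.

/*
 * Simplified userspace model of a bch_ratelimit-style throttle; NOT the
 * kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct toy_ratelimit {
        uint64_t next;          /* absolute time, ns, before which we wait */
        unsigned rate;          /* work units allowed per nanosecond */
};

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Account for `done` units of work and return how long to sleep, in ns. */
static uint64_t toy_next_delay(struct toy_ratelimit *d, uint64_t done)
{
        uint64_t now = now_ns();

        d->next += done / (d->rate ? d->rate : 1);

        return d->next > now ? d->next - now : 0;
}

int main(void)
{
        struct toy_ratelimit rl = { .next = now_ns(), .rate = 2 };

        /* 10^9 units at 2 units/ns => roughly half a second of delay. */
        printf("delay ~%llu ns\n",
               (unsigned long long)toy_next_delay(&rl, 1000000000ULL));
        return 0;
}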
drivers/md/bcache/writeback.c

@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work)

 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
+       uint64_t ret;
+
        if (atomic_read(&dc->disk.detaching) ||
            !dc->writeback_percent)
                return 0;

-       return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+       ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+       return min_t(uint64_t, ret, HZ);
 }

 /* Background writeback */
@@ -208,7 +212,7 @@ normal_refill:

        up_write(&dc->writeback_lock);

-       ratelimit_reset(&dc->writeback_rate);
+       bch_ratelimit_reset(&dc->writeback_rate);

        /* Punt to workqueue only so we don't recurse and blow the stack */
        continue_at(cl, read_dirty, dirty_wq);
@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl)
        }

        bch_keybuf_del(&dc->writeback_keys, w);
-       atomic_dec_bug(&dc->in_flight);
-
-       closure_wake_up(&dc->writeback_wait);
+       up(&dc->in_flight);

        closure_return_with_destructor(cl, dirty_io_destructor);
 }
@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl)

        closure_bio_submit(&io->bio, cl, &io->dc->disk);

-       continue_at(cl, write_dirty_finish, dirty_wq);
+       continue_at(cl, write_dirty_finish, system_wq);
 }

 static void read_dirty_endio(struct bio *bio, int error)
@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl)

        closure_bio_submit(&io->bio, cl, &io->dc->disk);

-       continue_at(cl, write_dirty, dirty_wq);
+       continue_at(cl, write_dirty, system_wq);
 }

 static void read_dirty(struct closure *cl)
@@ -394,12 +396,8 @@ static void read_dirty(struct closure *cl)

                if (delay > 0 &&
                    (KEY_START(&w->key) != dc->last_read ||
-                    jiffies_to_msecs(delay) > 50)) {
-                       w->private = NULL;
-
-                       closure_delay(&dc->writeback, delay);
-                       continue_at(cl, read_dirty, dirty_wq);
-               }
+                    jiffies_to_msecs(delay) > 50))
+                       delay = schedule_timeout_uninterruptible(delay);

                dc->last_read   = KEY_OFFSET(&w->key);

@@ -424,15 +422,10 @@ static void read_dirty(struct closure *cl)

                trace_bcache_writeback(&w->key);

-               closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+               down(&dc->in_flight);
+               closure_call(&io->cl, read_dirty_submit, NULL, cl);

                delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
-               atomic_inc(&dc->in_flight);
-
-               if (!closure_wait_event(&dc->writeback_wait, cl,
-                                       atomic_read(&dc->in_flight) < 64))
-                       continue_at(cl, read_dirty, dirty_wq);
        }

        if (0) {
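The writeback path now bounds the number of dirty-data I/Os it keeps in flight with a counting semaphore initialised to 64: the submitter does down() before dispatching each I/O and the completion path does up(), so the loop simply sleeps once 64 operations are outstanding instead of the old atomic counter plus wait-list dance. The same shape in portable userspace code (POSIX semaphores and threads standing in for bcache's closures; the numbers and names are illustrative):

/* Userspace sketch: cap concurrent "writeback" work items with a semaphore. */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_IN_FLIGHT 4         /* bcache uses 64 */
#define TOTAL_JOBS    16

static sem_t in_flight;

static void *do_io(void *arg)
{
        long id = (long)arg;

        usleep(10000);                  /* pretend to do the I/O */
        printf("job %ld done\n", id);
        sem_post(&in_flight);           /* up(): free a slot on completion */
        return NULL;
}

int main(void)
{
        pthread_t tids[TOTAL_JOBS];

        sem_init(&in_flight, 0, MAX_IN_FLIGHT);

        for (long i = 0; i < TOTAL_JOBS; i++) {
                sem_wait(&in_flight);   /* down(): blocks at MAX_IN_FLIGHT */
                pthread_create(&tids[i], NULL, do_io, (void *)i);
        }

        for (long i = 0; i < TOTAL_JOBS; i++)
                pthread_join(tids[i], NULL);

        sem_destroy(&in_flight);
        return 0;
}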
@@ -442,7 +435,11 @@ err:
                bch_keybuf_del(&dc->writeback_keys, w);
        }

-       refill_dirty(cl);
+       /*
+        * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+        * freed) before refilling again
+        */
+       continue_at(cl, refill_dirty, dirty_wq);
 }

 /* Init */
@@ -484,6 +481,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)

 void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
+       sema_init(&dc->in_flight, 64);
        closure_init_unlocked(&dc->writeback);
        init_rwsem(&dc->writeback_lock);

@@ -513,7 +511,7 @@ void bch_writeback_exit(void)

 int __init bch_writeback_init(void)
 {
-       dirty_wq = create_singlethread_workqueue("bcache_writeback");
+       dirty_wq = create_workqueue("bcache_writeback");
        if (!dirty_wq)
                return -ENOMEM;

fs/bio.c
@@ -917,8 +917,8 @@ void bio_copy_data(struct bio *dst, struct bio *src)
                src_p = kmap_atomic(src_bv->bv_page);
                dst_p = kmap_atomic(dst_bv->bv_page);

-               memcpy(dst_p + dst_bv->bv_offset,
-                      src_p + src_bv->bv_offset,
+               memcpy(dst_p + dst_offset,
+                      src_p + src_offset,
                       bytes);

                kunmap_atomic(dst_p);
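This is the data corruption fix in fs/bio.c: when copying between two bios whose vectors are split at different boundaries, the loop already maintains running offsets (dst_offset/src_offset) into the current pages, but the memcpy kept using the bvec's own bv_offset, so any segment that had been partially consumed was copied from or to the wrong place. A compact userspace model of copying across two independently segmented buffers, showing why the running offsets, not the segments' base offsets, must be used (the segment struct and helper are invented for the illustration):

/*
 * Userspace model of copying between two "segment lists" split at different
 * boundaries, like bio_copy_data() copying bio vecs.  NOT the kernel code.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct seg {
        char   *base;           /* like the mapped page */
        size_t  off;            /* like bv_offset */
        size_t  len;            /* like bv_len */
};

static void copy_segs(struct seg *dst, size_t ndst, struct seg *src, size_t nsrc)
{
        size_t di = 0, si = 0, dst_off = 0, src_off = 0;

        while (di < ndst && si < nsrc) {
                size_t dleft = dst[di].len - dst_off;
                size_t sleft = src[si].len - src_off;
                size_t bytes = dleft < sleft ? dleft : sleft;

                /* The running offsets matter: using only dst[di].off and
                 * src[si].off here (the pre-fix bug) would rewind to the
                 * start of a partially copied segment. */
                memcpy(dst[di].base + dst[di].off + dst_off,
                       src[si].base + src[si].off + src_off,
                       bytes);

                dst_off += bytes;
                src_off += bytes;
                if (dst_off == dst[di].len) { di++; dst_off = 0; }
                if (src_off == src[si].len) { si++; src_off = 0; }
        }
}

int main(void)
{
        char a[8] = "abcdefgh", b[8];
        /* Source split 3+5, destination split 5+3. */
        struct seg src[] = { { a, 0, 3 }, { a + 3, 0, 5 } };
        struct seg dst[] = { { b, 0, 5 }, { b + 5, 0, 3 } };

        copy_segs(dst, 2, src, 2);
        assert(memcmp(a, b, 8) == 0);
        printf("copied: %.8s\n", b);
        return 0;
}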