2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
|
|
|
|
*
|
|
|
|
* This file is released under the GPL.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/device-mapper.h>
|
2009-01-06 03:04:54 +00:00
|
|
|
#include <linux/delay.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/kdev_t.h>
|
|
|
|
#include <linux/list.h>
|
2019-03-17 14:22:57 +02:00
|
|
|
#include <linux/list_bl.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/mempool.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/vmalloc.h>
|
2007-10-19 22:38:44 +01:00
|
|
|
#include <linux/log2.h>
|
2008-04-24 22:02:01 +01:00
|
|
|
#include <linux/dm-kcopyd.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2015-02-26 11:40:35 -05:00
|
|
|
#include "dm.h"
|
|
|
|
|
2009-01-06 03:05:15 +00:00
|
|
|
#include "dm-exception-store.h"
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-06-26 00:27:35 -07:00
|
|
|
#define DM_MSG_PREFIX "snapshots"
|
|
|
|
|
2009-12-10 23:52:30 +00:00
|
|
|
static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
|
|
|
|
|
|
|
|
#define dm_target_is_snapshot_merge(ti) \
|
|
|
|
((ti)->type->name == dm_snapshot_merge_target_name)
|
|
|
|
|
2008-07-21 12:00:32 +01:00
|
|
|
/*
|
|
|
|
* The size of the mempool used to track chunks in use.
|
|
|
|
*/
|
|
|
|
#define MIN_IOS 256
|
|
|
|
|
2009-04-02 19:55:34 +01:00
|
|
|
#define DM_TRACKED_CHUNK_HASH_SIZE 16
|
|
|
|
#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
|
|
|
|
(DM_TRACKED_CHUNK_HASH_SIZE - 1))
|
|
|
|
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception_table {
|
2009-04-02 19:55:34 +01:00
|
|
|
uint32_t hash_mask;
|
|
|
|
unsigned hash_shift;
|
2019-03-17 14:22:57 +02:00
|
|
|
struct hlist_bl_head *table;
|
2009-04-02 19:55:34 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
struct dm_snapshot {
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
struct rw_semaphore lock;
|
2009-04-02 19:55:34 +01:00
|
|
|
|
|
|
|
struct dm_dev *origin;
|
2009-12-10 23:52:12 +00:00
|
|
|
struct dm_dev *cow;
|
|
|
|
|
|
|
|
struct dm_target *ti;
|
2009-04-02 19:55:34 +01:00
|
|
|
|
|
|
|
/* List of snapshots per Origin */
|
|
|
|
struct list_head list;
|
|
|
|
|
2009-12-10 23:52:35 +00:00
|
|
|
/*
|
|
|
|
* You can't use a snapshot if this is 0 (e.g. if full).
|
|
|
|
* A snapshot-merge target never clears this.
|
|
|
|
*/
|
2009-04-02 19:55:34 +01:00
|
|
|
int valid;
|
|
|
|
|
2015-06-21 16:31:33 -04:00
|
|
|
/*
|
|
|
|
* The snapshot overflowed because of a write to the snapshot device.
|
|
|
|
* We don't have to invalidate the snapshot in this case, but we need
|
|
|
|
* to prevent further writes.
|
|
|
|
*/
|
|
|
|
int snapshot_overflowed;
|
|
|
|
|
2009-04-02 19:55:34 +01:00
|
|
|
/* Origin writes don't trigger exceptions until this is set */
|
|
|
|
int active;
|
|
|
|
|
|
|
|
atomic_t pending_exceptions_count;
|
|
|
|
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
spinlock_t pe_allocation_lock;
|
|
|
|
|
|
|
|
/* Protected by "pe_allocation_lock" */
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
sector_t exception_start_sequence;
|
|
|
|
|
|
|
|
/* Protected by kcopyd single-threaded callback */
|
|
|
|
sector_t exception_complete_sequence;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A list of pending exceptions that completed out of order.
|
|
|
|
* Protected by kcopyd single-threaded callback.
|
|
|
|
*/
|
2018-08-07 16:56:00 -04:00
|
|
|
struct rb_root out_of_order_tree;
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
|
2018-05-20 18:25:53 -04:00
|
|
|
mempool_t pending_pool;
|
2010-03-06 02:32:33 +00:00
|
|
|
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception_table pending;
|
|
|
|
struct dm_exception_table complete;
|
2009-04-02 19:55:34 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* pe_lock protects all pending_exception operations and access
|
|
|
|
* as well as the snapshot_bios list.
|
|
|
|
*/
|
|
|
|
spinlock_t pe_lock;
|
|
|
|
|
2010-03-06 02:32:33 +00:00
|
|
|
/* Chunks with outstanding reads */
|
|
|
|
spinlock_t tracked_chunk_lock;
|
|
|
|
struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
|
|
|
|
|
2009-04-02 19:55:34 +01:00
|
|
|
/* The on disk metadata handler */
|
|
|
|
struct dm_exception_store *store;
|
|
|
|
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
unsigned in_progress;
|
|
|
|
struct wait_queue_head in_progress_wait;
|
dm snapshot: Fix excessive memory usage and workqueue stalls
kcopyd has no upper limit to the number of jobs one can allocate and
issue. Under certain workloads this can lead to excessive memory usage
and workqueue stalls. For example, when creating multiple dm-snapshot
targets with a 4K chunk size and then writing to the origin through the
page cache. Syncing the page cache causes a large number of BIOs to be
issued to the dm-snapshot origin target, which itself issues an even
larger (because of the BIO splitting taking place) number of kcopyd
jobs.
Running the following test, from the device mapper test suite [1],
dmtest run --suite snapshot -n many_snapshots_of_same_volume_N
, with 8 active snapshots, results in the kcopyd job slab cache growing
to 10G. Depending on the available system RAM this can lead to the OOM
killer killing user processes:
[463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP),
nodemask=(null), order=1, oom_score_adj=0
[463.492894] kthreadd cpuset=/ mems_allowed=0
[463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3
[463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[463.492952] Call Trace:
[463.492964] dump_stack+0x7d/0xbb
[463.492973] dump_header+0x6b/0x2fc
[463.492987] ? lockdep_hardirqs_on+0xee/0x190
[463.493012] oom_kill_process+0x302/0x370
[463.493021] out_of_memory+0x113/0x560
[463.493030] __alloc_pages_slowpath+0xf40/0x1020
[463.493055] __alloc_pages_nodemask+0x348/0x3c0
[463.493067] cache_grow_begin+0x81/0x8b0
[463.493072] ? cache_grow_begin+0x874/0x8b0
[463.493078] fallback_alloc+0x1e4/0x280
[463.493092] kmem_cache_alloc_node+0xd6/0x370
[463.493098] ? copy_process.part.31+0x1c5/0x20d0
[463.493105] copy_process.part.31+0x1c5/0x20d0
[463.493115] ? __lock_acquire+0x3cc/0x1550
[463.493121] ? __switch_to_asm+0x34/0x70
[463.493129] ? kthread_create_worker_on_cpu+0x70/0x70
[463.493135] ? finish_task_switch+0x90/0x280
[463.493165] _do_fork+0xe0/0x6d0
[463.493191] ? kthreadd+0x19f/0x220
[463.493233] kernel_thread+0x25/0x30
[463.493235] kthreadd+0x1bf/0x220
[463.493242] ? kthread_create_on_cpu+0x90/0x90
[463.493248] ret_from_fork+0x3a/0x50
[463.493279] Mem-Info:
[463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0
[463.493285] active_file:80216 inactive_file:80107 isolated_file:435
[463.493285] unevictable:0 dirty:51266 writeback:109372 unstable:0
[463.493285] slab_reclaimable:31191 slab_unreclaimable:3483521
[463.493285] mapped:526 shmem:4903 pagetables:1759 bounce:0
[463.493285] free:33623 free_pcp:2392 free_cma:0
...
[463.493489] Unreclaimable slab info:
[463.493513] Name Used Total
[463.493522] bio-6 1028KB 1028KB
[463.493525] bio-5 1028KB 1028KB
[463.493528] dm_snap_pending_exception 236783KB 243789KB
[463.493531] dm_exception 41KB 42KB
[463.493534] bio-4 1216KB 1216KB
[463.493537] bio-3 439396KB 439396KB
[463.493539] kcopyd_job 6973427KB 6973427KB
...
[463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child
[463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB
[463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
Moreover, issuing a large number of kcopyd jobs results in kcopyd
hogging the CPU, while processing them. As a result, processing of work
items, queued for execution on the same CPU as the currently running
kcopyd thread, is stalled for long periods of time, hurting performance.
Running the aforementioned test we get, in dmesg, messages like the
following:
[67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s!
[67501.195586] Showing busy workqueues and worker pools:
[67501.195591] workqueue events: flags=0x0
[67501.195597] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195611] pending: cache_reap
[67501.195641] workqueue mm_percpu_wq: flags=0x8
[67501.195645] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195656] pending: vmstat_update
[67501.195682] workqueue kblockd: flags=0x18
[67501.195687] pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256
[67501.195698] pending: blk_timeout_work
[67501.195753] workqueue kcopyd: flags=0x8
[67501.195757] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195768] pending: do_work [dm_mod]
[67501.195802] workqueue kcopyd: flags=0x8
[67501.195806] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195817] pending: do_work [dm_mod]
[67501.195834] workqueue kcopyd: flags=0x8
[67501.195838] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195848] pending: do_work [dm_mod]
[67501.195881] workqueue kcopyd: flags=0x8
[67501.195885] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195896] pending: do_work [dm_mod]
[67501.195920] workqueue kcopyd: flags=0x8
[67501.195924] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256
[67501.195935] in-flight: 67:do_work [dm_mod]
[67501.195945] pending: do_work [dm_mod]
[67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765
The root cause for these issues is the way dm-snapshot uses kcopyd. In
particular, the lack of an explicit or implicit limit to the maximum
number of in-flight COW jobs. The merging path is not affected because
it implicitly limits the in-flight kcopyd jobs to one.
Fix these issues by using a semaphore to limit the maximum number of
in-flight kcopyd jobs. We grab the semaphore before allocating a new
kcopyd job in start_copy() and start_full_bio() and release it after the
job finishes in copy_callback().
The initial semaphore value is configurable through a module parameter,
to allow fine tuning the maximum number of in-flight COW jobs. Setting
this parameter to zero initializes the semaphore to INT_MAX.
A default value of 2048 maximum in-flight kcopyd jobs was chosen. This
value was decided experimentally as a trade-off between memory
consumption, stalling the kernel's workqueues and maintaining a high
enough throughput.
Re-running the aforementioned test:
* Workqueue stalls are eliminated
* kcopyd's job slab cache uses a maximum of 130MB
* The time taken by the test to write to the snapshot-origin target is
reduced from 05m20.48s to 03m26.38s
[1] https://github.com/jthornber/device-mapper-test-suite
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2018-10-31 17:53:08 -04:00
|
|
|
|
2009-04-02 19:55:34 +01:00
|
|
|
struct dm_kcopyd_client *kcopyd_client;
|
|
|
|
|
2010-03-06 02:32:33 +00:00
|
|
|
/* Wait for events based on state_bits */
|
|
|
|
unsigned long state_bits;
|
|
|
|
|
|
|
|
/* Range of chunks currently being merged. */
|
|
|
|
chunk_t first_merging_chunk;
|
|
|
|
int num_merging_chunks;
|
2009-12-10 23:52:32 +00:00
|
|
|
|
2009-12-10 23:52:35 +00:00
|
|
|
/*
|
|
|
|
* The merge operation failed if this flag is set.
|
|
|
|
* Failure modes are handled as follows:
|
|
|
|
* - I/O error reading the header
|
|
|
|
* => don't load the target; abort.
|
|
|
|
* - Header does not have "valid" flag set
|
|
|
|
* => use the origin; forget about the snapshot.
|
|
|
|
* - I/O error when reading exceptions
|
|
|
|
* => don't load the target; abort.
|
|
|
|
* (We can't use the intermediate origin state.)
|
|
|
|
* - I/O error while merging
|
|
|
|
* => stop merging; set merge_failed; process I/O normally.
|
|
|
|
*/
|
2019-06-19 17:05:54 -04:00
|
|
|
bool merge_failed:1;
|
|
|
|
|
|
|
|
bool discard_zeroes_cow:1;
|
|
|
|
bool discard_passdown_origin:1;
|
2009-12-10 23:52:35 +00:00
|
|
|
|
2009-12-10 23:52:33 +00:00
|
|
|
/*
|
|
|
|
* Incoming bios that overlap with chunks being merged must wait
|
|
|
|
* for them to be committed.
|
|
|
|
*/
|
|
|
|
struct bio_list bios_queued_during_merge;
|
2009-04-02 19:55:34 +01:00
|
|
|
};
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
/*
|
|
|
|
* state_bits:
|
|
|
|
* RUNNING_MERGE - Merge operation is in progress.
|
|
|
|
* SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
|
|
|
|
* cleared afterwards.
|
|
|
|
*/
|
|
|
|
#define RUNNING_MERGE 0
|
|
|
|
#define SHUTDOWN_MERGE 1
|
|
|
|
|
dm snapshot: Fix excessive memory usage and workqueue stalls
kcopyd has no upper limit to the number of jobs one can allocate and
issue. Under certain workloads this can lead to excessive memory usage
and workqueue stalls. For example, when creating multiple dm-snapshot
targets with a 4K chunk size and then writing to the origin through the
page cache. Syncing the page cache causes a large number of BIOs to be
issued to the dm-snapshot origin target, which itself issues an even
larger (because of the BIO splitting taking place) number of kcopyd
jobs.
Running the following test, from the device mapper test suite [1],
dmtest run --suite snapshot -n many_snapshots_of_same_volume_N
, with 8 active snapshots, results in the kcopyd job slab cache growing
to 10G. Depending on the available system RAM this can lead to the OOM
killer killing user processes:
[463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP),
nodemask=(null), order=1, oom_score_adj=0
[463.492894] kthreadd cpuset=/ mems_allowed=0
[463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3
[463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[463.492952] Call Trace:
[463.492964] dump_stack+0x7d/0xbb
[463.492973] dump_header+0x6b/0x2fc
[463.492987] ? lockdep_hardirqs_on+0xee/0x190
[463.493012] oom_kill_process+0x302/0x370
[463.493021] out_of_memory+0x113/0x560
[463.493030] __alloc_pages_slowpath+0xf40/0x1020
[463.493055] __alloc_pages_nodemask+0x348/0x3c0
[463.493067] cache_grow_begin+0x81/0x8b0
[463.493072] ? cache_grow_begin+0x874/0x8b0
[463.493078] fallback_alloc+0x1e4/0x280
[463.493092] kmem_cache_alloc_node+0xd6/0x370
[463.493098] ? copy_process.part.31+0x1c5/0x20d0
[463.493105] copy_process.part.31+0x1c5/0x20d0
[463.493115] ? __lock_acquire+0x3cc/0x1550
[463.493121] ? __switch_to_asm+0x34/0x70
[463.493129] ? kthread_create_worker_on_cpu+0x70/0x70
[463.493135] ? finish_task_switch+0x90/0x280
[463.493165] _do_fork+0xe0/0x6d0
[463.493191] ? kthreadd+0x19f/0x220
[463.493233] kernel_thread+0x25/0x30
[463.493235] kthreadd+0x1bf/0x220
[463.493242] ? kthread_create_on_cpu+0x90/0x90
[463.493248] ret_from_fork+0x3a/0x50
[463.493279] Mem-Info:
[463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0
[463.493285] active_file:80216 inactive_file:80107 isolated_file:435
[463.493285] unevictable:0 dirty:51266 writeback:109372 unstable:0
[463.493285] slab_reclaimable:31191 slab_unreclaimable:3483521
[463.493285] mapped:526 shmem:4903 pagetables:1759 bounce:0
[463.493285] free:33623 free_pcp:2392 free_cma:0
...
[463.493489] Unreclaimable slab info:
[463.493513] Name Used Total
[463.493522] bio-6 1028KB 1028KB
[463.493525] bio-5 1028KB 1028KB
[463.493528] dm_snap_pending_exception 236783KB 243789KB
[463.493531] dm_exception 41KB 42KB
[463.493534] bio-4 1216KB 1216KB
[463.493537] bio-3 439396KB 439396KB
[463.493539] kcopyd_job 6973427KB 6973427KB
...
[463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child
[463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB
[463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
Moreover, issuing a large number of kcopyd jobs results in kcopyd
hogging the CPU, while processing them. As a result, processing of work
items, queued for execution on the same CPU as the currently running
kcopyd thread, is stalled for long periods of time, hurting performance.
Running the aforementioned test we get, in dmesg, messages like the
following:
[67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s!
[67501.195586] Showing busy workqueues and worker pools:
[67501.195591] workqueue events: flags=0x0
[67501.195597] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195611] pending: cache_reap
[67501.195641] workqueue mm_percpu_wq: flags=0x8
[67501.195645] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195656] pending: vmstat_update
[67501.195682] workqueue kblockd: flags=0x18
[67501.195687] pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256
[67501.195698] pending: blk_timeout_work
[67501.195753] workqueue kcopyd: flags=0x8
[67501.195757] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195768] pending: do_work [dm_mod]
[67501.195802] workqueue kcopyd: flags=0x8
[67501.195806] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195817] pending: do_work [dm_mod]
[67501.195834] workqueue kcopyd: flags=0x8
[67501.195838] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195848] pending: do_work [dm_mod]
[67501.195881] workqueue kcopyd: flags=0x8
[67501.195885] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195896] pending: do_work [dm_mod]
[67501.195920] workqueue kcopyd: flags=0x8
[67501.195924] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256
[67501.195935] in-flight: 67:do_work [dm_mod]
[67501.195945] pending: do_work [dm_mod]
[67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765
The root cause for these issues is the way dm-snapshot uses kcopyd. In
particular, the lack of an explicit or implicit limit to the maximum
number of in-flight COW jobs. The merging path is not affected because
it implicitly limits the in-flight kcopyd jobs to one.
Fix these issues by using a semaphore to limit the maximum number of
in-flight kcopyd jobs. We grab the semaphore before allocating a new
kcopyd job in start_copy() and start_full_bio() and release it after the
job finishes in copy_callback().
The initial semaphore value is configurable through a module parameter,
to allow fine tuning the maximum number of in-flight COW jobs. Setting
this parameter to zero initializes the semaphore to INT_MAX.
A default value of 2048 maximum in-flight kcopyd jobs was chosen. This
value was decided experimentally as a trade-off between memory
consumption, stalling the kernel's workqueues and maintaining a high
enough throughput.
Re-running the aforementioned test:
* Workqueue stalls are eliminated
* kcopyd's job slab cache uses a maximum of 130MB
* The time taken by the test to write to the snapshot-origin target is
reduced from 05m20.48s to 03m26.38s
[1] https://github.com/jthornber/device-mapper-test-suite
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2018-10-31 17:53:08 -04:00
|
|
|
/*
|
|
|
|
* Maximum number of chunks being copied on write.
|
|
|
|
*
|
|
|
|
* The value was decided experimentally as a trade-off between memory
|
|
|
|
* consumption, stalling the kernel's workqueues and maintaining a high enough
|
|
|
|
* throughput.
|
|
|
|
*/
|
|
|
|
#define DEFAULT_COW_THRESHOLD 2048
|
|
|
|
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
|
|
|
|
module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
|
dm snapshot: Fix excessive memory usage and workqueue stalls
kcopyd has no upper limit to the number of jobs one can allocate and
issue. Under certain workloads this can lead to excessive memory usage
and workqueue stalls. For example, when creating multiple dm-snapshot
targets with a 4K chunk size and then writing to the origin through the
page cache. Syncing the page cache causes a large number of BIOs to be
issued to the dm-snapshot origin target, which itself issues an even
larger (because of the BIO splitting taking place) number of kcopyd
jobs.
Running the following test, from the device mapper test suite [1],
dmtest run --suite snapshot -n many_snapshots_of_same_volume_N
, with 8 active snapshots, results in the kcopyd job slab cache growing
to 10G. Depending on the available system RAM this can lead to the OOM
killer killing user processes:
[463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP),
nodemask=(null), order=1, oom_score_adj=0
[463.492894] kthreadd cpuset=/ mems_allowed=0
[463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3
[463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[463.492952] Call Trace:
[463.492964] dump_stack+0x7d/0xbb
[463.492973] dump_header+0x6b/0x2fc
[463.492987] ? lockdep_hardirqs_on+0xee/0x190
[463.493012] oom_kill_process+0x302/0x370
[463.493021] out_of_memory+0x113/0x560
[463.493030] __alloc_pages_slowpath+0xf40/0x1020
[463.493055] __alloc_pages_nodemask+0x348/0x3c0
[463.493067] cache_grow_begin+0x81/0x8b0
[463.493072] ? cache_grow_begin+0x874/0x8b0
[463.493078] fallback_alloc+0x1e4/0x280
[463.493092] kmem_cache_alloc_node+0xd6/0x370
[463.493098] ? copy_process.part.31+0x1c5/0x20d0
[463.493105] copy_process.part.31+0x1c5/0x20d0
[463.493115] ? __lock_acquire+0x3cc/0x1550
[463.493121] ? __switch_to_asm+0x34/0x70
[463.493129] ? kthread_create_worker_on_cpu+0x70/0x70
[463.493135] ? finish_task_switch+0x90/0x280
[463.493165] _do_fork+0xe0/0x6d0
[463.493191] ? kthreadd+0x19f/0x220
[463.493233] kernel_thread+0x25/0x30
[463.493235] kthreadd+0x1bf/0x220
[463.493242] ? kthread_create_on_cpu+0x90/0x90
[463.493248] ret_from_fork+0x3a/0x50
[463.493279] Mem-Info:
[463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0
[463.493285] active_file:80216 inactive_file:80107 isolated_file:435
[463.493285] unevictable:0 dirty:51266 writeback:109372 unstable:0
[463.493285] slab_reclaimable:31191 slab_unreclaimable:3483521
[463.493285] mapped:526 shmem:4903 pagetables:1759 bounce:0
[463.493285] free:33623 free_pcp:2392 free_cma:0
...
[463.493489] Unreclaimable slab info:
[463.493513] Name Used Total
[463.493522] bio-6 1028KB 1028KB
[463.493525] bio-5 1028KB 1028KB
[463.493528] dm_snap_pending_exception 236783KB 243789KB
[463.493531] dm_exception 41KB 42KB
[463.493534] bio-4 1216KB 1216KB
[463.493537] bio-3 439396KB 439396KB
[463.493539] kcopyd_job 6973427KB 6973427KB
...
[463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child
[463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB
[463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
Moreover, issuing a large number of kcopyd jobs results in kcopyd
hogging the CPU, while processing them. As a result, processing of work
items, queued for execution on the same CPU as the currently running
kcopyd thread, is stalled for long periods of time, hurting performance.
Running the aforementioned test we get, in dmesg, messages like the
following:
[67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s!
[67501.195586] Showing busy workqueues and worker pools:
[67501.195591] workqueue events: flags=0x0
[67501.195597] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195611] pending: cache_reap
[67501.195641] workqueue mm_percpu_wq: flags=0x8
[67501.195645] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195656] pending: vmstat_update
[67501.195682] workqueue kblockd: flags=0x18
[67501.195687] pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256
[67501.195698] pending: blk_timeout_work
[67501.195753] workqueue kcopyd: flags=0x8
[67501.195757] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195768] pending: do_work [dm_mod]
[67501.195802] workqueue kcopyd: flags=0x8
[67501.195806] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195817] pending: do_work [dm_mod]
[67501.195834] workqueue kcopyd: flags=0x8
[67501.195838] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195848] pending: do_work [dm_mod]
[67501.195881] workqueue kcopyd: flags=0x8
[67501.195885] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195896] pending: do_work [dm_mod]
[67501.195920] workqueue kcopyd: flags=0x8
[67501.195924] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256
[67501.195935] in-flight: 67:do_work [dm_mod]
[67501.195945] pending: do_work [dm_mod]
[67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765
The root cause for these issues is the way dm-snapshot uses kcopyd. In
particular, the lack of an explicit or implicit limit to the maximum
number of in-flight COW jobs. The merging path is not affected because
it implicitly limits the in-flight kcopyd jobs to one.
Fix these issues by using a semaphore to limit the maximum number of
in-flight kcopyd jobs. We grab the semaphore before allocating a new
kcopyd job in start_copy() and start_full_bio() and release it after the
job finishes in copy_callback().
The initial semaphore value is configurable through a module parameter,
to allow fine tuning the maximum number of in-flight COW jobs. Setting
this parameter to zero initializes the semaphore to INT_MAX.
A default value of 2048 maximum in-flight kcopyd jobs was chosen. This
value was decided experimentally as a trade-off between memory
consumption, stalling the kernel's workqueues and maintaining a high
enough throughput.
Re-running the aforementioned test:
* Workqueue stalls are eliminated
* kcopyd's job slab cache uses a maximum of 130MB
* The time taken by the test to write to the snapshot-origin target is
reduced from 05m20.48s to 03m26.38s
[1] https://github.com/jthornber/device-mapper-test-suite
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2018-10-31 17:53:08 -04:00
|
|
|
MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
|
|
|
|
|
2013-03-01 22:45:49 +00:00
|
|
|
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
|
|
|
|
"A percentage of time allocated for copy on write");
|
|
|
|
|
2010-08-12 04:13:51 +01:00
|
|
|
struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
return s->origin;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(dm_snap_origin);
|
|
|
|
|
2009-12-10 23:52:12 +00:00
|
|
|
struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
return s->cow;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(dm_snap_cow);
|
|
|
|
|
2009-04-02 19:55:34 +01:00
|
|
|
static sector_t chunk_to_sector(struct dm_exception_store *store,
|
|
|
|
chunk_t chunk)
|
|
|
|
{
|
|
|
|
return chunk << store->chunk_shift;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* There is only ever one instance of a particular block
|
|
|
|
* device so we can compare pointers safely.
|
|
|
|
*/
|
|
|
|
return lhs == rhs;
|
|
|
|
}
|
|
|
|
|
2007-07-12 17:26:32 +01:00
|
|
|
struct dm_snap_pending_exception {
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception e;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Origin buffers waiting for this to complete are held
|
|
|
|
* in a bio list
|
|
|
|
*/
|
|
|
|
struct bio_list origin_bios;
|
|
|
|
struct bio_list snapshot_bios;
|
|
|
|
|
|
|
|
/* Pointer back to snapshot context */
|
|
|
|
struct dm_snapshot *snap;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 1 indicates the exception has already been sent to
|
|
|
|
* kcopyd.
|
|
|
|
*/
|
|
|
|
int started;
|
2011-08-02 12:32:04 +01:00
|
|
|
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
/* There was copying error. */
|
|
|
|
int copy_error;
|
|
|
|
|
|
|
|
/* A sequence number, it is used for in-order completion. */
|
|
|
|
sector_t exception_sequence;
|
|
|
|
|
2018-08-07 16:56:00 -04:00
|
|
|
struct rb_node out_of_order_node;
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
|
2011-08-02 12:32:04 +01:00
|
|
|
/*
|
|
|
|
* For writing a complete chunk, bypassing the copy.
|
|
|
|
*/
|
|
|
|
struct bio *full_bio;
|
|
|
|
bio_end_io_t *full_bio_end_io;
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Hash table mapping origin volumes to lists of snapshots and
|
|
|
|
* a lock to protect it
|
|
|
|
*/
|
2006-12-06 20:33:20 -08:00
|
|
|
static struct kmem_cache *exception_cache;
|
|
|
|
static struct kmem_cache *pending_cache;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-07-21 12:00:32 +01:00
|
|
|
struct dm_snap_tracked_chunk {
|
|
|
|
struct hlist_node node;
|
|
|
|
chunk_t chunk;
|
|
|
|
};
|
|
|
|
|
2012-12-21 20:23:41 +00:00
|
|
|
static void init_tracked_chunk(struct bio *bio)
|
|
|
|
{
|
|
|
|
struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
|
|
|
|
INIT_HLIST_NODE(&c->node);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool is_bio_tracked(struct bio *bio)
|
|
|
|
{
|
|
|
|
struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
|
|
|
|
return !hlist_unhashed(&c->node);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk)
|
2008-07-21 12:00:32 +01:00
|
|
|
{
|
2012-12-21 20:23:38 +00:00
|
|
|
struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
|
2008-07-21 12:00:32 +01:00
|
|
|
|
|
|
|
c->chunk = chunk;
|
|
|
|
|
2012-12-21 20:23:33 +00:00
|
|
|
spin_lock_irq(&s->tracked_chunk_lock);
|
2008-07-21 12:00:32 +01:00
|
|
|
hlist_add_head(&c->node,
|
|
|
|
&s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
|
2012-12-21 20:23:33 +00:00
|
|
|
spin_unlock_irq(&s->tracked_chunk_lock);
|
2008-07-21 12:00:32 +01:00
|
|
|
}
|
|
|
|
|
2012-12-21 20:23:41 +00:00
|
|
|
static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio)
|
2008-07-21 12:00:32 +01:00
|
|
|
{
|
2012-12-21 20:23:41 +00:00
|
|
|
struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
|
2008-07-21 12:00:32 +01:00
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&s->tracked_chunk_lock, flags);
|
|
|
|
hlist_del(&c->node);
|
|
|
|
spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
|
|
|
|
}
|
|
|
|
|
2008-07-21 12:00:34 +01:00
|
|
|
static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
|
|
|
|
{
|
|
|
|
struct dm_snap_tracked_chunk *c;
|
|
|
|
int found = 0;
|
|
|
|
|
|
|
|
spin_lock_irq(&s->tracked_chunk_lock);
|
|
|
|
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-27 17:06:00 -08:00
|
|
|
hlist_for_each_entry(c,
|
2008-07-21 12:00:34 +01:00
|
|
|
&s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
|
|
|
|
if (c->chunk == chunk) {
|
|
|
|
found = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock_irq(&s->tracked_chunk_lock);
|
|
|
|
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:29 +00:00
|
|
|
/*
|
|
|
|
* This conflicting I/O is extremely improbable in the caller,
|
|
|
|
* so msleep(1) is sufficient and there is no need for a wait queue.
|
|
|
|
*/
|
|
|
|
static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
|
|
|
|
{
|
|
|
|
while (__chunk_is_tracked(s, chunk))
|
|
|
|
msleep(1);
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* One of these per registered origin, held in the snapshot_origins hash
|
|
|
|
*/
|
|
|
|
struct origin {
|
|
|
|
/* The origin device */
|
|
|
|
struct block_device *bdev;
|
|
|
|
|
|
|
|
struct list_head hash_list;
|
|
|
|
|
|
|
|
/* List of snapshots for this origin */
|
|
|
|
struct list_head snapshots;
|
|
|
|
};
|
|
|
|
|
2015-02-26 11:40:35 -05:00
|
|
|
/*
|
|
|
|
* This structure is allocated for each origin target
|
|
|
|
*/
|
|
|
|
struct dm_origin {
|
|
|
|
struct dm_dev *dev;
|
|
|
|
struct dm_target *ti;
|
|
|
|
unsigned split_boundary;
|
|
|
|
struct list_head hash_list;
|
|
|
|
};
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Size of the hash table for origin volumes. If we make this
|
|
|
|
* the size of the minors list then it should be nearly perfect
|
|
|
|
*/
|
|
|
|
#define ORIGIN_HASH_SIZE 256
|
|
|
|
#define ORIGIN_MASK 0xFF
|
|
|
|
static struct list_head *_origins;
|
2015-02-26 11:40:35 -05:00
|
|
|
static struct list_head *_dm_origins;
|
2005-04-16 15:20:36 -07:00
|
|
|
static struct rw_semaphore _origins_lock;
|
|
|
|
|
2009-12-10 23:52:34 +00:00
|
|
|
static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
|
|
|
|
static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
|
|
|
|
static uint64_t _pending_exceptions_done_count;
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
static int init_origin_hash(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
treewide: kmalloc() -> kmalloc_array()
The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
patch replaces cases of:
kmalloc(a * b, gfp)
with:
kmalloc_array(a * b, gfp)
as well as handling cases of:
kmalloc(a * b * c, gfp)
with:
kmalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kmalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kmalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The tools/ directory was manually excluded, since it has its own
implementation of kmalloc().
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kmalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kmalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kmalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kmalloc
+ kmalloc_array
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kmalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kmalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kmalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kmalloc(sizeof(THING) * C2, ...)
|
kmalloc(sizeof(TYPE) * C2, ...)
|
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(C1 * C2, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * E2
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-12 13:55:00 -07:00
|
|
|
_origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head),
|
|
|
|
GFP_KERNEL);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!_origins) {
|
2015-02-26 11:40:35 -05:00
|
|
|
DMERR("unable to allocate memory for _origins");
|
2005-04-16 15:20:36 -07:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
for (i = 0; i < ORIGIN_HASH_SIZE; i++)
|
|
|
|
INIT_LIST_HEAD(_origins + i);
|
2015-02-26 11:40:35 -05:00
|
|
|
|
treewide: kmalloc() -> kmalloc_array()
The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
patch replaces cases of:
kmalloc(a * b, gfp)
with:
kmalloc_array(a * b, gfp)
as well as handling cases of:
kmalloc(a * b * c, gfp)
with:
kmalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kmalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kmalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The tools/ directory was manually excluded, since it has its own
implementation of kmalloc().
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kmalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kmalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kmalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kmalloc
+ kmalloc_array
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kmalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kmalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kmalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kmalloc(sizeof(THING) * C2, ...)
|
kmalloc(sizeof(TYPE) * C2, ...)
|
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(C1 * C2, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * E2
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-12 13:55:00 -07:00
|
|
|
_dm_origins = kmalloc_array(ORIGIN_HASH_SIZE,
|
|
|
|
sizeof(struct list_head),
|
|
|
|
GFP_KERNEL);
|
2015-02-26 11:40:35 -05:00
|
|
|
if (!_dm_origins) {
|
|
|
|
DMERR("unable to allocate memory for _dm_origins");
|
|
|
|
kfree(_origins);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
for (i = 0; i < ORIGIN_HASH_SIZE; i++)
|
|
|
|
INIT_LIST_HEAD(_dm_origins + i);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
init_rwsem(&_origins_lock);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void exit_origin_hash(void)
|
|
|
|
{
|
|
|
|
kfree(_origins);
|
2015-02-26 11:40:35 -05:00
|
|
|
kfree(_dm_origins);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2007-07-12 17:26:32 +01:00
|
|
|
static unsigned origin_hash(struct block_device *bdev)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
return bdev->bd_dev & ORIGIN_MASK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct origin *__lookup_origin(struct block_device *origin)
|
|
|
|
{
|
|
|
|
struct list_head *ol;
|
|
|
|
struct origin *o;
|
|
|
|
|
|
|
|
ol = &_origins[origin_hash(origin)];
|
|
|
|
list_for_each_entry (o, ol, hash_list)
|
|
|
|
if (bdev_equal(o->bdev, origin))
|
|
|
|
return o;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __insert_origin(struct origin *o)
|
|
|
|
{
|
|
|
|
struct list_head *sl = &_origins[origin_hash(o->bdev)];
|
|
|
|
list_add_tail(&o->hash_list, sl);
|
|
|
|
}
|
|
|
|
|
2015-02-26 11:40:35 -05:00
|
|
|
static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
|
|
|
|
{
|
|
|
|
struct list_head *ol;
|
|
|
|
struct dm_origin *o;
|
|
|
|
|
|
|
|
ol = &_dm_origins[origin_hash(origin)];
|
|
|
|
list_for_each_entry (o, ol, hash_list)
|
|
|
|
if (bdev_equal(o->dev->bdev, origin))
|
|
|
|
return o;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __insert_dm_origin(struct dm_origin *o)
|
|
|
|
{
|
|
|
|
struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
|
|
|
|
list_add_tail(&o->hash_list, sl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __remove_dm_origin(struct dm_origin *o)
|
|
|
|
{
|
|
|
|
list_del(&o->hash_list);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
/*
|
|
|
|
* _origins_lock must be held when calling this function.
|
|
|
|
* Returns number of snapshots registered using the supplied cow device, plus:
|
|
|
|
* snap_src - a snapshot suitable for use as a source of exception handover
|
|
|
|
* snap_dest - a snapshot capable of receiving exception handover.
|
2009-12-10 23:52:32 +00:00
|
|
|
* snap_merge - an existing snapshot-merge target linked to the same origin.
|
|
|
|
* There can be at most one snapshot-merge target. The parameter is optional.
|
2009-12-10 23:52:24 +00:00
|
|
|
*
|
2009-12-10 23:52:32 +00:00
|
|
|
* Possible return values and states of snap_src and snap_dest.
|
2009-12-10 23:52:24 +00:00
|
|
|
* 0: NULL, NULL - first new snapshot
|
|
|
|
* 1: snap_src, NULL - normal snapshot
|
|
|
|
* 2: snap_src, snap_dest - waiting for handover
|
|
|
|
* 2: snap_src, NULL - handed over, waiting for old to be deleted
|
|
|
|
* 1: NULL, snap_dest - source got destroyed without handover
|
|
|
|
*/
|
|
|
|
static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
|
|
|
|
struct dm_snapshot **snap_src,
|
2009-12-10 23:52:32 +00:00
|
|
|
struct dm_snapshot **snap_dest,
|
|
|
|
struct dm_snapshot **snap_merge)
|
2009-12-10 23:52:24 +00:00
|
|
|
{
|
|
|
|
struct dm_snapshot *s;
|
|
|
|
struct origin *o;
|
|
|
|
int count = 0;
|
|
|
|
int active;
|
|
|
|
|
|
|
|
o = __lookup_origin(snap->origin->bdev);
|
|
|
|
if (!o)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
list_for_each_entry(s, &o->snapshots, list) {
|
2009-12-10 23:52:32 +00:00
|
|
|
if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
|
|
|
|
*snap_merge = s;
|
2009-12-10 23:52:24 +00:00
|
|
|
if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
|
|
|
|
continue;
|
|
|
|
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_read(&s->lock);
|
2009-12-10 23:52:24 +00:00
|
|
|
active = s->active;
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_read(&s->lock);
|
2009-12-10 23:52:24 +00:00
|
|
|
|
|
|
|
if (active) {
|
|
|
|
if (snap_src)
|
|
|
|
*snap_src = s;
|
|
|
|
} else if (snap_dest)
|
|
|
|
*snap_dest = s;
|
|
|
|
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* On success, returns 1 if this snapshot is a handover destination,
|
|
|
|
* otherwise returns 0.
|
|
|
|
*/
|
|
|
|
static int __validate_exception_handover(struct dm_snapshot *snap)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
|
2009-12-10 23:52:32 +00:00
|
|
|
struct dm_snapshot *snap_merge = NULL;
|
2009-12-10 23:52:24 +00:00
|
|
|
|
|
|
|
/* Does snapshot need exceptions handed over to it? */
|
2009-12-10 23:52:32 +00:00
|
|
|
if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
|
|
|
|
&snap_merge) == 2) ||
|
2009-12-10 23:52:24 +00:00
|
|
|
snap_dest) {
|
|
|
|
snap->ti->error = "Snapshot cow pairing for exception "
|
|
|
|
"table handover failed";
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If no snap_src was found, snap cannot become a handover
|
|
|
|
* destination.
|
|
|
|
*/
|
|
|
|
if (!snap_src)
|
|
|
|
return 0;
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
/*
|
|
|
|
* Non-snapshot-merge handover?
|
|
|
|
*/
|
|
|
|
if (!dm_target_is_snapshot_merge(snap->ti))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not allow more than one merging snapshot.
|
|
|
|
*/
|
|
|
|
if (snap_merge) {
|
|
|
|
snap->ti->error = "A snapshot is already merging.";
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
if (!snap_src->store->type->prepare_merge ||
|
|
|
|
!snap_src->store->type->commit_merge) {
|
|
|
|
snap->ti->error = "Snapshot exception store does not "
|
|
|
|
"support snapshot-merge.";
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *l;
|
|
|
|
|
|
|
|
/* Sort the list according to chunk size, largest-first smallest-last */
|
|
|
|
list_for_each_entry(l, &o->snapshots, list)
|
|
|
|
if (l->store->chunk_size < s->store->chunk_size)
|
|
|
|
break;
|
|
|
|
list_add_tail(&s->list, &l->list);
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Make a note of the snapshot and its origin so we can look it
|
|
|
|
* up when the origin has a write on it.
|
2009-12-10 23:52:24 +00:00
|
|
|
*
|
|
|
|
* Also validate snapshot exception store handovers.
|
|
|
|
* On success, returns 1 if this registration is a handover destination,
|
|
|
|
* otherwise returns 0.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
static int register_snapshot(struct dm_snapshot *snap)
|
|
|
|
{
|
2009-12-10 23:52:24 +00:00
|
|
|
struct origin *o, *new_o = NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct block_device *bdev = snap->origin->bdev;
|
2009-12-10 23:52:24 +00:00
|
|
|
int r = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-10-30 13:33:12 +00:00
|
|
|
new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
|
|
|
|
if (!new_o)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
down_write(&_origins_lock);
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
r = __validate_exception_handover(snap);
|
|
|
|
if (r < 0) {
|
|
|
|
kfree(new_o);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
o = __lookup_origin(bdev);
|
2008-10-30 13:33:12 +00:00
|
|
|
if (o)
|
|
|
|
kfree(new_o);
|
|
|
|
else {
|
2005-04-16 15:20:36 -07:00
|
|
|
/* New origin */
|
2008-10-30 13:33:12 +00:00
|
|
|
o = new_o;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* Initialise the struct */
|
|
|
|
INIT_LIST_HEAD(&o->snapshots);
|
|
|
|
o->bdev = bdev;
|
|
|
|
|
|
|
|
__insert_origin(o);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
__insert_snapshot(o, snap);
|
|
|
|
|
|
|
|
out:
|
|
|
|
up_write(&_origins_lock);
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move snapshot to correct place in list according to chunk size.
|
|
|
|
*/
|
|
|
|
static void reregister_snapshot(struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
struct block_device *bdev = s->origin->bdev;
|
|
|
|
|
|
|
|
down_write(&_origins_lock);
|
|
|
|
|
|
|
|
list_del(&s->list);
|
|
|
|
__insert_snapshot(__lookup_origin(bdev), s);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
up_write(&_origins_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unregister_snapshot(struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
struct origin *o;
|
|
|
|
|
|
|
|
down_write(&_origins_lock);
|
|
|
|
o = __lookup_origin(s->origin->bdev);
|
|
|
|
|
|
|
|
list_del(&s->list);
|
2009-12-10 23:52:24 +00:00
|
|
|
if (o && list_empty(&o->snapshots)) {
|
2005-04-16 15:20:36 -07:00
|
|
|
list_del(&o->hash_list);
|
|
|
|
kfree(o);
|
|
|
|
}
|
|
|
|
|
|
|
|
up_write(&_origins_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Implementation of the exception hash tables.
|
2008-02-08 02:11:27 +00:00
|
|
|
* The lowest hash_shift bits of the chunk number are ignored, allowing
|
|
|
|
* some consecutive chunks to be grouped together.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2019-03-17 14:22:57 +02:00
|
|
|
static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
|
|
|
|
|
|
|
|
/* Lock to protect access to the completed and pending exception hash tables. */
|
|
|
|
struct dm_exception_table_lock {
|
|
|
|
struct hlist_bl_head *complete_slot;
|
|
|
|
struct hlist_bl_head *pending_slot;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
|
|
|
|
struct dm_exception_table_lock *lock)
|
|
|
|
{
|
|
|
|
struct dm_exception_table *complete = &s->complete;
|
|
|
|
struct dm_exception_table *pending = &s->pending;
|
|
|
|
|
|
|
|
lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
|
|
|
|
lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
|
|
|
|
}
|
|
|
|
|
|
|
|
static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
|
|
|
|
{
|
|
|
|
hlist_bl_lock(lock->complete_slot);
|
|
|
|
hlist_bl_lock(lock->pending_slot);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
|
|
|
|
{
|
|
|
|
hlist_bl_unlock(lock->pending_slot);
|
|
|
|
hlist_bl_unlock(lock->complete_slot);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
static int dm_exception_table_init(struct dm_exception_table *et,
|
|
|
|
uint32_t size, unsigned hash_shift)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
unsigned int i;
|
|
|
|
|
2008-02-08 02:11:27 +00:00
|
|
|
et->hash_shift = hash_shift;
|
2005-04-16 15:20:36 -07:00
|
|
|
et->hash_mask = size - 1;
|
2019-03-17 14:22:57 +02:00
|
|
|
et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!et->table)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
for (i = 0; i < size; i++)
|
2019-03-17 14:22:57 +02:00
|
|
|
INIT_HLIST_BL_HEAD(et->table + i);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
static void dm_exception_table_exit(struct dm_exception_table *et,
|
|
|
|
struct kmem_cache *mem)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2019-03-17 14:22:57 +02:00
|
|
|
struct hlist_bl_head *slot;
|
|
|
|
struct dm_exception *ex;
|
|
|
|
struct hlist_bl_node *pos, *n;
|
2005-04-16 15:20:36 -07:00
|
|
|
int i, size;
|
|
|
|
|
|
|
|
size = et->hash_mask + 1;
|
|
|
|
for (i = 0; i < size; i++) {
|
|
|
|
slot = et->table + i;
|
|
|
|
|
2019-03-17 14:22:57 +02:00
|
|
|
hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
|
2005-04-16 15:20:36 -07:00
|
|
|
kmem_cache_free(mem, ex);
|
|
|
|
}
|
|
|
|
|
|
|
|
vfree(et->table);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:10 +00:00
|
|
|
static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2008-02-08 02:11:27 +00:00
|
|
|
return (chunk >> et->hash_shift) & et->hash_mask;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
static void dm_remove_exception(struct dm_exception *e)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2019-03-17 14:22:57 +02:00
|
|
|
hlist_bl_del(&e->hash_list);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the exception data for a sector, or NULL if not
|
|
|
|
* remapped.
|
|
|
|
*/
|
2009-12-10 23:52:11 +00:00
|
|
|
static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
|
|
|
|
chunk_t chunk)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2019-03-17 14:22:57 +02:00
|
|
|
struct hlist_bl_head *slot;
|
|
|
|
struct hlist_bl_node *pos;
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception *e;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
slot = &et->table[exception_hash(et, chunk)];
|
2019-03-17 14:22:57 +02:00
|
|
|
hlist_bl_for_each_entry(e, pos, slot, hash_list)
|
2008-02-08 02:11:27 +00:00
|
|
|
if (chunk >= e->old_chunk &&
|
|
|
|
chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
|
2005-04-16 15:20:36 -07:00
|
|
|
return e;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2014-01-13 19:13:36 -05:00
|
|
|
static struct dm_exception *alloc_completed_exception(gfp_t gfp)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception *e;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2014-01-13 19:13:36 -05:00
|
|
|
e = kmem_cache_alloc(exception_cache, gfp);
|
|
|
|
if (!e && gfp == GFP_NOIO)
|
2005-04-16 15:20:36 -07:00
|
|
|
e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
|
|
|
|
|
|
|
|
return e;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
static void free_completed_exception(struct dm_exception *e)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
kmem_cache_free(exception_cache, e);
|
|
|
|
}
|
|
|
|
|
2008-07-21 12:00:35 +01:00
|
|
|
static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2018-05-20 18:25:53 -04:00
|
|
|
struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
|
2008-07-21 12:00:35 +01:00
|
|
|
GFP_NOIO);
|
|
|
|
|
2008-10-30 13:33:16 +00:00
|
|
|
atomic_inc(&s->pending_exceptions_count);
|
2008-07-21 12:00:35 +01:00
|
|
|
pe->snap = s;
|
|
|
|
|
|
|
|
return pe;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2007-07-12 17:26:32 +01:00
|
|
|
static void free_pending_exception(struct dm_snap_pending_exception *pe)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2008-10-30 13:33:16 +00:00
|
|
|
struct dm_snapshot *s = pe->snap;
|
|
|
|
|
2018-05-20 18:25:53 -04:00
|
|
|
mempool_free(pe, &s->pending_pool);
|
2014-03-17 18:06:10 +01:00
|
|
|
smp_mb__before_atomic();
|
2008-10-30 13:33:16 +00:00
|
|
|
atomic_dec(&s->pending_exceptions_count);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
static void dm_insert_exception(struct dm_exception_table *eh,
|
|
|
|
struct dm_exception *new_e)
|
2008-02-08 02:11:27 +00:00
|
|
|
{
|
2019-03-17 14:22:57 +02:00
|
|
|
struct hlist_bl_head *l;
|
|
|
|
struct hlist_bl_node *pos;
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception *e = NULL;
|
2008-02-08 02:11:27 +00:00
|
|
|
|
|
|
|
l = &eh->table[exception_hash(eh, new_e->old_chunk)];
|
|
|
|
|
|
|
|
/* Add immediately if this table doesn't support consecutive chunks */
|
|
|
|
if (!eh->hash_shift)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* List is ordered by old_chunk */
|
2019-03-17 14:22:57 +02:00
|
|
|
hlist_bl_for_each_entry(e, pos, l, hash_list) {
|
2008-02-08 02:11:27 +00:00
|
|
|
/* Insert after an existing chunk? */
|
|
|
|
if (new_e->old_chunk == (e->old_chunk +
|
|
|
|
dm_consecutive_chunk_count(e) + 1) &&
|
|
|
|
new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
|
|
|
|
dm_consecutive_chunk_count(e) + 1)) {
|
|
|
|
dm_consecutive_chunk_count_inc(e);
|
2009-12-10 23:52:11 +00:00
|
|
|
free_completed_exception(new_e);
|
2008-02-08 02:11:27 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Insert before an existing chunk? */
|
|
|
|
if (new_e->old_chunk == (e->old_chunk - 1) &&
|
|
|
|
new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
|
|
|
|
dm_consecutive_chunk_count_inc(e);
|
|
|
|
e->old_chunk--;
|
|
|
|
e->new_chunk--;
|
2009-12-10 23:52:11 +00:00
|
|
|
free_completed_exception(new_e);
|
2008-02-08 02:11:27 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-03-17 14:22:57 +02:00
|
|
|
if (new_e->old_chunk < e->old_chunk)
|
2008-02-08 02:11:27 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2019-03-17 14:22:57 +02:00
|
|
|
if (!e) {
|
|
|
|
/*
|
|
|
|
* Either the table doesn't support consecutive chunks or slot
|
|
|
|
* l is empty.
|
|
|
|
*/
|
|
|
|
hlist_bl_add_head(&new_e->hash_list, l);
|
|
|
|
} else if (new_e->old_chunk < e->old_chunk) {
|
|
|
|
/* Add before an existing exception */
|
|
|
|
hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
|
|
|
|
} else {
|
|
|
|
/* Add to l's tail: e is the last exception in this slot */
|
|
|
|
hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
|
|
|
|
}
|
2008-02-08 02:11:27 +00:00
|
|
|
}
|
|
|
|
|
2009-01-06 03:05:19 +00:00
|
|
|
/*
|
|
|
|
* Callback used by the exception stores to load exceptions when
|
|
|
|
* initialising.
|
|
|
|
*/
|
|
|
|
static int dm_add_exception(void *context, chunk_t old, chunk_t new)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2019-03-17 14:22:57 +02:00
|
|
|
struct dm_exception_table_lock lock;
|
2009-01-06 03:05:19 +00:00
|
|
|
struct dm_snapshot *s = context;
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception *e;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2014-01-13 19:13:36 -05:00
|
|
|
e = alloc_completed_exception(GFP_KERNEL);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!e)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
e->old_chunk = old;
|
2008-02-08 02:11:27 +00:00
|
|
|
|
|
|
|
/* Consecutive_count is implicitly initialised to zero */
|
2005-04-16 15:20:36 -07:00
|
|
|
e->new_chunk = new;
|
2008-02-08 02:11:27 +00:00
|
|
|
|
2019-03-17 14:22:57 +02:00
|
|
|
/*
|
|
|
|
* Although there is no need to lock access to the exception tables
|
|
|
|
* here, if we don't then hlist_bl_add_head(), called by
|
|
|
|
* dm_insert_exception(), will complain about accessing the
|
|
|
|
* corresponding list without locking it first.
|
|
|
|
*/
|
|
|
|
dm_exception_table_lock_init(s, old, &lock);
|
|
|
|
|
|
|
|
dm_exception_table_lock(&lock);
|
2009-12-10 23:52:11 +00:00
|
|
|
dm_insert_exception(&s->complete, e);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
2008-02-08 02:11:27 +00:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:08 +00:00
|
|
|
/*
|
|
|
|
* Return a minimum chunk size of all snapshots that have the specified origin.
|
|
|
|
* Return zero if the origin has no snapshots.
|
|
|
|
*/
|
2012-07-27 15:08:00 +01:00
|
|
|
static uint32_t __minimum_chunk_size(struct origin *o)
|
2009-12-10 23:52:08 +00:00
|
|
|
{
|
|
|
|
struct dm_snapshot *snap;
|
|
|
|
unsigned chunk_size = 0;
|
|
|
|
|
|
|
|
if (o)
|
|
|
|
list_for_each_entry(snap, &o->snapshots, list)
|
|
|
|
chunk_size = min_not_zero(chunk_size,
|
|
|
|
snap->store->chunk_size);
|
|
|
|
|
2012-07-27 15:08:00 +01:00
|
|
|
return (uint32_t) chunk_size;
|
2009-12-10 23:52:08 +00:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Hard coded magic.
|
|
|
|
*/
|
|
|
|
static int calc_max_buckets(void)
|
|
|
|
{
|
|
|
|
/* use a fixed size of 2MB */
|
|
|
|
unsigned long mem = 2 * 1024 * 1024;
|
2019-03-17 14:22:57 +02:00
|
|
|
mem /= sizeof(struct hlist_bl_head);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
return mem;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate room for a suitable hash table.
|
|
|
|
*/
|
2009-04-02 19:55:34 +01:00
|
|
|
static int init_hash_tables(struct dm_snapshot *s)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2013-09-18 19:40:42 -04:00
|
|
|
sector_t hash_size, cow_dev_size, max_buckets;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate based on the size of the original volume or
|
|
|
|
* the COW volume...
|
|
|
|
*/
|
2009-12-10 23:52:12 +00:00
|
|
|
cow_dev_size = get_dev_size(s->cow->bdev);
|
2005-04-16 15:20:36 -07:00
|
|
|
max_buckets = calc_max_buckets();
|
|
|
|
|
2013-09-18 19:40:42 -04:00
|
|
|
hash_size = cow_dev_size >> s->store->chunk_shift;
|
2005-04-16 15:20:36 -07:00
|
|
|
hash_size = min(hash_size, max_buckets);
|
|
|
|
|
2009-12-10 23:51:54 +00:00
|
|
|
if (hash_size < 64)
|
|
|
|
hash_size = 64;
|
2008-02-08 02:10:06 +00:00
|
|
|
hash_size = rounddown_pow_of_two(hash_size);
|
2009-12-10 23:52:11 +00:00
|
|
|
if (dm_exception_table_init(&s->complete, hash_size,
|
|
|
|
DM_CHUNK_CONSECUTIVE_BITS))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate hash table for in-flight exceptions
|
|
|
|
* Make this smaller than the real hash table
|
|
|
|
*/
|
|
|
|
hash_size >>= 3;
|
|
|
|
if (hash_size < 64)
|
|
|
|
hash_size = 64;
|
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
if (dm_exception_table_init(&s->pending, hash_size, 0)) {
|
|
|
|
dm_exception_table_exit(&s->complete, exception_cache);
|
2005-04-16 15:20:36 -07:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
static void merge_shutdown(struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
|
2014-03-17 18:06:10 +01:00
|
|
|
smp_mb__after_atomic();
|
2009-12-10 23:52:32 +00:00
|
|
|
wake_up_bit(&s->state_bits, RUNNING_MERGE);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:33 +00:00
|
|
|
static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
s->first_merging_chunk = 0;
|
|
|
|
s->num_merging_chunks = 0;
|
|
|
|
|
|
|
|
return bio_list_get(&s->bios_queued_during_merge);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
/*
|
|
|
|
* Remove one chunk from the index of completed exceptions.
|
|
|
|
*/
|
|
|
|
static int __remove_single_exception_chunk(struct dm_snapshot *s,
|
|
|
|
chunk_t old_chunk)
|
|
|
|
{
|
|
|
|
struct dm_exception *e;
|
|
|
|
|
|
|
|
e = dm_lookup_exception(&s->complete, old_chunk);
|
|
|
|
if (!e) {
|
|
|
|
DMERR("Corruption detected: exception for block %llu is "
|
|
|
|
"on disk but not in memory",
|
|
|
|
(unsigned long long)old_chunk);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is the only chunk using this exception, remove exception.
|
|
|
|
*/
|
|
|
|
if (!dm_consecutive_chunk_count(e)) {
|
|
|
|
dm_remove_exception(e);
|
|
|
|
free_completed_exception(e);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The chunk may be either at the beginning or the end of a
|
|
|
|
* group of consecutive chunks - never in the middle. We are
|
|
|
|
* removing chunks in the opposite order to that in which they
|
|
|
|
* were added, so this should always be true.
|
|
|
|
* Decrement the consecutive chunk counter and adjust the
|
|
|
|
* starting point if necessary.
|
|
|
|
*/
|
|
|
|
if (old_chunk == e->old_chunk) {
|
|
|
|
e->old_chunk++;
|
|
|
|
e->new_chunk++;
|
|
|
|
} else if (old_chunk != e->old_chunk +
|
|
|
|
dm_consecutive_chunk_count(e)) {
|
|
|
|
DMERR("Attempt to merge block %llu from the "
|
|
|
|
"middle of a chunk range [%llu - %llu]",
|
|
|
|
(unsigned long long)old_chunk,
|
|
|
|
(unsigned long long)e->old_chunk,
|
|
|
|
(unsigned long long)
|
|
|
|
e->old_chunk + dm_consecutive_chunk_count(e));
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
dm_consecutive_chunk_count_dec(e);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:33 +00:00
|
|
|
static void flush_bios(struct bio *bio);
|
|
|
|
|
|
|
|
static int remove_single_exception_chunk(struct dm_snapshot *s)
|
2009-12-10 23:52:32 +00:00
|
|
|
{
|
2009-12-10 23:52:33 +00:00
|
|
|
struct bio *b = NULL;
|
|
|
|
int r;
|
|
|
|
chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
|
2009-12-10 23:52:32 +00:00
|
|
|
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&s->lock);
|
2009-12-10 23:52:33 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Process chunks (and associated exceptions) in reverse order
|
|
|
|
* so that dm_consecutive_chunk_count_dec() accounting works.
|
|
|
|
*/
|
|
|
|
do {
|
|
|
|
r = __remove_single_exception_chunk(s, old_chunk);
|
|
|
|
if (r)
|
|
|
|
goto out;
|
|
|
|
} while (old_chunk-- > s->first_merging_chunk);
|
|
|
|
|
|
|
|
b = __release_queued_bios_after_merge(s);
|
|
|
|
|
|
|
|
out:
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&s->lock);
|
2009-12-10 23:52:33 +00:00
|
|
|
if (b)
|
|
|
|
flush_bios(b);
|
2009-12-10 23:52:32 +00:00
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:34 +00:00
|
|
|
static int origin_write_extent(struct dm_snapshot *merging_snap,
|
|
|
|
sector_t sector, unsigned chunk_size);
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
static void merge_callback(int read_err, unsigned long write_err,
|
|
|
|
void *context);
|
|
|
|
|
2009-12-10 23:52:34 +00:00
|
|
|
static uint64_t read_pending_exceptions_done_count(void)
|
|
|
|
{
|
|
|
|
uint64_t pending_exceptions_done;
|
|
|
|
|
|
|
|
spin_lock(&_pending_exceptions_done_spinlock);
|
|
|
|
pending_exceptions_done = _pending_exceptions_done_count;
|
|
|
|
spin_unlock(&_pending_exceptions_done_spinlock);
|
|
|
|
|
|
|
|
return pending_exceptions_done;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void increment_pending_exceptions_done_count(void)
|
|
|
|
{
|
|
|
|
spin_lock(&_pending_exceptions_done_spinlock);
|
|
|
|
_pending_exceptions_done_count++;
|
|
|
|
spin_unlock(&_pending_exceptions_done_spinlock);
|
|
|
|
|
|
|
|
wake_up_all(&_pending_exceptions_done);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
static void snapshot_merge_next_chunks(struct dm_snapshot *s)
|
|
|
|
{
|
2009-12-10 23:52:34 +00:00
|
|
|
int i, linear_chunks;
|
2009-12-10 23:52:32 +00:00
|
|
|
chunk_t old_chunk, new_chunk;
|
|
|
|
struct dm_io_region src, dest;
|
2009-12-10 23:52:34 +00:00
|
|
|
sector_t io_size;
|
2009-12-10 23:52:34 +00:00
|
|
|
uint64_t previous_count;
|
2009-12-10 23:52:32 +00:00
|
|
|
|
|
|
|
BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
|
|
|
|
if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
|
|
|
|
goto shut;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* valid flag never changes during merge, so no lock required.
|
|
|
|
*/
|
|
|
|
if (!s->valid) {
|
|
|
|
DMERR("Snapshot is invalid: can't merge");
|
|
|
|
goto shut;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:34 +00:00
|
|
|
linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
|
|
|
|
&new_chunk);
|
|
|
|
if (linear_chunks <= 0) {
|
2009-12-10 23:52:35 +00:00
|
|
|
if (linear_chunks < 0) {
|
2009-12-10 23:52:32 +00:00
|
|
|
DMERR("Read error in exception store: "
|
|
|
|
"shutting down merge");
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&s->lock);
|
2019-12-24 14:38:02 +08:00
|
|
|
s->merge_failed = true;
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&s->lock);
|
2009-12-10 23:52:35 +00:00
|
|
|
}
|
2009-12-10 23:52:32 +00:00
|
|
|
goto shut;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:34 +00:00
|
|
|
/* Adjust old_chunk and new_chunk to reflect start of linear region */
|
|
|
|
old_chunk = old_chunk + 1 - linear_chunks;
|
|
|
|
new_chunk = new_chunk + 1 - linear_chunks;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use one (potentially large) I/O to copy all 'linear_chunks'
|
|
|
|
* from the exception store to the origin
|
|
|
|
*/
|
|
|
|
io_size = linear_chunks * s->store->chunk_size;
|
2009-12-10 23:52:32 +00:00
|
|
|
|
|
|
|
dest.bdev = s->origin->bdev;
|
|
|
|
dest.sector = chunk_to_sector(s->store, old_chunk);
|
2009-12-10 23:52:34 +00:00
|
|
|
dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
|
2009-12-10 23:52:32 +00:00
|
|
|
|
|
|
|
src.bdev = s->cow->bdev;
|
|
|
|
src.sector = chunk_to_sector(s->store, new_chunk);
|
|
|
|
src.count = dest.count;
|
|
|
|
|
2009-12-10 23:52:34 +00:00
|
|
|
/*
|
|
|
|
* Reallocate any exceptions needed in other snapshots then
|
|
|
|
* wait for the pending exceptions to complete.
|
|
|
|
* Each time any pending exception (globally on the system)
|
|
|
|
* completes we are woken and repeat the process to find out
|
|
|
|
* if we can proceed. While this may not seem a particularly
|
|
|
|
* efficient algorithm, it is not expected to have any
|
|
|
|
* significant impact on performance.
|
|
|
|
*/
|
|
|
|
previous_count = read_pending_exceptions_done_count();
|
2009-12-10 23:52:34 +00:00
|
|
|
while (origin_write_extent(s, dest.sector, io_size)) {
|
2009-12-10 23:52:34 +00:00
|
|
|
wait_event(_pending_exceptions_done,
|
|
|
|
(read_pending_exceptions_done_count() !=
|
|
|
|
previous_count));
|
|
|
|
/* Retry after the wait, until all exceptions are done. */
|
|
|
|
previous_count = read_pending_exceptions_done_count();
|
|
|
|
}
|
|
|
|
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&s->lock);
|
2009-12-10 23:52:33 +00:00
|
|
|
s->first_merging_chunk = old_chunk;
|
2009-12-10 23:52:34 +00:00
|
|
|
s->num_merging_chunks = linear_chunks;
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&s->lock);
|
2009-12-10 23:52:33 +00:00
|
|
|
|
2009-12-10 23:52:34 +00:00
|
|
|
/* Wait until writes to all 'linear_chunks' drain */
|
|
|
|
for (i = 0; i < linear_chunks; i++)
|
|
|
|
__check_for_conflicting_io(s, old_chunk + i);
|
2009-12-10 23:52:33 +00:00
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
|
|
|
|
return;
|
|
|
|
|
|
|
|
shut:
|
|
|
|
merge_shutdown(s);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:33 +00:00
|
|
|
static void error_bios(struct bio *bio);
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
static void merge_callback(int read_err, unsigned long write_err, void *context)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *s = context;
|
2009-12-10 23:52:33 +00:00
|
|
|
struct bio *b = NULL;
|
2009-12-10 23:52:32 +00:00
|
|
|
|
|
|
|
if (read_err || write_err) {
|
|
|
|
if (read_err)
|
|
|
|
DMERR("Read error: shutting down merge.");
|
|
|
|
else
|
|
|
|
DMERR("Write error: shutting down merge.");
|
|
|
|
goto shut;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:33 +00:00
|
|
|
if (s->store->type->commit_merge(s->store,
|
|
|
|
s->num_merging_chunks) < 0) {
|
2009-12-10 23:52:32 +00:00
|
|
|
DMERR("Write error in exception store: shutting down merge");
|
|
|
|
goto shut;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:33 +00:00
|
|
|
if (remove_single_exception_chunk(s) < 0)
|
|
|
|
goto shut;
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
snapshot_merge_next_chunks(s);
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
shut:
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&s->lock);
|
2019-12-24 14:38:02 +08:00
|
|
|
s->merge_failed = true;
|
2009-12-10 23:52:33 +00:00
|
|
|
b = __release_queued_bios_after_merge(s);
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&s->lock);
|
2009-12-10 23:52:33 +00:00
|
|
|
error_bios(b);
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
merge_shutdown(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void start_merge(struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
|
|
|
|
snapshot_merge_next_chunks(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Stop the merging process and wait until it finishes.
|
|
|
|
*/
|
|
|
|
static void stop_merge(struct dm_snapshot *s)
|
|
|
|
{
|
|
|
|
set_bit(SHUTDOWN_MERGE, &s->state_bits);
|
sched: Remove proliferation of wait_on_bit() action functions
The current "wait_on_bit" interface requires an 'action'
function to be provided which does the actual waiting.
There are over 20 such functions, many of them identical.
Most cases can be satisfied by one of just two functions, one
which uses io_schedule() and one which just uses schedule().
So:
Rename wait_on_bit and wait_on_bit_lock to
wait_on_bit_action and wait_on_bit_lock_action
to make it explicit that they need an action function.
Introduce new wait_on_bit{,_lock} and wait_on_bit{,_lock}_io
which are *not* given an action function but implicitly use
a standard one.
The decision to error-out if a signal is pending is now made
based on the 'mode' argument rather than being encoded in the action
function.
All instances of the old wait_on_bit and wait_on_bit_lock which
can use the new version have been changed accordingly and their
action functions have been discarded.
wait_on_bit{_lock} does not return any specific error code in the
event of a signal so the caller must check for non-zero and
interpolate their own error code as appropriate.
The wait_on_bit() call in __fscache_wait_on_invalidate() was
ambiguous as it specified TASK_UNINTERRUPTIBLE but used
fscache_wait_bit_interruptible as an action function.
David Howells confirms this should be uniformly
"uninterruptible"
The main remaining user of wait_on_bit{,_lock}_action is NFS
which needs to use a freezer-aware schedule() call.
A comment in fs/gfs2/glock.c notes that having multiple 'action'
functions is useful as they display differently in the 'wchan'
field of 'ps'. (and /proc/$PID/wchan).
As the new bit_wait{,_io} functions are tagged "__sched", they
will not show up at all, but something higher in the stack. So
the distinction will still be visible, only with different
function names (gds2_glock_wait versus gfs2_glock_dq_wait in the
gfs2/glock.c case).
Since first version of this patch (against 3.15) two new action
functions appeared, on in NFS and one in CIFS. CIFS also now
uses an action function that makes the same freezer aware
schedule call as NFS.
Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: David Howells <dhowells@redhat.com> (fscache, keys)
Acked-by: Steven Whitehouse <swhiteho@redhat.com> (gfs2)
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steve French <sfrench@samba.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20140707051603.28027.72349.stgit@notabene.brown
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-07-07 15:16:04 +10:00
|
|
|
wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
|
2009-12-10 23:52:32 +00:00
|
|
|
clear_bit(SHUTDOWN_MERGE, &s->state_bits);
|
|
|
|
}
|
|
|
|
|
2019-06-19 17:05:54 -04:00
|
|
|
static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s,
|
|
|
|
struct dm_target *ti)
|
|
|
|
{
|
|
|
|
int r;
|
|
|
|
unsigned argc;
|
|
|
|
const char *arg_name;
|
|
|
|
|
|
|
|
static const struct dm_arg _args[] = {
|
|
|
|
{0, 2, "Invalid number of feature arguments"},
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* No feature arguments supplied.
|
|
|
|
*/
|
|
|
|
if (!as->argc)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
r = dm_read_arg_group(_args, as, &argc, &ti->error);
|
|
|
|
if (r)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
while (argc && !r) {
|
|
|
|
arg_name = dm_shift_arg(as);
|
|
|
|
argc--;
|
|
|
|
|
|
|
|
if (!strcasecmp(arg_name, "discard_zeroes_cow"))
|
|
|
|
s->discard_zeroes_cow = true;
|
|
|
|
|
|
|
|
else if (!strcasecmp(arg_name, "discard_passdown_origin"))
|
|
|
|
s->discard_passdown_origin = true;
|
|
|
|
|
|
|
|
else {
|
|
|
|
ti->error = "Unrecognised feature requested";
|
|
|
|
r = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
|
|
|
|
/*
|
|
|
|
* TODO: really these are disjoint.. but ti->num_discard_bios
|
|
|
|
* and dm_bio_get_target_bio_nr() require rigid constraints.
|
|
|
|
*/
|
|
|
|
ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
|
|
|
|
r = -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2019-06-19 17:05:54 -04:00
|
|
|
* Construct a snapshot mapping:
|
|
|
|
* <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*]
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *s;
|
2019-06-19 17:05:54 -04:00
|
|
|
struct dm_arg_set as;
|
2008-07-21 12:00:32 +01:00
|
|
|
int i;
|
2005-04-16 15:20:36 -07:00
|
|
|
int r = -EINVAL;
|
2009-12-10 23:52:12 +00:00
|
|
|
char *origin_path, *cow_path;
|
2016-02-02 12:29:18 +08:00
|
|
|
dev_t origin_dev, cow_dev;
|
2013-03-01 22:45:47 +00:00
|
|
|
unsigned args_used, num_flush_bios = 1;
|
2009-12-10 23:52:31 +00:00
|
|
|
fmode_t origin_mode = FMODE_READ;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2019-06-19 17:05:54 -04:00
|
|
|
if (argc < 4) {
|
|
|
|
ti->error = "requires 4 or more arguments";
|
2005-04-16 15:20:36 -07:00
|
|
|
r = -EINVAL;
|
2009-12-10 23:52:12 +00:00
|
|
|
goto bad;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:31 +00:00
|
|
|
if (dm_target_is_snapshot_merge(ti)) {
|
2013-03-01 22:45:47 +00:00
|
|
|
num_flush_bios = 2;
|
2009-12-10 23:52:31 +00:00
|
|
|
origin_mode = FMODE_WRITE;
|
|
|
|
}
|
|
|
|
|
2018-06-05 05:26:33 -04:00
|
|
|
s = kzalloc(sizeof(*s), GFP_KERNEL);
|
2009-12-10 23:52:12 +00:00
|
|
|
if (!s) {
|
2011-08-02 12:32:03 +01:00
|
|
|
ti->error = "Cannot allocate private snapshot structure";
|
2009-12-10 23:52:12 +00:00
|
|
|
r = -ENOMEM;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
|
2019-06-19 17:05:54 -04:00
|
|
|
as.argc = argc;
|
|
|
|
as.argv = argv;
|
|
|
|
dm_consume_args(&as, 4);
|
|
|
|
r = parse_snapshot_features(&as, s, ti);
|
|
|
|
if (r)
|
|
|
|
goto bad_features;
|
|
|
|
|
2010-08-12 04:13:51 +01:00
|
|
|
origin_path = argv[0];
|
|
|
|
argv++;
|
|
|
|
argc--;
|
|
|
|
|
|
|
|
r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
|
|
|
|
if (r) {
|
|
|
|
ti->error = "Cannot get origin device";
|
|
|
|
goto bad_origin;
|
|
|
|
}
|
2016-02-02 12:29:18 +08:00
|
|
|
origin_dev = s->origin->bdev->bd_dev;
|
2010-08-12 04:13:51 +01:00
|
|
|
|
2009-12-10 23:52:12 +00:00
|
|
|
cow_path = argv[0];
|
|
|
|
argv++;
|
|
|
|
argc--;
|
|
|
|
|
2016-02-02 12:29:18 +08:00
|
|
|
cow_dev = dm_get_dev_t(cow_path);
|
|
|
|
if (cow_dev && cow_dev == origin_dev) {
|
|
|
|
ti->error = "COW device cannot be the same as origin device";
|
|
|
|
r = -EINVAL;
|
|
|
|
goto bad_cow;
|
|
|
|
}
|
|
|
|
|
2011-03-24 13:52:14 +00:00
|
|
|
r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
|
2009-12-10 23:52:12 +00:00
|
|
|
if (r) {
|
|
|
|
ti->error = "Cannot get COW device";
|
|
|
|
goto bad_cow;
|
|
|
|
}
|
|
|
|
|
|
|
|
r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
|
2009-04-02 19:55:34 +01:00
|
|
|
if (r) {
|
|
|
|
ti->error = "Couldn't create exception store";
|
2005-04-16 15:20:36 -07:00
|
|
|
r = -EINVAL;
|
2009-12-10 23:52:12 +00:00
|
|
|
goto bad_store;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2009-04-02 19:55:34 +01:00
|
|
|
argv += args_used;
|
|
|
|
argc -= args_used;
|
|
|
|
|
2009-12-10 23:52:12 +00:00
|
|
|
s->ti = ti;
|
2005-04-16 15:20:36 -07:00
|
|
|
s->valid = 1;
|
2015-06-21 16:31:33 -04:00
|
|
|
s->snapshot_overflowed = 0;
|
2006-02-01 03:04:50 -08:00
|
|
|
s->active = 0;
|
2008-10-30 13:33:16 +00:00
|
|
|
atomic_set(&s->pending_exceptions_count, 0);
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
spin_lock_init(&s->pe_allocation_lock);
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
s->exception_start_sequence = 0;
|
|
|
|
s->exception_complete_sequence = 0;
|
2018-08-07 16:56:00 -04:00
|
|
|
s->out_of_order_tree = RB_ROOT;
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
init_rwsem(&s->lock);
|
2009-12-10 23:52:24 +00:00
|
|
|
INIT_LIST_HEAD(&s->list);
|
2006-10-03 01:15:30 -07:00
|
|
|
spin_lock_init(&s->pe_lock);
|
2009-12-10 23:52:32 +00:00
|
|
|
s->state_bits = 0;
|
2019-12-24 14:38:02 +08:00
|
|
|
s->merge_failed = false;
|
2009-12-10 23:52:33 +00:00
|
|
|
s->first_merging_chunk = 0;
|
|
|
|
s->num_merging_chunks = 0;
|
|
|
|
bio_list_init(&s->bios_queued_during_merge);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* Allocate hash table for COW data */
|
2009-04-02 19:55:34 +01:00
|
|
|
if (init_hash_tables(s)) {
|
2005-04-16 15:20:36 -07:00
|
|
|
ti->error = "Unable to allocate hash table space";
|
|
|
|
r = -ENOMEM;
|
2009-04-02 19:55:34 +01:00
|
|
|
goto bad_hash_tables;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
init_waitqueue_head(&s->in_progress_wait);
|
dm snapshot: Fix excessive memory usage and workqueue stalls
kcopyd has no upper limit to the number of jobs one can allocate and
issue. Under certain workloads this can lead to excessive memory usage
and workqueue stalls. For example, when creating multiple dm-snapshot
targets with a 4K chunk size and then writing to the origin through the
page cache. Syncing the page cache causes a large number of BIOs to be
issued to the dm-snapshot origin target, which itself issues an even
larger (because of the BIO splitting taking place) number of kcopyd
jobs.
Running the following test, from the device mapper test suite [1],
dmtest run --suite snapshot -n many_snapshots_of_same_volume_N
, with 8 active snapshots, results in the kcopyd job slab cache growing
to 10G. Depending on the available system RAM this can lead to the OOM
killer killing user processes:
[463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP),
nodemask=(null), order=1, oom_score_adj=0
[463.492894] kthreadd cpuset=/ mems_allowed=0
[463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3
[463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[463.492952] Call Trace:
[463.492964] dump_stack+0x7d/0xbb
[463.492973] dump_header+0x6b/0x2fc
[463.492987] ? lockdep_hardirqs_on+0xee/0x190
[463.493012] oom_kill_process+0x302/0x370
[463.493021] out_of_memory+0x113/0x560
[463.493030] __alloc_pages_slowpath+0xf40/0x1020
[463.493055] __alloc_pages_nodemask+0x348/0x3c0
[463.493067] cache_grow_begin+0x81/0x8b0
[463.493072] ? cache_grow_begin+0x874/0x8b0
[463.493078] fallback_alloc+0x1e4/0x280
[463.493092] kmem_cache_alloc_node+0xd6/0x370
[463.493098] ? copy_process.part.31+0x1c5/0x20d0
[463.493105] copy_process.part.31+0x1c5/0x20d0
[463.493115] ? __lock_acquire+0x3cc/0x1550
[463.493121] ? __switch_to_asm+0x34/0x70
[463.493129] ? kthread_create_worker_on_cpu+0x70/0x70
[463.493135] ? finish_task_switch+0x90/0x280
[463.493165] _do_fork+0xe0/0x6d0
[463.493191] ? kthreadd+0x19f/0x220
[463.493233] kernel_thread+0x25/0x30
[463.493235] kthreadd+0x1bf/0x220
[463.493242] ? kthread_create_on_cpu+0x90/0x90
[463.493248] ret_from_fork+0x3a/0x50
[463.493279] Mem-Info:
[463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0
[463.493285] active_file:80216 inactive_file:80107 isolated_file:435
[463.493285] unevictable:0 dirty:51266 writeback:109372 unstable:0
[463.493285] slab_reclaimable:31191 slab_unreclaimable:3483521
[463.493285] mapped:526 shmem:4903 pagetables:1759 bounce:0
[463.493285] free:33623 free_pcp:2392 free_cma:0
...
[463.493489] Unreclaimable slab info:
[463.493513] Name Used Total
[463.493522] bio-6 1028KB 1028KB
[463.493525] bio-5 1028KB 1028KB
[463.493528] dm_snap_pending_exception 236783KB 243789KB
[463.493531] dm_exception 41KB 42KB
[463.493534] bio-4 1216KB 1216KB
[463.493537] bio-3 439396KB 439396KB
[463.493539] kcopyd_job 6973427KB 6973427KB
...
[463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child
[463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB
[463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
Moreover, issuing a large number of kcopyd jobs results in kcopyd
hogging the CPU, while processing them. As a result, processing of work
items, queued for execution on the same CPU as the currently running
kcopyd thread, is stalled for long periods of time, hurting performance.
Running the aforementioned test we get, in dmesg, messages like the
following:
[67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s!
[67501.195586] Showing busy workqueues and worker pools:
[67501.195591] workqueue events: flags=0x0
[67501.195597] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195611] pending: cache_reap
[67501.195641] workqueue mm_percpu_wq: flags=0x8
[67501.195645] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195656] pending: vmstat_update
[67501.195682] workqueue kblockd: flags=0x18
[67501.195687] pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256
[67501.195698] pending: blk_timeout_work
[67501.195753] workqueue kcopyd: flags=0x8
[67501.195757] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195768] pending: do_work [dm_mod]
[67501.195802] workqueue kcopyd: flags=0x8
[67501.195806] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195817] pending: do_work [dm_mod]
[67501.195834] workqueue kcopyd: flags=0x8
[67501.195838] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195848] pending: do_work [dm_mod]
[67501.195881] workqueue kcopyd: flags=0x8
[67501.195885] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256
[67501.195896] pending: do_work [dm_mod]
[67501.195920] workqueue kcopyd: flags=0x8
[67501.195924] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256
[67501.195935] in-flight: 67:do_work [dm_mod]
[67501.195945] pending: do_work [dm_mod]
[67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765
The root cause for these issues is the way dm-snapshot uses kcopyd. In
particular, the lack of an explicit or implicit limit to the maximum
number of in-flight COW jobs. The merging path is not affected because
it implicitly limits the in-flight kcopyd jobs to one.
Fix these issues by using a semaphore to limit the maximum number of
in-flight kcopyd jobs. We grab the semaphore before allocating a new
kcopyd job in start_copy() and start_full_bio() and release it after the
job finishes in copy_callback().
The initial semaphore value is configurable through a module parameter,
to allow fine tuning the maximum number of in-flight COW jobs. Setting
this parameter to zero initializes the semaphore to INT_MAX.
A default value of 2048 maximum in-flight kcopyd jobs was chosen. This
value was decided experimentally as a trade-off between memory
consumption, stalling the kernel's workqueues and maintaining a high
enough throughput.
Re-running the aforementioned test:
* Workqueue stalls are eliminated
* kcopyd's job slab cache uses a maximum of 130MB
* The time taken by the test to write to the snapshot-origin target is
reduced from 05m20.48s to 03m26.38s
[1] https://github.com/jthornber/device-mapper-test-suite
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2018-10-31 17:53:08 -04:00
|
|
|
|
2013-03-01 22:45:49 +00:00
|
|
|
s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
|
2011-05-29 13:03:13 +01:00
|
|
|
if (IS_ERR(s->kcopyd_client)) {
|
|
|
|
r = PTR_ERR(s->kcopyd_client);
|
2005-04-16 15:20:36 -07:00
|
|
|
ti->error = "Could not create kcopyd client";
|
2009-04-02 19:55:34 +01:00
|
|
|
goto bad_kcopyd;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2018-05-20 18:25:53 -04:00
|
|
|
r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
|
|
|
|
if (r) {
|
2008-07-21 12:00:35 +01:00
|
|
|
ti->error = "Could not allocate mempool for pending exceptions";
|
2009-04-02 19:55:34 +01:00
|
|
|
goto bad_pending_pool;
|
2008-07-21 12:00:35 +01:00
|
|
|
}
|
|
|
|
|
2008-07-21 12:00:32 +01:00
|
|
|
for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
|
|
|
|
INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
|
|
|
|
|
|
|
|
spin_lock_init(&s->tracked_chunk_lock);
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
ti->private = s;
|
2013-03-01 22:45:47 +00:00
|
|
|
ti->num_flush_bios = num_flush_bios;
|
2019-06-19 17:05:54 -04:00
|
|
|
if (s->discard_zeroes_cow)
|
|
|
|
ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
|
2016-01-31 13:28:26 -05:00
|
|
|
ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
|
2009-12-10 23:52:24 +00:00
|
|
|
|
|
|
|
/* Add snapshot to the list of snapshots for this origin */
|
|
|
|
/* Exceptions aren't triggered till snapshot_resume() is called */
|
|
|
|
r = register_snapshot(s);
|
|
|
|
if (r == -ENOMEM) {
|
|
|
|
ti->error = "Snapshot origin struct allocation failed";
|
|
|
|
goto bad_load_and_register;
|
|
|
|
} else if (r < 0) {
|
|
|
|
/* invalid handover, register_snapshot has set ti->error */
|
|
|
|
goto bad_load_and_register;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Metadata must only be loaded into one table at once, so skip this
|
|
|
|
* if metadata will be handed over during resume.
|
|
|
|
* Chunk size will be set during the handover - set it to zero to
|
|
|
|
* ensure it's ignored.
|
|
|
|
*/
|
|
|
|
if (r > 0) {
|
|
|
|
s->store->chunk_size = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-04-02 19:55:31 +01:00
|
|
|
r = s->store->type->read_metadata(s->store, dm_add_exception,
|
|
|
|
(void *)s);
|
2007-07-12 17:28:13 +01:00
|
|
|
if (r < 0) {
|
2006-10-03 01:15:25 -07:00
|
|
|
ti->error = "Failed to read snapshot metadata";
|
2009-12-10 23:52:24 +00:00
|
|
|
goto bad_read_metadata;
|
2007-07-12 17:28:13 +01:00
|
|
|
} else if (r > 0) {
|
|
|
|
s->valid = 0;
|
|
|
|
DMWARN("Snapshot is marked invalid.");
|
2006-10-03 01:15:25 -07:00
|
|
|
}
|
2006-02-01 03:04:50 -08:00
|
|
|
|
2009-10-16 23:18:16 +01:00
|
|
|
if (!s->store->chunk_size) {
|
|
|
|
ti->error = "Chunk size not set";
|
2009-12-10 23:52:24 +00:00
|
|
|
goto bad_read_metadata;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2012-07-27 15:08:00 +01:00
|
|
|
|
|
|
|
r = dm_set_target_max_io_len(ti, s->store->chunk_size);
|
|
|
|
if (r)
|
|
|
|
goto bad_read_metadata;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
bad_read_metadata:
|
|
|
|
unregister_snapshot(s);
|
2009-04-02 19:55:34 +01:00
|
|
|
bad_load_and_register:
|
2018-05-20 18:25:53 -04:00
|
|
|
mempool_exit(&s->pending_pool);
|
2009-04-02 19:55:34 +01:00
|
|
|
bad_pending_pool:
|
2008-04-24 21:43:19 +01:00
|
|
|
dm_kcopyd_client_destroy(s->kcopyd_client);
|
2009-04-02 19:55:34 +01:00
|
|
|
bad_kcopyd:
|
2009-12-10 23:52:11 +00:00
|
|
|
dm_exception_table_exit(&s->pending, pending_cache);
|
|
|
|
dm_exception_table_exit(&s->complete, exception_cache);
|
2009-04-02 19:55:34 +01:00
|
|
|
bad_hash_tables:
|
2009-12-10 23:52:12 +00:00
|
|
|
dm_exception_store_destroy(s->store);
|
|
|
|
bad_store:
|
|
|
|
dm_put_device(ti, s->cow);
|
|
|
|
bad_cow:
|
2010-08-12 04:13:51 +01:00
|
|
|
dm_put_device(ti, s->origin);
|
|
|
|
bad_origin:
|
2019-06-19 17:05:54 -04:00
|
|
|
bad_features:
|
2009-12-10 23:52:12 +00:00
|
|
|
kfree(s);
|
|
|
|
bad:
|
2005-04-16 15:20:36 -07:00
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2006-12-08 02:41:11 -08:00
|
|
|
static void __free_exceptions(struct dm_snapshot *s)
|
|
|
|
{
|
2008-04-24 21:43:19 +01:00
|
|
|
dm_kcopyd_client_destroy(s->kcopyd_client);
|
2006-12-08 02:41:11 -08:00
|
|
|
s->kcopyd_client = NULL;
|
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
dm_exception_table_exit(&s->pending, pending_cache);
|
|
|
|
dm_exception_table_exit(&s->complete, exception_cache);
|
2006-12-08 02:41:11 -08:00
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
static void __handover_exceptions(struct dm_snapshot *snap_src,
|
|
|
|
struct dm_snapshot *snap_dest)
|
|
|
|
{
|
|
|
|
union {
|
|
|
|
struct dm_exception_table table_swap;
|
|
|
|
struct dm_exception_store *store_swap;
|
|
|
|
} u;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Swap all snapshot context information between the two instances.
|
|
|
|
*/
|
|
|
|
u.table_swap = snap_dest->complete;
|
|
|
|
snap_dest->complete = snap_src->complete;
|
|
|
|
snap_src->complete = u.table_swap;
|
|
|
|
|
|
|
|
u.store_swap = snap_dest->store;
|
|
|
|
snap_dest->store = snap_src->store;
|
2015-10-08 18:05:41 -04:00
|
|
|
snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow;
|
2009-12-10 23:52:24 +00:00
|
|
|
snap_src->store = u.store_swap;
|
|
|
|
|
|
|
|
snap_dest->store->snap = snap_dest;
|
|
|
|
snap_src->store->snap = snap_src;
|
|
|
|
|
2012-07-27 15:08:00 +01:00
|
|
|
snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
|
2009-12-10 23:52:24 +00:00
|
|
|
snap_dest->valid = snap_src->valid;
|
2015-06-21 16:31:33 -04:00
|
|
|
snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed;
|
2009-12-10 23:52:24 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Set source invalid to ensure it receives no further I/O.
|
|
|
|
*/
|
|
|
|
snap_src->valid = 0;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
static void snapshot_dtr(struct dm_target *ti)
|
|
|
|
{
|
2008-07-21 12:00:32 +01:00
|
|
|
#ifdef CONFIG_DM_DEBUG
|
|
|
|
int i;
|
|
|
|
#endif
|
2007-07-12 17:26:32 +01:00
|
|
|
struct dm_snapshot *s = ti->private;
|
2009-12-10 23:52:24 +00:00
|
|
|
struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
down_read(&_origins_lock);
|
|
|
|
/* Check whether exception handover must be cancelled */
|
2009-12-10 23:52:32 +00:00
|
|
|
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
2009-12-10 23:52:24 +00:00
|
|
|
if (snap_src && snap_dest && (s == snap_src)) {
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&snap_dest->lock);
|
2009-12-10 23:52:24 +00:00
|
|
|
snap_dest->valid = 0;
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&snap_dest->lock);
|
2009-12-10 23:52:24 +00:00
|
|
|
DMERR("Cancelling snapshot handover.");
|
|
|
|
}
|
|
|
|
up_read(&_origins_lock);
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
if (dm_target_is_snapshot_merge(ti))
|
|
|
|
stop_merge(s);
|
|
|
|
|
2006-03-27 01:17:50 -08:00
|
|
|
/* Prevent further origin writes from using this snapshot. */
|
|
|
|
/* After this returns there can be no new kcopyd jobs. */
|
2005-04-16 15:20:36 -07:00
|
|
|
unregister_snapshot(s);
|
|
|
|
|
2008-10-30 13:33:16 +00:00
|
|
|
while (atomic_read(&s->pending_exceptions_count))
|
2009-01-06 03:04:54 +00:00
|
|
|
msleep(1);
|
2008-10-30 13:33:16 +00:00
|
|
|
/*
|
2018-05-20 18:25:53 -04:00
|
|
|
* Ensure instructions in mempool_exit aren't reordered
|
2008-10-30 13:33:16 +00:00
|
|
|
* before atomic_read.
|
|
|
|
*/
|
|
|
|
smp_mb();
|
|
|
|
|
2008-07-21 12:00:32 +01:00
|
|
|
#ifdef CONFIG_DM_DEBUG
|
|
|
|
for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
|
|
|
|
BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
|
|
|
|
#endif
|
|
|
|
|
2006-12-08 02:41:11 -08:00
|
|
|
__free_exceptions(s);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2018-05-20 18:25:53 -04:00
|
|
|
mempool_exit(&s->pending_pool);
|
2008-07-21 12:00:35 +01:00
|
|
|
|
2009-04-02 19:55:34 +01:00
|
|
|
dm_exception_store_destroy(s->store);
|
2006-03-27 01:17:50 -08:00
|
|
|
|
2009-12-10 23:52:12 +00:00
|
|
|
dm_put_device(ti, s->cow);
|
|
|
|
|
2010-08-12 04:13:51 +01:00
|
|
|
dm_put_device(ti, s->origin);
|
|
|
|
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
WARN_ON(s->in_progress);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
kfree(s);
|
|
|
|
}
|
|
|
|
|
2019-10-02 06:14:17 -04:00
|
|
|
static void account_start_copy(struct dm_snapshot *s)
|
|
|
|
{
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
spin_lock(&s->in_progress_wait.lock);
|
|
|
|
s->in_progress++;
|
|
|
|
spin_unlock(&s->in_progress_wait.lock);
|
2019-10-02 06:14:17 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void account_end_copy(struct dm_snapshot *s)
|
|
|
|
{
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
spin_lock(&s->in_progress_wait.lock);
|
|
|
|
BUG_ON(!s->in_progress);
|
|
|
|
s->in_progress--;
|
|
|
|
if (likely(s->in_progress <= cow_threshold) &&
|
|
|
|
unlikely(waitqueue_active(&s->in_progress_wait)))
|
|
|
|
wake_up_locked(&s->in_progress_wait);
|
|
|
|
spin_unlock(&s->in_progress_wait.lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
|
|
|
|
{
|
|
|
|
if (unlikely(s->in_progress > cow_threshold)) {
|
|
|
|
spin_lock(&s->in_progress_wait.lock);
|
|
|
|
if (likely(s->in_progress > cow_threshold)) {
|
|
|
|
/*
|
|
|
|
* NOTE: this throttle doesn't account for whether
|
|
|
|
* the caller is servicing an IO that will trigger a COW
|
|
|
|
* so excess throttling may result for chunks not required
|
|
|
|
* to be COW'd. But if cow_threshold was reached, extra
|
|
|
|
* throttling is unlikely to negatively impact performance.
|
|
|
|
*/
|
|
|
|
DECLARE_WAITQUEUE(wait, current);
|
|
|
|
__add_wait_queue(&s->in_progress_wait, &wait);
|
|
|
|
__set_current_state(TASK_UNINTERRUPTIBLE);
|
|
|
|
spin_unlock(&s->in_progress_wait.lock);
|
|
|
|
if (unlock_origins)
|
|
|
|
up_read(&_origins_lock);
|
|
|
|
io_schedule();
|
|
|
|
remove_wait_queue(&s->in_progress_wait, &wait);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
spin_unlock(&s->in_progress_wait.lock);
|
|
|
|
}
|
|
|
|
return true;
|
2019-10-02 06:14:17 -04:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Flush a list of buffers.
|
|
|
|
*/
|
|
|
|
static void flush_bios(struct bio *bio)
|
|
|
|
{
|
|
|
|
struct bio *n;
|
|
|
|
|
|
|
|
while (bio) {
|
|
|
|
n = bio->bi_next;
|
|
|
|
bio->bi_next = NULL;
|
|
|
|
generic_make_request(bio);
|
|
|
|
bio = n;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
|
2009-12-10 23:52:30 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush a list of buffers.
|
|
|
|
*/
|
|
|
|
static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
|
|
|
|
{
|
|
|
|
struct bio *n;
|
|
|
|
int r;
|
|
|
|
|
|
|
|
while (bio) {
|
|
|
|
n = bio->bi_next;
|
|
|
|
bio->bi_next = NULL;
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
r = do_origin(s->origin, bio, false);
|
2009-12-10 23:52:30 +00:00
|
|
|
if (r == DM_MAPIO_REMAPPED)
|
|
|
|
generic_make_request(bio);
|
|
|
|
bio = n;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Error a list of buffers.
|
|
|
|
*/
|
|
|
|
static void error_bios(struct bio *bio)
|
|
|
|
{
|
|
|
|
struct bio *n;
|
|
|
|
|
|
|
|
while (bio) {
|
|
|
|
n = bio->bi_next;
|
|
|
|
bio->bi_next = NULL;
|
2007-09-27 12:47:43 +02:00
|
|
|
bio_io_error(bio);
|
2005-04-16 15:20:36 -07:00
|
|
|
bio = n;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-10-03 01:15:31 -07:00
|
|
|
static void __invalidate_snapshot(struct dm_snapshot *s, int err)
|
2006-03-27 01:17:45 -08:00
|
|
|
{
|
|
|
|
if (!s->valid)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (err == -EIO)
|
|
|
|
DMERR("Invalidating snapshot: Error reading/writing.");
|
|
|
|
else if (err == -ENOMEM)
|
|
|
|
DMERR("Invalidating snapshot: Unable to allocate exception.");
|
|
|
|
|
2009-04-02 19:55:31 +01:00
|
|
|
if (s->store->type->drop_snapshot)
|
|
|
|
s->store->type->drop_snapshot(s->store);
|
2006-03-27 01:17:45 -08:00
|
|
|
|
|
|
|
s->valid = 0;
|
|
|
|
|
2009-12-10 23:52:12 +00:00
|
|
|
dm_table_event(s->ti->table);
|
2006-03-27 01:17:45 -08:00
|
|
|
}
|
|
|
|
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
static void invalidate_snapshot(struct dm_snapshot *s, int err)
|
|
|
|
{
|
|
|
|
down_write(&s->lock);
|
|
|
|
__invalidate_snapshot(s, err);
|
|
|
|
up_write(&s->lock);
|
|
|
|
}
|
|
|
|
|
2016-01-08 19:07:55 -05:00
|
|
|
static void pending_complete(void *context, int success)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2016-01-08 19:07:55 -05:00
|
|
|
struct dm_snap_pending_exception *pe = context;
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception *e;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct dm_snapshot *s = pe->snap;
|
2006-10-03 01:15:29 -07:00
|
|
|
struct bio *origin_bios = NULL;
|
|
|
|
struct bio *snapshot_bios = NULL;
|
2011-08-02 12:32:04 +01:00
|
|
|
struct bio *full_bio = NULL;
|
2019-03-17 14:22:57 +02:00
|
|
|
struct dm_exception_table_lock lock;
|
2006-10-03 01:15:29 -07:00
|
|
|
int error = 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
|
|
|
|
|
2006-03-27 01:17:45 -08:00
|
|
|
if (!success) {
|
|
|
|
/* Read/write error - snapshot is unusable */
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
invalidate_snapshot(s, -EIO);
|
2006-10-03 01:15:29 -07:00
|
|
|
error = 1;
|
2019-03-17 14:22:57 +02:00
|
|
|
|
|
|
|
dm_exception_table_lock(&lock);
|
2006-03-27 01:17:45 -08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-01-13 19:13:36 -05:00
|
|
|
e = alloc_completed_exception(GFP_NOIO);
|
2006-03-27 01:17:45 -08:00
|
|
|
if (!e) {
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
invalidate_snapshot(s, -ENOMEM);
|
2006-10-03 01:15:29 -07:00
|
|
|
error = 1;
|
2019-03-17 14:22:57 +02:00
|
|
|
|
|
|
|
dm_exception_table_lock(&lock);
|
2006-03-27 01:17:45 -08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
*e = pe->e;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
down_read(&s->lock);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock(&lock);
|
2006-03-27 01:17:45 -08:00
|
|
|
if (!s->valid) {
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
up_read(&s->lock);
|
2009-12-10 23:52:11 +00:00
|
|
|
free_completed_exception(e);
|
2006-10-03 01:15:29 -07:00
|
|
|
error = 1;
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
|
2006-03-27 01:17:45 -08:00
|
|
|
goto out;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2006-10-03 01:15:29 -07:00
|
|
|
/*
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
* Add a proper exception. After inserting the completed exception all
|
|
|
|
* subsequent snapshot reads to this chunk will be redirected to the
|
|
|
|
* COW device. This ensures that we do not starve. Moreover, as long
|
|
|
|
* as the pending exception exists, neither origin writes nor snapshot
|
|
|
|
* merging can overwrite the chunk in origin.
|
2006-10-03 01:15:29 -07:00
|
|
|
*/
|
2009-12-10 23:52:11 +00:00
|
|
|
dm_insert_exception(&s->complete, e);
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
up_read(&s->lock);
|
2006-03-27 01:17:45 -08:00
|
|
|
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
/* Wait for conflicting reads to drain */
|
|
|
|
if (__chunk_is_tracked(s, pe->e.old_chunk)) {
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
__check_for_conflicting_io(s, pe->e.old_chunk);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock(&lock);
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
}
|
|
|
|
|
2011-08-02 12:32:03 +01:00
|
|
|
out:
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
/* Remove the in-flight exception from the list */
|
2009-12-10 23:52:11 +00:00
|
|
|
dm_remove_exception(&pe->e);
|
2019-03-17 14:22:57 +02:00
|
|
|
|
|
|
|
dm_exception_table_unlock(&lock);
|
|
|
|
|
2006-10-03 01:15:29 -07:00
|
|
|
snapshot_bios = bio_list_get(&pe->snapshot_bios);
|
2009-12-10 23:52:30 +00:00
|
|
|
origin_bios = bio_list_get(&pe->origin_bios);
|
2011-08-02 12:32:04 +01:00
|
|
|
full_bio = pe->full_bio;
|
2015-11-25 16:03:31 -05:00
|
|
|
if (full_bio)
|
2011-08-02 12:32:04 +01:00
|
|
|
full_bio->bi_end_io = pe->full_bio_end_io;
|
2009-12-10 23:52:34 +00:00
|
|
|
increment_pending_exceptions_done_count();
|
|
|
|
|
2006-10-03 01:15:29 -07:00
|
|
|
/* Submit any pending write bios */
|
2011-08-02 12:32:04 +01:00
|
|
|
if (error) {
|
|
|
|
if (full_bio)
|
|
|
|
bio_io_error(full_bio);
|
2006-10-03 01:15:29 -07:00
|
|
|
error_bios(snapshot_bios);
|
2011-08-02 12:32:04 +01:00
|
|
|
} else {
|
|
|
|
if (full_bio)
|
2015-07-20 15:29:37 +02:00
|
|
|
bio_endio(full_bio);
|
2006-10-03 01:15:29 -07:00
|
|
|
flush_bios(snapshot_bios);
|
2011-08-02 12:32:04 +01:00
|
|
|
}
|
2006-10-03 01:15:29 -07:00
|
|
|
|
2009-12-10 23:52:30 +00:00
|
|
|
retry_origin_bios(s, origin_bios);
|
2015-02-17 14:34:00 -05:00
|
|
|
|
|
|
|
free_pending_exception(pe);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
static void complete_exception(struct dm_snap_pending_exception *pe)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *s = pe->snap;
|
|
|
|
|
2016-01-08 19:07:55 -05:00
|
|
|
/* Update the metadata if we are persistent */
|
|
|
|
s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
|
|
|
|
pending_complete, pe);
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Called when the copy I/O has finished. kcopyd actually runs
|
|
|
|
* this code so don't block.
|
|
|
|
*/
|
2008-03-28 14:16:10 -07:00
|
|
|
static void copy_callback(int read_err, unsigned long write_err, void *context)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-07-12 17:26:32 +01:00
|
|
|
struct dm_snap_pending_exception *pe = context;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct dm_snapshot *s = pe->snap;
|
|
|
|
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
pe->copy_error = read_err || write_err;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
if (pe->exception_sequence == s->exception_complete_sequence) {
|
2018-08-07 16:56:00 -04:00
|
|
|
struct rb_node *next;
|
|
|
|
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
s->exception_complete_sequence++;
|
|
|
|
complete_exception(pe);
|
|
|
|
|
2018-08-07 16:56:00 -04:00
|
|
|
next = rb_first(&s->out_of_order_tree);
|
|
|
|
while (next) {
|
|
|
|
pe = rb_entry(next, struct dm_snap_pending_exception,
|
|
|
|
out_of_order_node);
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
if (pe->exception_sequence != s->exception_complete_sequence)
|
|
|
|
break;
|
2018-08-07 16:56:00 -04:00
|
|
|
next = rb_next(next);
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
s->exception_complete_sequence++;
|
2018-08-07 16:56:00 -04:00
|
|
|
rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
complete_exception(pe);
|
2018-08-07 16:56:00 -04:00
|
|
|
cond_resched();
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
}
|
|
|
|
} else {
|
2018-08-07 16:56:00 -04:00
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct rb_node **p = &s->out_of_order_tree.rb_node;
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
struct dm_snap_pending_exception *pe2;
|
|
|
|
|
2018-08-07 16:56:00 -04:00
|
|
|
while (*p) {
|
|
|
|
pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
|
|
|
|
parent = *p;
|
|
|
|
|
|
|
|
BUG_ON(pe->exception_sequence == pe2->exception_sequence);
|
|
|
|
if (pe->exception_sequence < pe2->exception_sequence)
|
|
|
|
p = &((*p)->rb_left);
|
|
|
|
else
|
|
|
|
p = &((*p)->rb_right);
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
}
|
2018-08-07 16:56:00 -04:00
|
|
|
|
|
|
|
rb_link_node(&pe->out_of_order_node, parent, p);
|
|
|
|
rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
}
|
2019-10-02 06:14:17 -04:00
|
|
|
account_end_copy(s);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Dispatches the copy operation to kcopyd.
|
|
|
|
*/
|
2007-07-12 17:26:32 +01:00
|
|
|
static void start_copy(struct dm_snap_pending_exception *pe)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct dm_snapshot *s = pe->snap;
|
2008-04-24 21:43:17 +01:00
|
|
|
struct dm_io_region src, dest;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct block_device *bdev = s->origin->bdev;
|
|
|
|
sector_t dev_size;
|
|
|
|
|
|
|
|
dev_size = get_dev_size(bdev);
|
|
|
|
|
|
|
|
src.bdev = bdev;
|
2009-04-02 19:55:33 +01:00
|
|
|
src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
|
2009-10-16 23:18:17 +01:00
|
|
|
src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-12-10 23:52:12 +00:00
|
|
|
dest.bdev = s->cow->bdev;
|
2009-04-02 19:55:33 +01:00
|
|
|
dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
|
2005-04-16 15:20:36 -07:00
|
|
|
dest.count = src.count;
|
|
|
|
|
|
|
|
/* Hand over to kcopyd */
|
2019-10-02 06:14:17 -04:00
|
|
|
account_start_copy(s);
|
2011-08-02 12:32:03 +01:00
|
|
|
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2015-07-20 15:29:37 +02:00
|
|
|
static void full_bio_end_io(struct bio *bio)
|
2011-08-02 12:32:04 +01:00
|
|
|
{
|
|
|
|
void *callback_data = bio->bi_private;
|
|
|
|
|
2017-06-03 09:38:06 +02:00
|
|
|
dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
|
2011-08-02 12:32:04 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void start_full_bio(struct dm_snap_pending_exception *pe,
|
|
|
|
struct bio *bio)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *s = pe->snap;
|
|
|
|
void *callback_data;
|
|
|
|
|
|
|
|
pe->full_bio = bio;
|
|
|
|
pe->full_bio_end_io = bio->bi_end_io;
|
|
|
|
|
2019-10-02 06:14:17 -04:00
|
|
|
account_start_copy(s);
|
2011-08-02 12:32:04 +01:00
|
|
|
callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
|
|
|
|
copy_callback, pe);
|
|
|
|
|
|
|
|
bio->bi_end_io = full_bio_end_io;
|
|
|
|
bio->bi_private = callback_data;
|
|
|
|
|
|
|
|
generic_make_request(bio);
|
|
|
|
}
|
|
|
|
|
2009-04-02 19:55:25 +01:00
|
|
|
static struct dm_snap_pending_exception *
|
|
|
|
__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
|
|
|
|
{
|
2009-12-10 23:52:11 +00:00
|
|
|
struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
|
2009-04-02 19:55:25 +01:00
|
|
|
|
|
|
|
if (!e)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return container_of(e, struct dm_snap_pending_exception, e);
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
* Inserts a pending exception into the pending table.
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
* NOTE: a write lock must be held on the chunk's pending exception table slot
|
|
|
|
* before calling this.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2007-07-12 17:26:32 +01:00
|
|
|
static struct dm_snap_pending_exception *
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
__insert_pending_exception(struct dm_snapshot *s,
|
|
|
|
struct dm_snap_pending_exception *pe, chunk_t chunk)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-03-27 01:17:45 -08:00
|
|
|
pe->e.old_chunk = chunk;
|
|
|
|
bio_list_init(&pe->origin_bios);
|
|
|
|
bio_list_init(&pe->snapshot_bios);
|
|
|
|
pe->started = 0;
|
2011-08-02 12:32:04 +01:00
|
|
|
pe->full_bio = NULL;
|
2006-03-27 01:17:45 -08:00
|
|
|
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
spin_lock(&s->pe_allocation_lock);
|
2009-04-02 19:55:31 +01:00
|
|
|
if (s->store->type->prepare_exception(s->store, &pe->e)) {
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
spin_unlock(&s->pe_allocation_lock);
|
2006-03-27 01:17:45 -08:00
|
|
|
free_pending_exception(pe);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
pe->exception_sequence = s->exception_start_sequence++;
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
spin_unlock(&s->pe_allocation_lock);
|
dm snapshot: avoid snapshot space leak on crash
There is a possible leak of snapshot space in case of crash.
The reason for space leaking is that chunks in the snapshot device are
allocated sequentially, but they are finished (and stored in the metadata)
out of order, depending on the order in which copying finished.
For example, supposed that the metadata contains the following records
SUPERBLOCK
METADATA (blocks 0 ... 250)
DATA 0
DATA 1
DATA 2
...
DATA 250
Now suppose that you allocate 10 new data blocks 251-260. Suppose that
copying of these blocks finish out of order (block 260 finished first
and the block 251 finished last). Now, the snapshot device looks like
this:
SUPERBLOCK
METADATA (blocks 0 ... 250, 260, 259, 258, 257, 256)
DATA 0
DATA 1
DATA 2
...
DATA 250
DATA 251
DATA 252
DATA 253
DATA 254
DATA 255
METADATA (blocks 255, 254, 253, 252, 251)
DATA 256
DATA 257
DATA 258
DATA 259
DATA 260
Now, if the machine crashes after writing the first metadata block but
before writing the second metadata block, the space for areas DATA 250-255
is leaked, it contains no valid data and it will never be used in the
future.
This patch makes dm-snapshot complete exceptions in the same order they
were allocated, thus fixing this bug.
Note: when backporting this patch to the stable kernel, change the version
field in the following way:
* if version in the stable kernel is {1, 11, 1}, change it to {1, 12, 0}
* if version in the stable kernel is {1, 10, 0} or {1, 10, 1}, change it
to {1, 10, 2}
Userspace reads the version to determine if the bug was fixed, so the
version change is needed.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2013-11-29 18:13:37 -05:00
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
dm_insert_exception(&s->pending, &pe->e);
|
2006-03-27 01:17:45 -08:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return pe;
|
|
|
|
}
|
|
|
|
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
/*
|
|
|
|
* Looks to see if this snapshot already has a pending exception
|
|
|
|
* for this chunk, otherwise it allocates a new one and inserts
|
|
|
|
* it into the pending table.
|
|
|
|
*
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
* NOTE: a write lock must be held on the chunk's pending exception table slot
|
|
|
|
* before calling this.
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
*/
|
|
|
|
static struct dm_snap_pending_exception *
|
|
|
|
__find_pending_exception(struct dm_snapshot *s,
|
|
|
|
struct dm_snap_pending_exception *pe, chunk_t chunk)
|
|
|
|
{
|
|
|
|
struct dm_snap_pending_exception *pe2;
|
|
|
|
|
|
|
|
pe2 = __lookup_pending_exception(s, chunk);
|
|
|
|
if (pe2) {
|
|
|
|
free_pending_exception(pe);
|
|
|
|
return pe2;
|
|
|
|
}
|
|
|
|
|
|
|
|
return __insert_pending_exception(s, pe, chunk);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:10 +00:00
|
|
|
static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
|
2008-02-08 02:11:27 +00:00
|
|
|
struct bio *bio, chunk_t chunk)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2017-08-23 19:10:32 +02:00
|
|
|
bio_set_dev(bio, s->cow->bdev);
|
2013-10-11 15:44:27 -07:00
|
|
|
bio->bi_iter.bi_sector =
|
|
|
|
chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
|
|
|
|
(chunk - e->old_chunk)) +
|
|
|
|
(bio->bi_iter.bi_sector & s->store->chunk_mask);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2019-06-19 17:05:54 -04:00
|
|
|
static void zero_callback(int read_err, unsigned long write_err, void *context)
|
|
|
|
{
|
|
|
|
struct bio *bio = context;
|
|
|
|
struct dm_snapshot *s = bio->bi_private;
|
|
|
|
|
2019-10-02 06:14:17 -04:00
|
|
|
account_end_copy(s);
|
2019-06-19 17:05:54 -04:00
|
|
|
bio->bi_status = write_err ? BLK_STS_IOERR : 0;
|
|
|
|
bio_endio(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
|
|
|
|
struct bio *bio, chunk_t chunk)
|
|
|
|
{
|
|
|
|
struct dm_io_region dest;
|
|
|
|
|
|
|
|
dest.bdev = s->cow->bdev;
|
|
|
|
dest.sector = bio->bi_iter.bi_sector;
|
|
|
|
dest.count = s->store->chunk_size;
|
|
|
|
|
2019-10-02 06:14:17 -04:00
|
|
|
account_start_copy(s);
|
2019-06-19 17:05:54 -04:00
|
|
|
WARN_ON_ONCE(bio->bi_private);
|
|
|
|
bio->bi_private = s;
|
|
|
|
dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio)
|
|
|
|
{
|
|
|
|
return bio->bi_iter.bi_size ==
|
|
|
|
(s->store->chunk_size << SECTOR_SHIFT);
|
|
|
|
}
|
|
|
|
|
2012-12-21 20:23:41 +00:00
|
|
|
static int snapshot_map(struct dm_target *ti, struct bio *bio)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception *e;
|
2007-07-12 17:26:32 +01:00
|
|
|
struct dm_snapshot *s = ti->private;
|
2006-12-08 02:41:06 -08:00
|
|
|
int r = DM_MAPIO_REMAPPED;
|
2005-04-16 15:20:36 -07:00
|
|
|
chunk_t chunk;
|
2007-07-12 17:26:32 +01:00
|
|
|
struct dm_snap_pending_exception *pe = NULL;
|
2019-03-17 14:22:57 +02:00
|
|
|
struct dm_exception_table_lock lock;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2012-12-21 20:23:41 +00:00
|
|
|
init_tracked_chunk(bio);
|
|
|
|
|
2016-08-05 15:35:16 -06:00
|
|
|
if (bio->bi_opf & REQ_PREFLUSH) {
|
2017-08-23 19:10:32 +02:00
|
|
|
bio_set_dev(bio, s->cow->bdev);
|
2009-06-22 10:12:25 +01:00
|
|
|
return DM_MAPIO_REMAPPED;
|
|
|
|
}
|
|
|
|
|
2013-10-11 15:44:27 -07:00
|
|
|
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock_init(s, chunk, &lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/* Full snapshots are not usable */
|
2006-03-27 01:17:45 -08:00
|
|
|
/* To get here the table must be live so s->active is always set. */
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!s->valid)
|
2017-06-03 09:38:02 +02:00
|
|
|
return DM_MAPIO_KILL;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
if (bio_data_dir(bio) == WRITE) {
|
|
|
|
while (unlikely(!wait_for_in_progress(s, false)))
|
|
|
|
; /* wait_for_in_progress() has slept */
|
|
|
|
}
|
|
|
|
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
down_read(&s->lock);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock(&lock);
|
2006-10-03 01:15:28 -07:00
|
|
|
|
2016-07-19 11:28:41 +02:00
|
|
|
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
|
|
|
|
bio_data_dir(bio) == WRITE)) {
|
2017-06-03 09:38:02 +02:00
|
|
|
r = DM_MAPIO_KILL;
|
2006-10-03 01:15:28 -07:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2019-06-19 17:05:54 -04:00
|
|
|
if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
|
|
|
|
if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
|
|
|
|
/*
|
|
|
|
* passdown discard to origin (without triggering
|
|
|
|
* snapshot exceptions via do_origin; doing so would
|
|
|
|
* defeat the goal of freeing space in origin that is
|
|
|
|
* implied by the "discard_passdown_origin" feature)
|
|
|
|
*/
|
|
|
|
bio_set_dev(bio, s->origin->bdev);
|
|
|
|
track_chunk(s, bio, chunk);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
/* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
|
|
|
|
}
|
|
|
|
|
2006-10-03 01:15:28 -07:00
|
|
|
/* If the block is already remapped - use that, else remap it */
|
2009-12-10 23:52:11 +00:00
|
|
|
e = dm_lookup_exception(&s->complete, chunk);
|
2006-10-03 01:15:28 -07:00
|
|
|
if (e) {
|
2008-02-08 02:11:27 +00:00
|
|
|
remap_exception(s, e, bio, chunk);
|
2019-06-19 17:05:54 -04:00
|
|
|
if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
|
|
|
|
io_overlaps_chunk(s, bio)) {
|
|
|
|
dm_exception_table_unlock(&lock);
|
|
|
|
up_read(&s->lock);
|
|
|
|
zero_exception(s, e, bio, chunk);
|
|
|
|
r = DM_MAPIO_SUBMITTED; /* discard is not issued */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
|
|
|
|
/*
|
|
|
|
* If no exception exists, complete discard immediately
|
|
|
|
* otherwise it'll trigger copy-out.
|
|
|
|
*/
|
|
|
|
bio_endio(bio);
|
|
|
|
r = DM_MAPIO_SUBMITTED;
|
2006-10-03 01:15:28 -07:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Write to snapshot - higher level takes care of RW/RO
|
|
|
|
* flags so we should only get this if we are
|
|
|
|
* writeable.
|
|
|
|
*/
|
2016-07-19 11:28:41 +02:00
|
|
|
if (bio_data_dir(bio) == WRITE) {
|
2009-04-02 19:55:25 +01:00
|
|
|
pe = __lookup_pending_exception(s, chunk);
|
2006-03-27 01:17:45 -08:00
|
|
|
if (!pe) {
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
2009-04-02 19:55:25 +01:00
|
|
|
pe = alloc_pending_exception(s);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock(&lock);
|
2009-04-02 19:55:25 +01:00
|
|
|
|
2009-12-10 23:52:11 +00:00
|
|
|
e = dm_lookup_exception(&s->complete, chunk);
|
2009-04-02 19:55:26 +01:00
|
|
|
if (e) {
|
|
|
|
free_pending_exception(pe);
|
|
|
|
remap_exception(s, e, bio, chunk);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2009-04-02 19:55:25 +01:00
|
|
|
pe = __find_pending_exception(s, pe, chunk);
|
2009-04-02 19:55:25 +01:00
|
|
|
if (!pe) {
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
up_read(&s->lock);
|
|
|
|
|
|
|
|
down_write(&s->lock);
|
2019-03-17 14:22:57 +02:00
|
|
|
|
2015-10-08 18:05:41 -04:00
|
|
|
if (s->store->userspace_supports_overflow) {
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
if (s->valid && !s->snapshot_overflowed) {
|
|
|
|
s->snapshot_overflowed = 1;
|
|
|
|
DMERR("Snapshot overflowed: Unable to allocate exception.");
|
|
|
|
}
|
2015-10-08 18:05:41 -04:00
|
|
|
} else
|
|
|
|
__invalidate_snapshot(s, -ENOMEM);
|
2019-03-17 14:22:57 +02:00
|
|
|
up_write(&s->lock);
|
|
|
|
|
2017-06-03 09:38:02 +02:00
|
|
|
r = DM_MAPIO_KILL;
|
2019-03-17 14:22:57 +02:00
|
|
|
goto out;
|
2009-04-02 19:55:25 +01:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2008-02-08 02:11:27 +00:00
|
|
|
remap_exception(s, &pe->e, bio, chunk);
|
2006-03-27 01:17:45 -08:00
|
|
|
|
2006-12-08 02:41:06 -08:00
|
|
|
r = DM_MAPIO_SUBMITTED;
|
2006-10-03 01:15:28 -07:00
|
|
|
|
2019-06-19 17:05:54 -04:00
|
|
|
if (!pe->started && io_overlaps_chunk(s, bio)) {
|
2011-08-02 12:32:04 +01:00
|
|
|
pe->started = 1;
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
up_read(&s->lock);
|
|
|
|
|
2011-08-02 12:32:04 +01:00
|
|
|
start_full_bio(pe, bio);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
bio_list_add(&pe->snapshot_bios, bio);
|
|
|
|
|
2006-03-27 01:17:45 -08:00
|
|
|
if (!pe->started) {
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
/* this is protected by the exception table lock */
|
2006-03-27 01:17:45 -08:00
|
|
|
pe->started = 1;
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
up_read(&s->lock);
|
|
|
|
|
2006-03-27 01:17:45 -08:00
|
|
|
start_copy(pe);
|
2006-10-03 01:15:28 -07:00
|
|
|
goto out;
|
|
|
|
}
|
2008-07-21 12:00:32 +01:00
|
|
|
} else {
|
2017-08-23 19:10:32 +02:00
|
|
|
bio_set_dev(bio, s->origin->bdev);
|
2012-12-21 20:23:41 +00:00
|
|
|
track_chunk(s, bio, chunk);
|
2008-07-21 12:00:32 +01:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2011-08-02 12:32:03 +01:00
|
|
|
out_unlock:
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
up_read(&s->lock);
|
2011-08-02 12:32:03 +01:00
|
|
|
out:
|
2005-04-16 15:20:36 -07:00
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:31 +00:00
|
|
|
/*
|
|
|
|
* A snapshot-merge target behaves like a combination of a snapshot
|
|
|
|
* target and a snapshot-origin target. It only generates new
|
|
|
|
* exceptions in other snapshots and not in the one that is being
|
|
|
|
* merged.
|
|
|
|
*
|
|
|
|
* For each chunk, if there is an existing exception, it is used to
|
|
|
|
* redirect I/O to the cow device. Otherwise I/O is sent to the origin,
|
|
|
|
* which in turn might generate exceptions in other snapshots.
|
2009-12-10 23:52:33 +00:00
|
|
|
* If merging is currently taking place on the chunk in question, the
|
|
|
|
* I/O is deferred by adding it to s->bios_queued_during_merge.
|
2009-12-10 23:52:31 +00:00
|
|
|
*/
|
2012-12-21 20:23:41 +00:00
|
|
|
static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
|
2009-12-10 23:52:31 +00:00
|
|
|
{
|
|
|
|
struct dm_exception *e;
|
|
|
|
struct dm_snapshot *s = ti->private;
|
|
|
|
int r = DM_MAPIO_REMAPPED;
|
|
|
|
chunk_t chunk;
|
|
|
|
|
2012-12-21 20:23:41 +00:00
|
|
|
init_tracked_chunk(bio);
|
|
|
|
|
2016-08-05 15:35:16 -06:00
|
|
|
if (bio->bi_opf & REQ_PREFLUSH) {
|
2013-03-01 22:45:47 +00:00
|
|
|
if (!dm_bio_get_target_bio_nr(bio))
|
2017-08-23 19:10:32 +02:00
|
|
|
bio_set_dev(bio, s->origin->bdev);
|
2009-12-10 23:52:31 +00:00
|
|
|
else
|
2017-08-23 19:10:32 +02:00
|
|
|
bio_set_dev(bio, s->cow->bdev);
|
2009-12-10 23:52:31 +00:00
|
|
|
return DM_MAPIO_REMAPPED;
|
|
|
|
}
|
|
|
|
|
2019-07-17 11:12:30 -04:00
|
|
|
if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
|
|
|
|
/* Once merging, discards no longer effect change */
|
|
|
|
bio_endio(bio);
|
|
|
|
return DM_MAPIO_SUBMITTED;
|
|
|
|
}
|
|
|
|
|
2013-10-11 15:44:27 -07:00
|
|
|
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
|
2009-12-10 23:52:31 +00:00
|
|
|
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&s->lock);
|
2009-12-10 23:52:31 +00:00
|
|
|
|
2009-12-10 23:52:36 +00:00
|
|
|
/* Full merging snapshots are redirected to the origin */
|
|
|
|
if (!s->valid)
|
|
|
|
goto redirect_to_origin;
|
2009-12-10 23:52:31 +00:00
|
|
|
|
|
|
|
/* If the block is already remapped - use that */
|
|
|
|
e = dm_lookup_exception(&s->complete, chunk);
|
|
|
|
if (e) {
|
2009-12-10 23:52:33 +00:00
|
|
|
/* Queue writes overlapping with chunks being merged */
|
2016-07-19 11:28:41 +02:00
|
|
|
if (bio_data_dir(bio) == WRITE &&
|
2009-12-10 23:52:33 +00:00
|
|
|
chunk >= s->first_merging_chunk &&
|
|
|
|
chunk < (s->first_merging_chunk +
|
|
|
|
s->num_merging_chunks)) {
|
2017-08-23 19:10:32 +02:00
|
|
|
bio_set_dev(bio, s->origin->bdev);
|
2009-12-10 23:52:33 +00:00
|
|
|
bio_list_add(&s->bios_queued_during_merge, bio);
|
|
|
|
r = DM_MAPIO_SUBMITTED;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2009-12-10 23:52:33 +00:00
|
|
|
|
2009-12-10 23:52:31 +00:00
|
|
|
remap_exception(s, e, bio, chunk);
|
2009-12-10 23:52:33 +00:00
|
|
|
|
2016-07-19 11:28:41 +02:00
|
|
|
if (bio_data_dir(bio) == WRITE)
|
2012-12-21 20:23:41 +00:00
|
|
|
track_chunk(s, bio, chunk);
|
2009-12-10 23:52:31 +00:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:36 +00:00
|
|
|
redirect_to_origin:
|
2017-08-23 19:10:32 +02:00
|
|
|
bio_set_dev(bio, s->origin->bdev);
|
2009-12-10 23:52:31 +00:00
|
|
|
|
2016-07-19 11:28:41 +02:00
|
|
|
if (bio_data_dir(bio) == WRITE) {
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&s->lock);
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
return do_origin(s->origin, bio, false);
|
2009-12-10 23:52:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
out_unlock:
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&s->lock);
|
2009-12-10 23:52:31 +00:00
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2017-06-03 09:38:06 +02:00
|
|
|
static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
|
|
|
|
blk_status_t *error)
|
2008-07-21 12:00:32 +01:00
|
|
|
{
|
|
|
|
struct dm_snapshot *s = ti->private;
|
|
|
|
|
2012-12-21 20:23:41 +00:00
|
|
|
if (is_bio_tracked(bio))
|
|
|
|
stop_tracking_chunk(s, bio);
|
2008-07-21 12:00:32 +01:00
|
|
|
|
2017-06-03 09:38:03 +02:00
|
|
|
return DM_ENDIO_DONE;
|
2008-07-21 12:00:32 +01:00
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
static void snapshot_merge_presuspend(struct dm_target *ti)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *s = ti->private;
|
|
|
|
|
|
|
|
stop_merge(s);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
static int snapshot_preresume(struct dm_target *ti)
|
|
|
|
{
|
|
|
|
int r = 0;
|
|
|
|
struct dm_snapshot *s = ti->private;
|
|
|
|
struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
|
|
|
|
|
|
|
|
down_read(&_origins_lock);
|
2009-12-10 23:52:32 +00:00
|
|
|
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
2009-12-10 23:52:24 +00:00
|
|
|
if (snap_src && snap_dest) {
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_read(&snap_src->lock);
|
2009-12-10 23:52:24 +00:00
|
|
|
if (s == snap_src) {
|
|
|
|
DMERR("Unable to resume snapshot source until "
|
|
|
|
"handover completes.");
|
|
|
|
r = -EINVAL;
|
2011-01-13 19:59:59 +00:00
|
|
|
} else if (!dm_suspended(snap_src->ti)) {
|
2009-12-10 23:52:24 +00:00
|
|
|
DMERR("Unable to perform snapshot handover until "
|
|
|
|
"source is suspended.");
|
|
|
|
r = -EINVAL;
|
|
|
|
}
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_read(&snap_src->lock);
|
2009-12-10 23:52:24 +00:00
|
|
|
}
|
|
|
|
up_read(&_origins_lock);
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
static void snapshot_resume(struct dm_target *ti)
|
|
|
|
{
|
2007-07-12 17:26:32 +01:00
|
|
|
struct dm_snapshot *s = ti->private;
|
dm snapshot: suspend merging snapshot when doing exception handover
The "dm snapshot: suspend origin when doing exception handover" commit
fixed a exception store handover bug associated with pending exceptions
to the "snapshot-origin" target.
However, a similar problem exists in snapshot merging. When snapshot
merging is in progress, we use the target "snapshot-merge" instead of
"snapshot-origin". Consequently, during exception store handover, we
must find the snapshot-merge target and suspend its associated
mapped_device.
To avoid lockdep warnings, the target must be suspended and resumed
without holding _origins_lock.
Introduce a dm_hold() function that grabs a reference on a
mapped_device, but unlike dm_get(), it doesn't crash if the device has
the DMF_FREEING flag set, it returns an error in this case.
In snapshot_resume() we grab the reference to the origin device using
dm_hold() while holding _origins_lock (_origins_lock guarantees that the
device won't disappear). Then we release _origins_lock, suspend the
device and grab _origins_lock again.
NOTE to stable@ people:
When backporting to kernels 3.18 and older, use dm_internal_suspend and
dm_internal_resume instead of dm_internal_suspend_fast and
dm_internal_resume_fast.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2015-02-26 11:41:28 -05:00
|
|
|
struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
|
2015-02-26 11:40:35 -05:00
|
|
|
struct dm_origin *o;
|
|
|
|
struct mapped_device *origin_md = NULL;
|
dm snapshot: suspend merging snapshot when doing exception handover
The "dm snapshot: suspend origin when doing exception handover" commit
fixed a exception store handover bug associated with pending exceptions
to the "snapshot-origin" target.
However, a similar problem exists in snapshot merging. When snapshot
merging is in progress, we use the target "snapshot-merge" instead of
"snapshot-origin". Consequently, during exception store handover, we
must find the snapshot-merge target and suspend its associated
mapped_device.
To avoid lockdep warnings, the target must be suspended and resumed
without holding _origins_lock.
Introduce a dm_hold() function that grabs a reference on a
mapped_device, but unlike dm_get(), it doesn't crash if the device has
the DMF_FREEING flag set, it returns an error in this case.
In snapshot_resume() we grab the reference to the origin device using
dm_hold() while holding _origins_lock (_origins_lock guarantees that the
device won't disappear). Then we release _origins_lock, suspend the
device and grab _origins_lock again.
NOTE to stable@ people:
When backporting to kernels 3.18 and older, use dm_internal_suspend and
dm_internal_resume instead of dm_internal_suspend_fast and
dm_internal_resume_fast.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2015-02-26 11:41:28 -05:00
|
|
|
bool must_restart_merging = false;
|
2009-12-10 23:52:24 +00:00
|
|
|
|
|
|
|
down_read(&_origins_lock);
|
2015-02-26 11:40:35 -05:00
|
|
|
|
|
|
|
o = __lookup_dm_origin(s->origin->bdev);
|
|
|
|
if (o)
|
|
|
|
origin_md = dm_table_get_md(o->ti->table);
|
dm snapshot: suspend merging snapshot when doing exception handover
The "dm snapshot: suspend origin when doing exception handover" commit
fixed a exception store handover bug associated with pending exceptions
to the "snapshot-origin" target.
However, a similar problem exists in snapshot merging. When snapshot
merging is in progress, we use the target "snapshot-merge" instead of
"snapshot-origin". Consequently, during exception store handover, we
must find the snapshot-merge target and suspend its associated
mapped_device.
To avoid lockdep warnings, the target must be suspended and resumed
without holding _origins_lock.
Introduce a dm_hold() function that grabs a reference on a
mapped_device, but unlike dm_get(), it doesn't crash if the device has
the DMF_FREEING flag set, it returns an error in this case.
In snapshot_resume() we grab the reference to the origin device using
dm_hold() while holding _origins_lock (_origins_lock guarantees that the
device won't disappear). Then we release _origins_lock, suspend the
device and grab _origins_lock again.
NOTE to stable@ people:
When backporting to kernels 3.18 and older, use dm_internal_suspend and
dm_internal_resume instead of dm_internal_suspend_fast and
dm_internal_resume_fast.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2015-02-26 11:41:28 -05:00
|
|
|
if (!origin_md) {
|
|
|
|
(void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
|
|
|
|
if (snap_merging)
|
|
|
|
origin_md = dm_table_get_md(snap_merging->ti->table);
|
|
|
|
}
|
2015-02-26 11:40:35 -05:00
|
|
|
if (origin_md == dm_table_get_md(ti->table))
|
|
|
|
origin_md = NULL;
|
dm snapshot: suspend merging snapshot when doing exception handover
The "dm snapshot: suspend origin when doing exception handover" commit
fixed a exception store handover bug associated with pending exceptions
to the "snapshot-origin" target.
However, a similar problem exists in snapshot merging. When snapshot
merging is in progress, we use the target "snapshot-merge" instead of
"snapshot-origin". Consequently, during exception store handover, we
must find the snapshot-merge target and suspend its associated
mapped_device.
To avoid lockdep warnings, the target must be suspended and resumed
without holding _origins_lock.
Introduce a dm_hold() function that grabs a reference on a
mapped_device, but unlike dm_get(), it doesn't crash if the device has
the DMF_FREEING flag set, it returns an error in this case.
In snapshot_resume() we grab the reference to the origin device using
dm_hold() while holding _origins_lock (_origins_lock guarantees that the
device won't disappear). Then we release _origins_lock, suspend the
device and grab _origins_lock again.
NOTE to stable@ people:
When backporting to kernels 3.18 and older, use dm_internal_suspend and
dm_internal_resume instead of dm_internal_suspend_fast and
dm_internal_resume_fast.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2015-02-26 11:41:28 -05:00
|
|
|
if (origin_md) {
|
|
|
|
if (dm_hold(origin_md))
|
|
|
|
origin_md = NULL;
|
|
|
|
}
|
2015-02-26 11:40:35 -05:00
|
|
|
|
dm snapshot: suspend merging snapshot when doing exception handover
The "dm snapshot: suspend origin when doing exception handover" commit
fixed a exception store handover bug associated with pending exceptions
to the "snapshot-origin" target.
However, a similar problem exists in snapshot merging. When snapshot
merging is in progress, we use the target "snapshot-merge" instead of
"snapshot-origin". Consequently, during exception store handover, we
must find the snapshot-merge target and suspend its associated
mapped_device.
To avoid lockdep warnings, the target must be suspended and resumed
without holding _origins_lock.
Introduce a dm_hold() function that grabs a reference on a
mapped_device, but unlike dm_get(), it doesn't crash if the device has
the DMF_FREEING flag set, it returns an error in this case.
In snapshot_resume() we grab the reference to the origin device using
dm_hold() while holding _origins_lock (_origins_lock guarantees that the
device won't disappear). Then we release _origins_lock, suspend the
device and grab _origins_lock again.
NOTE to stable@ people:
When backporting to kernels 3.18 and older, use dm_internal_suspend and
dm_internal_resume instead of dm_internal_suspend_fast and
dm_internal_resume_fast.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2015-02-26 11:41:28 -05:00
|
|
|
up_read(&_origins_lock);
|
|
|
|
|
|
|
|
if (origin_md) {
|
2015-02-26 11:40:35 -05:00
|
|
|
dm_internal_suspend_fast(origin_md);
|
dm snapshot: suspend merging snapshot when doing exception handover
The "dm snapshot: suspend origin when doing exception handover" commit
fixed a exception store handover bug associated with pending exceptions
to the "snapshot-origin" target.
However, a similar problem exists in snapshot merging. When snapshot
merging is in progress, we use the target "snapshot-merge" instead of
"snapshot-origin". Consequently, during exception store handover, we
must find the snapshot-merge target and suspend its associated
mapped_device.
To avoid lockdep warnings, the target must be suspended and resumed
without holding _origins_lock.
Introduce a dm_hold() function that grabs a reference on a
mapped_device, but unlike dm_get(), it doesn't crash if the device has
the DMF_FREEING flag set, it returns an error in this case.
In snapshot_resume() we grab the reference to the origin device using
dm_hold() while holding _origins_lock (_origins_lock guarantees that the
device won't disappear). Then we release _origins_lock, suspend the
device and grab _origins_lock again.
NOTE to stable@ people:
When backporting to kernels 3.18 and older, use dm_internal_suspend and
dm_internal_resume instead of dm_internal_suspend_fast and
dm_internal_resume_fast.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2015-02-26 11:41:28 -05:00
|
|
|
if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
|
|
|
|
must_restart_merging = true;
|
|
|
|
stop_merge(snap_merging);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
down_read(&_origins_lock);
|
2015-02-26 11:40:35 -05:00
|
|
|
|
2009-12-10 23:52:32 +00:00
|
|
|
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
|
2009-12-10 23:52:24 +00:00
|
|
|
if (snap_src && snap_dest) {
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&snap_src->lock);
|
|
|
|
down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
|
2009-12-10 23:52:24 +00:00
|
|
|
__handover_exceptions(snap_src, snap_dest);
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&snap_dest->lock);
|
|
|
|
up_write(&snap_src->lock);
|
2009-12-10 23:52:24 +00:00
|
|
|
}
|
2015-02-26 11:40:35 -05:00
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
up_read(&_origins_lock);
|
|
|
|
|
dm snapshot: suspend merging snapshot when doing exception handover
The "dm snapshot: suspend origin when doing exception handover" commit
fixed a exception store handover bug associated with pending exceptions
to the "snapshot-origin" target.
However, a similar problem exists in snapshot merging. When snapshot
merging is in progress, we use the target "snapshot-merge" instead of
"snapshot-origin". Consequently, during exception store handover, we
must find the snapshot-merge target and suspend its associated
mapped_device.
To avoid lockdep warnings, the target must be suspended and resumed
without holding _origins_lock.
Introduce a dm_hold() function that grabs a reference on a
mapped_device, but unlike dm_get(), it doesn't crash if the device has
the DMF_FREEING flag set, it returns an error in this case.
In snapshot_resume() we grab the reference to the origin device using
dm_hold() while holding _origins_lock (_origins_lock guarantees that the
device won't disappear). Then we release _origins_lock, suspend the
device and grab _origins_lock again.
NOTE to stable@ people:
When backporting to kernels 3.18 and older, use dm_internal_suspend and
dm_internal_resume instead of dm_internal_suspend_fast and
dm_internal_resume_fast.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
2015-02-26 11:41:28 -05:00
|
|
|
if (origin_md) {
|
|
|
|
if (must_restart_merging)
|
|
|
|
start_merge(snap_merging);
|
|
|
|
dm_internal_resume_fast(origin_md);
|
|
|
|
dm_put(origin_md);
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:24 +00:00
|
|
|
/* Now we have correct chunk size, reregister */
|
|
|
|
reregister_snapshot(s);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&s->lock);
|
2006-02-01 03:04:50 -08:00
|
|
|
s->active = 1;
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&s->lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2012-07-27 15:08:00 +01:00
|
|
|
static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
|
2009-12-10 23:52:32 +00:00
|
|
|
{
|
2012-07-27 15:08:00 +01:00
|
|
|
uint32_t min_chunksize;
|
2009-12-10 23:52:32 +00:00
|
|
|
|
|
|
|
down_read(&_origins_lock);
|
|
|
|
min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
|
|
|
|
up_read(&_origins_lock);
|
|
|
|
|
|
|
|
return min_chunksize;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void snapshot_merge_resume(struct dm_target *ti)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *s = ti->private;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Handover exceptions from existing snapshot.
|
|
|
|
*/
|
|
|
|
snapshot_resume(ti);
|
|
|
|
|
|
|
|
/*
|
2012-07-27 15:08:00 +01:00
|
|
|
* snapshot-merge acts as an origin, so set ti->max_io_len
|
2009-12-10 23:52:32 +00:00
|
|
|
*/
|
2012-07-27 15:08:00 +01:00
|
|
|
ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
|
2009-12-10 23:52:32 +00:00
|
|
|
|
|
|
|
start_merge(s);
|
|
|
|
}
|
|
|
|
|
2013-03-01 22:45:44 +00:00
|
|
|
static void snapshot_status(struct dm_target *ti, status_type_t type,
|
|
|
|
unsigned status_flags, char *result, unsigned maxlen)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2009-04-02 19:55:34 +01:00
|
|
|
unsigned sz = 0;
|
2007-07-12 17:26:32 +01:00
|
|
|
struct dm_snapshot *snap = ti->private;
|
2019-06-19 17:05:54 -04:00
|
|
|
unsigned num_features;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case STATUSTYPE_INFO:
|
2009-12-10 23:51:53 +00:00
|
|
|
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
down_write(&snap->lock);
|
2009-12-10 23:51:53 +00:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!snap->valid)
|
2009-04-02 19:55:34 +01:00
|
|
|
DMEMIT("Invalid");
|
2009-12-10 23:52:35 +00:00
|
|
|
else if (snap->merge_failed)
|
|
|
|
DMEMIT("Merge failed");
|
2015-06-21 16:31:33 -04:00
|
|
|
else if (snap->snapshot_overflowed)
|
|
|
|
DMEMIT("Overflow");
|
2005-04-16 15:20:36 -07:00
|
|
|
else {
|
2009-12-10 23:52:11 +00:00
|
|
|
if (snap->store->type->usage) {
|
|
|
|
sector_t total_sectors, sectors_allocated,
|
|
|
|
metadata_sectors;
|
|
|
|
snap->store->type->usage(snap->store,
|
|
|
|
&total_sectors,
|
|
|
|
§ors_allocated,
|
|
|
|
&metadata_sectors);
|
|
|
|
DMEMIT("%llu/%llu %llu",
|
|
|
|
(unsigned long long)sectors_allocated,
|
|
|
|
(unsigned long long)total_sectors,
|
|
|
|
(unsigned long long)metadata_sectors);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
else
|
2009-04-02 19:55:34 +01:00
|
|
|
DMEMIT("Unknown");
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2009-12-10 23:51:53 +00:00
|
|
|
|
dm snapshot: Replace mutex with rw semaphore
dm-snapshot uses a single mutex to serialize every access to the
snapshot state. This includes all accesses to the complete and pending
exception tables, which occur at every origin write, every snapshot
read/write and every exception completion.
The lock statistics indicate that this mutex is a bottleneck (average
wait time ~480 usecs for 8 processes doing random 4K writes to the
origin device) preventing dm-snapshot to scale as the number of threads
doing IO increases.
The major contention points are __origin_write()/snapshot_map() and
pending_complete(), i.e., the submission and completion of pending
exceptions.
Replace this mutex with a rw semaphore.
We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex
instead of rw_semaphore") and together with the next two patches we
substitute the single mutex with a fine-grained locking scheme, where we
use a read-write semaphore to protect the mostly read fields of the
snapshot structure, e.g., valid, active, etc., and per-bucket bit
spinlocks to protect accesses to the complete and pending exception
tables.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:56 +02:00
|
|
|
up_write(&snap->lock);
|
2009-12-10 23:51:53 +00:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
case STATUSTYPE_TABLE:
|
|
|
|
/*
|
|
|
|
* kdevname returns a static pointer so we need
|
|
|
|
* to make private copies if the output is to
|
|
|
|
* make sense.
|
|
|
|
*/
|
2009-12-10 23:52:12 +00:00
|
|
|
DMEMIT("%s %s", snap->origin->name, snap->cow->name);
|
2019-06-19 17:05:54 -04:00
|
|
|
sz += snap->store->type->status(snap->store, type, result + sz,
|
|
|
|
maxlen - sz);
|
|
|
|
num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin;
|
|
|
|
if (num_features) {
|
|
|
|
DMEMIT(" %u", num_features);
|
|
|
|
if (snap->discard_zeroes_cow)
|
|
|
|
DMEMIT(" discard_zeroes_cow");
|
|
|
|
if (snap->discard_passdown_origin)
|
|
|
|
DMEMIT(" discard_passdown_origin");
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-09-04 20:40:19 +01:00
|
|
|
static int snapshot_iterate_devices(struct dm_target *ti,
|
|
|
|
iterate_devices_callout_fn fn, void *data)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *snap = ti->private;
|
2010-08-12 04:13:50 +01:00
|
|
|
int r;
|
|
|
|
|
|
|
|
r = fn(ti, snap->origin, 0, ti->len, data);
|
2009-09-04 20:40:19 +01:00
|
|
|
|
2010-08-12 04:13:50 +01:00
|
|
|
if (!r)
|
|
|
|
r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data);
|
|
|
|
|
|
|
|
return r;
|
2009-09-04 20:40:19 +01:00
|
|
|
}
|
|
|
|
|
2019-06-19 17:05:54 -04:00
|
|
|
static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
|
|
|
|
{
|
|
|
|
struct dm_snapshot *snap = ti->private;
|
|
|
|
|
|
|
|
if (snap->discard_zeroes_cow) {
|
|
|
|
struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
|
|
|
|
|
2019-07-17 11:12:30 -04:00
|
|
|
down_read(&_origins_lock);
|
|
|
|
|
2019-06-19 17:05:54 -04:00
|
|
|
(void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
|
|
|
|
if (snap_src && snap_dest)
|
|
|
|
snap = snap_src;
|
|
|
|
|
|
|
|
/* All discards are split on chunk_size boundary */
|
|
|
|
limits->discard_granularity = snap->store->chunk_size;
|
|
|
|
limits->max_discard_sectors = snap->store->chunk_size;
|
2019-07-17 11:12:30 -04:00
|
|
|
|
|
|
|
up_read(&_origins_lock);
|
2019-06-19 17:05:54 -04:00
|
|
|
}
|
|
|
|
}
|
2009-09-04 20:40:19 +01:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*-----------------------------------------------------------------
|
|
|
|
* Origin methods
|
|
|
|
*---------------------------------------------------------------*/
|
2009-12-10 23:52:28 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
|
|
|
|
* supplied bio was ignored. The caller may submit it immediately.
|
|
|
|
* (No remapping actually occurs as the origin is always a direct linear
|
|
|
|
* map.)
|
|
|
|
*
|
|
|
|
* If further exceptions are required, DM_MAPIO_SUBMITTED is returned
|
|
|
|
* and any supplied bio is added to a list to be submitted once all
|
|
|
|
* the necessary exceptions exist.
|
|
|
|
*/
|
|
|
|
static int __origin_write(struct list_head *snapshots, sector_t sector,
|
|
|
|
struct bio *bio)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2009-12-10 23:52:30 +00:00
|
|
|
int r = DM_MAPIO_REMAPPED;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct dm_snapshot *snap;
|
2009-12-10 23:52:10 +00:00
|
|
|
struct dm_exception *e;
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
struct dm_snap_pending_exception *pe, *pe2;
|
2009-12-10 23:52:30 +00:00
|
|
|
struct dm_snap_pending_exception *pe_to_start_now = NULL;
|
|
|
|
struct dm_snap_pending_exception *pe_to_start_last = NULL;
|
2019-03-17 14:22:57 +02:00
|
|
|
struct dm_exception_table_lock lock;
|
2005-04-16 15:20:36 -07:00
|
|
|
chunk_t chunk;
|
|
|
|
|
|
|
|
/* Do all the snapshots on this origin */
|
|
|
|
list_for_each_entry (snap, snapshots, list) {
|
2009-12-10 23:52:31 +00:00
|
|
|
/*
|
|
|
|
* Don't make new exceptions in a merging snapshot
|
|
|
|
* because it has effectively been deleted
|
|
|
|
*/
|
|
|
|
if (dm_target_is_snapshot_merge(snap->ti))
|
|
|
|
continue;
|
|
|
|
|
2005-07-12 15:53:05 -07:00
|
|
|
/* Nothing to do if writing beyond end of snapshot */
|
2009-12-10 23:52:28 +00:00
|
|
|
if (sector >= dm_table_get_size(snap->ti->table))
|
2019-03-17 14:22:57 +02:00
|
|
|
continue;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Remember, different snapshots can have
|
|
|
|
* different chunk sizes.
|
|
|
|
*/
|
2009-12-10 23:52:28 +00:00
|
|
|
chunk = sector_to_chunk(snap->store, sector);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock_init(snap, chunk, &lock);
|
|
|
|
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
down_read(&snap->lock);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock(&lock);
|
|
|
|
|
|
|
|
/* Only deal with valid and active snapshots */
|
|
|
|
if (!snap->valid || !snap->active)
|
|
|
|
goto next_snapshot;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-04-02 19:55:25 +01:00
|
|
|
pe = __lookup_pending_exception(snap, chunk);
|
2006-03-27 01:17:45 -08:00
|
|
|
if (!pe) {
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
/*
|
|
|
|
* Check exception table to see if block is already
|
|
|
|
* remapped in this snapshot and trigger an exception
|
|
|
|
* if not.
|
|
|
|
*/
|
|
|
|
e = dm_lookup_exception(&snap->complete, chunk);
|
|
|
|
if (e)
|
|
|
|
goto next_snapshot;
|
|
|
|
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
2009-04-02 19:55:25 +01:00
|
|
|
pe = alloc_pending_exception(snap);
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_lock(&lock);
|
2009-04-02 19:55:25 +01:00
|
|
|
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
pe2 = __lookup_pending_exception(snap, chunk);
|
|
|
|
|
|
|
|
if (!pe2) {
|
|
|
|
e = dm_lookup_exception(&snap->complete, chunk);
|
|
|
|
if (e) {
|
|
|
|
free_pending_exception(pe);
|
|
|
|
goto next_snapshot;
|
|
|
|
}
|
|
|
|
|
|
|
|
pe = __insert_pending_exception(snap, pe, chunk);
|
|
|
|
if (!pe) {
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
up_read(&snap->lock);
|
2019-03-17 14:22:57 +02:00
|
|
|
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
invalidate_snapshot(snap, -ENOMEM);
|
2019-03-17 14:22:57 +02:00
|
|
|
continue;
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
}
|
|
|
|
} else {
|
2009-04-02 19:55:26 +01:00
|
|
|
free_pending_exception(pe);
|
dm snapshot: Don't sleep holding the snapshot lock
When completing a pending exception, pending_complete() waits for all
conflicting reads to drain, before inserting the final, completed
exception. Conflicting reads are snapshot reads redirected to the
origin, because the relevant chunk is not remapped to the COW device the
moment we receive the read.
The completed exception must be inserted into the exception table after
all conflicting reads drain to ensure snapshot reads don't return
corrupted data. This is required because inserting the completed
exception into the exception table signals that the relevant chunk is
remapped and both origin writes and snapshot merging will now overwrite
the chunk in origin.
This wait is done holding the snapshot lock to ensure that
pending_complete() doesn't starve if new snapshot reads keep coming for
this chunk.
In preparation for the next commit, where we use a spinlock instead of a
mutex to protect the exception tables, we remove the need for holding
the lock while waiting for conflicting reads to drain.
We achieve this in two steps:
1. pending_complete() inserts the completed exception before waiting for
conflicting reads to drain and removes the pending exception after
all conflicting reads drain.
This ensures that new snapshot reads will be redirected to the COW
device, instead of the origin, and thus pending_complete() will not
starve. Moreover, we use the existence of both a completed and
a pending exception to signify that the COW is done but there are
conflicting reads in flight.
2. In __origin_write() we check first if there is a pending exception
and then if there is a completed exception. If there is a pending
exception any submitted BIO is delayed on the pe->origin_bios list and
DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the
origin nor snapshot merging can overwrite the origin chunk, until all
conflicting reads drain, and thus snapshot reads will not return
corrupted data.
Summarizing, we now have the following possible combinations of pending
and completed exceptions for a chunk, along with their meaning:
A. No exceptions exist: The chunk has not been remapped yet.
B. Only a pending exception exists: The chunk is currently being copied
to the COW device.
C. Both a pending and a completed exception exist: COW for this chunk
has completed but there are snapshot reads in flight which had been
redirected to the origin before the chunk was remapped.
D. Only the completed exception exists: COW has been completed and there
are no conflicting reads in flight.
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:55 +02:00
|
|
|
pe = pe2;
|
2009-04-02 19:55:25 +01:00
|
|
|
}
|
2006-03-27 01:17:45 -08:00
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:30 +00:00
|
|
|
r = DM_MAPIO_SUBMITTED;
|
2006-03-27 01:17:45 -08:00
|
|
|
|
2009-12-10 23:52:30 +00:00
|
|
|
/*
|
|
|
|
* If an origin bio was supplied, queue it to wait for the
|
|
|
|
* completion of this exception, and start this one last,
|
|
|
|
* at the end of the function.
|
|
|
|
*/
|
|
|
|
if (bio) {
|
|
|
|
bio_list_add(&pe->origin_bios, bio);
|
|
|
|
bio = NULL;
|
2006-03-27 01:17:45 -08:00
|
|
|
|
2009-12-10 23:52:30 +00:00
|
|
|
if (!pe->started) {
|
|
|
|
pe->started = 1;
|
|
|
|
pe_to_start_last = pe;
|
|
|
|
}
|
2006-03-27 01:17:45 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!pe->started) {
|
|
|
|
pe->started = 1;
|
2009-12-10 23:52:30 +00:00
|
|
|
pe_to_start_now = pe;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2011-08-02 12:32:03 +01:00
|
|
|
next_snapshot:
|
2019-03-17 14:22:57 +02:00
|
|
|
dm_exception_table_unlock(&lock);
|
dm snapshot: Use fine-grained locking scheme
Substitute the global locking scheme with a fine grained one, employing
the read-write semaphore and the scalable exception tables with
per-bucket locks introduced by the previous two commits.
Summarizing, we now use a read-write semaphore to protect the mostly
read fields of the snapshot structure, e.g., valid, active, etc., and
per-bucket bit spinlocks to protect accesses to the complete and pending
exception tables.
Finally, we use an extra spinlock (pe_allocation_lock) to serialize the
allocation of new exceptions by the exception store. This allocation is
really fast, so the extra spinlock doesn't hurt the performance.
This scheme allows dm-snapshot to scale better, resulting in increased
IOPS and reduced latency.
Following are some benchmark results using the null_blk device:
modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1
* Benchmark fio_origin_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to origin device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 57708 | 66421 |
| 2 | 63415 | 77589 |
| 4 | 67276 | 98839 |
| 8 | 60564 | 109258 |
+--------------+-------------+------------+
* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
suite [1] (direct IO, random 4K writes to origin device, IO engine
psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 16.25 | 13.27 |
| 2 | 31.65 | 25.08 |
| 4 | 55.28 | 41.08 |
| 8 | 121.47 | 74.44 |
+--------------+-----------------------+----------------------+
* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine libaio):
+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1 | 72593 | 84938 |
| 2 | 97379 | 134973 |
| 4 | 90610 | 143077 |
| 8 | 90537 | 180085 |
+--------------+-------------+------------+
* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper
test suite [1] (direct IO, random 4K writes to snapshot device, IO
engine psync):
+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1 | 12.53 | 10.6 |
| 2 | 19.78 | 14.89 |
| 4 | 40.37 | 23.47 |
| 8 | 89.32 | 48.48 |
+--------------+-----------------------+----------------------+
[1] https://github.com/jthornber/device-mapper-test-suite
Co-developed-by: Ilias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-03-17 14:22:58 +02:00
|
|
|
up_read(&snap->lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-12-10 23:52:30 +00:00
|
|
|
if (pe_to_start_now) {
|
|
|
|
start_copy(pe_to_start_now);
|
|
|
|
pe_to_start_now = NULL;
|
|
|
|
}
|
2006-03-27 01:17:44 -08:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2009-12-10 23:52:30 +00:00
|
|
|
* Submit the exception against which the bio is queued last,
|
|
|
|
* to give the other exceptions a head start.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
2009-12-10 23:52:30 +00:00
|
|
|
if (pe_to_start_last)
|
|
|
|
start_copy(pe_to_start_last);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called on a write from the origin driver.
|
|
|
|
*/
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct origin *o;
|
2006-12-08 02:41:06 -08:00
|
|
|
int r = DM_MAPIO_REMAPPED;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
again:
|
2005-04-16 15:20:36 -07:00
|
|
|
down_read(&_origins_lock);
|
|
|
|
o = __lookup_origin(origin->bdev);
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
if (o) {
|
|
|
|
if (limit) {
|
|
|
|
struct dm_snapshot *s;
|
|
|
|
list_for_each_entry(s, &o->snapshots, list)
|
|
|
|
if (unlikely(!wait_for_in_progress(s, true)))
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
2013-10-11 15:44:27 -07:00
|
|
|
r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
up_read(&_origins_lock);
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:34 +00:00
|
|
|
/*
|
|
|
|
* Trigger exceptions in all non-merging snapshots.
|
|
|
|
*
|
|
|
|
* The chunk size of the merging snapshot may be larger than the chunk
|
|
|
|
* size of some other snapshot so we may need to reallocate multiple
|
|
|
|
* chunks in other snapshots.
|
|
|
|
*
|
|
|
|
* We scan all the overlapping exceptions in the other snapshots.
|
|
|
|
* Returns 1 if anything was reallocated and must be waited for,
|
|
|
|
* otherwise returns 0.
|
|
|
|
*
|
|
|
|
* size must be a multiple of merging_snap's chunk_size.
|
|
|
|
*/
|
|
|
|
static int origin_write_extent(struct dm_snapshot *merging_snap,
|
|
|
|
sector_t sector, unsigned size)
|
|
|
|
{
|
|
|
|
int must_wait = 0;
|
|
|
|
sector_t n;
|
|
|
|
struct origin *o;
|
|
|
|
|
|
|
|
/*
|
2012-07-27 15:08:00 +01:00
|
|
|
* The origin's __minimum_chunk_size() got stored in max_io_len
|
2009-12-10 23:52:34 +00:00
|
|
|
* by snapshot_merge_resume().
|
|
|
|
*/
|
|
|
|
down_read(&_origins_lock);
|
|
|
|
o = __lookup_origin(merging_snap->origin->bdev);
|
2012-07-27 15:08:00 +01:00
|
|
|
for (n = 0; n < size; n += merging_snap->ti->max_io_len)
|
2009-12-10 23:52:34 +00:00
|
|
|
if (__origin_write(&o->snapshots, sector + n, NULL) ==
|
|
|
|
DM_MAPIO_SUBMITTED)
|
|
|
|
must_wait = 1;
|
|
|
|
up_read(&_origins_lock);
|
|
|
|
|
|
|
|
return must_wait;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Origin: maps a linear range of a device, with hooks for snapshotting.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Construct an origin mapping: <dev_path>
|
|
|
|
* The context for an origin is merely a 'struct dm_dev *'
|
|
|
|
* pointing to the real device.
|
|
|
|
*/
|
|
|
|
static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
|
|
|
{
|
|
|
|
int r;
|
2014-03-14 18:42:12 -04:00
|
|
|
struct dm_origin *o;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
if (argc != 1) {
|
2006-06-26 00:27:35 -07:00
|
|
|
ti->error = "origin: incorrect number of arguments";
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2014-03-14 18:42:12 -04:00
|
|
|
o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
|
|
|
|
if (!o) {
|
|
|
|
ti->error = "Cannot allocate private origin structure";
|
|
|
|
r = -ENOMEM;
|
|
|
|
goto bad_alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (r) {
|
|
|
|
ti->error = "Cannot get target device";
|
2014-03-14 18:42:12 -04:00
|
|
|
goto bad_open;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2015-02-26 11:40:35 -05:00
|
|
|
o->ti = ti;
|
2014-03-14 18:42:12 -04:00
|
|
|
ti->private = o;
|
2013-03-01 22:45:47 +00:00
|
|
|
ti->num_flush_bios = 1;
|
2009-06-22 10:12:25 +01:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
2014-03-14 18:42:12 -04:00
|
|
|
|
|
|
|
bad_open:
|
|
|
|
kfree(o);
|
|
|
|
bad_alloc:
|
|
|
|
return r;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void origin_dtr(struct dm_target *ti)
|
|
|
|
{
|
2014-03-14 18:42:12 -04:00
|
|
|
struct dm_origin *o = ti->private;
|
2015-02-26 11:40:35 -05:00
|
|
|
|
2014-03-14 18:42:12 -04:00
|
|
|
dm_put_device(ti, o->dev);
|
|
|
|
kfree(o);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2012-12-21 20:23:41 +00:00
|
|
|
static int origin_map(struct dm_target *ti, struct bio *bio)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2014-03-14 18:42:12 -04:00
|
|
|
struct dm_origin *o = ti->private;
|
2014-03-14 18:43:07 -04:00
|
|
|
unsigned available_sectors;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2017-08-23 19:10:32 +02:00
|
|
|
bio_set_dev(bio, o->dev->bdev);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2016-08-05 15:35:16 -06:00
|
|
|
if (unlikely(bio->bi_opf & REQ_PREFLUSH))
|
2009-06-22 10:12:25 +01:00
|
|
|
return DM_MAPIO_REMAPPED;
|
|
|
|
|
2016-07-19 11:28:41 +02:00
|
|
|
if (bio_data_dir(bio) != WRITE)
|
2009-06-22 10:12:25 +01:00
|
|
|
return DM_MAPIO_REMAPPED;
|
|
|
|
|
2014-03-14 18:43:07 -04:00
|
|
|
available_sectors = o->split_boundary -
|
|
|
|
((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
|
|
|
|
|
|
|
|
if (bio_sectors(bio) > available_sectors)
|
|
|
|
dm_accept_partial_bio(bio, available_sectors);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/* Only tell snapshots if this is a write */
|
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d98fb517a ("dm snapshot: Fix excessive memory usage and
workqueue stalls") introduced a semaphore to limit the maximum number of
in-flight kcopyd (COW) jobs.
The implementation of this throttling mechanism is prone to a deadlock:
1. One or more threads write to the origin device causing COW, which is
performed by kcopyd.
2. At some point some of these threads might reach the s->cow_count
semaphore limit and block in down(&s->cow_count), holding a read lock
on _origins_lock.
3. Someone tries to acquire a write lock on _origins_lock, e.g.,
snapshot_ctr(), which blocks because the threads at step (2) already
hold a read lock on it.
4. A COW operation completes and kcopyd runs dm-snapshot's completion
callback, which ends up calling pending_complete().
pending_complete() tries to resubmit any deferred origin bios. This
requires acquiring a read lock on _origins_lock, which blocks.
This happens because the read-write semaphore implementation gives
priority to writers, meaning that as soon as a writer tries to enter
the critical section, no readers will be allowed in, until all
writers have completed their work.
So, pending_complete() waits for the writer at step (3) to acquire
and release the lock. This writer waits for the readers at step (2)
to release the read lock and those readers wait for
pending_complete() (the kcopyd thread) to signal the s->cow_count
semaphore: DEADLOCK.
The above was thoroughly analyzed and documented by Nikos Tsironis as
part of his initial proposal for fixing this deadlock, see:
https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html
Fix this deadlock by reworking COW throttling so that it waits without
holding any locks. Add a variable 'in_progress' that counts how many
kcopyd jobs are running. A function wait_for_in_progress() will sleep if
'in_progress' is over the limit. It drops _origins_lock in order to
avoid the deadlock.
Reported-by: Guruswamy Basavaiah <guru2018@gmail.com>
Reported-by: Nikos Tsironis <ntsironis@arrikto.com>
Reviewed-by: Nikos Tsironis <ntsironis@arrikto.com>
Tested-by: Nikos Tsironis <ntsironis@arrikto.com>
Fixes: 721b1d98fb51 ("dm snapshot: Fix excessive memory usage and workqueue stalls")
Cc: stable@vger.kernel.org # v5.0+
Depends-on: 4a3f111a73a8c ("dm snapshot: introduce account_start_copy() and account_end_copy()")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-10-02 06:15:53 -04:00
|
|
|
return do_origin(o->dev, bio, true);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2012-07-27 15:08:00 +01:00
|
|
|
* Set the target "max_io_len" field to the minimum of all the snapshots'
|
2005-04-16 15:20:36 -07:00
|
|
|
* chunk sizes.
|
|
|
|
*/
|
|
|
|
static void origin_resume(struct dm_target *ti)
|
|
|
|
{
|
2014-03-14 18:42:12 -04:00
|
|
|
struct dm_origin *o = ti->private;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2014-03-14 18:43:07 -04:00
|
|
|
o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
|
2015-02-26 11:40:35 -05:00
|
|
|
|
|
|
|
down_write(&_origins_lock);
|
|
|
|
__insert_dm_origin(o);
|
|
|
|
up_write(&_origins_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void origin_postsuspend(struct dm_target *ti)
|
|
|
|
{
|
|
|
|
struct dm_origin *o = ti->private;
|
|
|
|
|
|
|
|
down_write(&_origins_lock);
|
|
|
|
__remove_dm_origin(o);
|
|
|
|
up_write(&_origins_lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2013-03-01 22:45:44 +00:00
|
|
|
static void origin_status(struct dm_target *ti, status_type_t type,
|
|
|
|
unsigned status_flags, char *result, unsigned maxlen)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2014-03-14 18:42:12 -04:00
|
|
|
struct dm_origin *o = ti->private;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case STATUSTYPE_INFO:
|
|
|
|
result[0] = '\0';
|
|
|
|
break;
|
|
|
|
|
|
|
|
case STATUSTYPE_TABLE:
|
2014-03-14 18:42:12 -04:00
|
|
|
snprintf(result, maxlen, "%s", o->dev->name);
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-09-04 20:40:19 +01:00
|
|
|
static int origin_iterate_devices(struct dm_target *ti,
|
|
|
|
iterate_devices_callout_fn fn, void *data)
|
|
|
|
{
|
2014-03-14 18:42:12 -04:00
|
|
|
struct dm_origin *o = ti->private;
|
2009-09-04 20:40:19 +01:00
|
|
|
|
2014-03-14 18:42:12 -04:00
|
|
|
return fn(ti, o->dev, 0, ti->len, data);
|
2009-09-04 20:40:19 +01:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
static struct target_type origin_target = {
|
|
|
|
.name = "snapshot-origin",
|
2015-02-26 11:40:35 -05:00
|
|
|
.version = {1, 9, 0},
|
2005-04-16 15:20:36 -07:00
|
|
|
.module = THIS_MODULE,
|
|
|
|
.ctr = origin_ctr,
|
|
|
|
.dtr = origin_dtr,
|
|
|
|
.map = origin_map,
|
|
|
|
.resume = origin_resume,
|
2015-02-26 11:40:35 -05:00
|
|
|
.postsuspend = origin_postsuspend,
|
2005-04-16 15:20:36 -07:00
|
|
|
.status = origin_status,
|
2009-09-04 20:40:19 +01:00
|
|
|
.iterate_devices = origin_iterate_devices,
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
static struct target_type snapshot_target = {
|
|
|
|
.name = "snapshot",
|
2019-06-19 17:05:54 -04:00
|
|
|
.version = {1, 16, 0},
|
2005-04-16 15:20:36 -07:00
|
|
|
.module = THIS_MODULE,
|
|
|
|
.ctr = snapshot_ctr,
|
|
|
|
.dtr = snapshot_dtr,
|
|
|
|
.map = snapshot_map,
|
2008-07-21 12:00:32 +01:00
|
|
|
.end_io = snapshot_end_io,
|
2009-12-10 23:52:24 +00:00
|
|
|
.preresume = snapshot_preresume,
|
2005-04-16 15:20:36 -07:00
|
|
|
.resume = snapshot_resume,
|
|
|
|
.status = snapshot_status,
|
2009-09-04 20:40:19 +01:00
|
|
|
.iterate_devices = snapshot_iterate_devices,
|
2019-06-19 17:05:54 -04:00
|
|
|
.io_hints = snapshot_io_hints,
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
2009-12-10 23:52:30 +00:00
|
|
|
static struct target_type merge_target = {
|
|
|
|
.name = dm_snapshot_merge_target_name,
|
2019-06-19 17:05:54 -04:00
|
|
|
.version = {1, 5, 0},
|
2009-12-10 23:52:30 +00:00
|
|
|
.module = THIS_MODULE,
|
|
|
|
.ctr = snapshot_ctr,
|
|
|
|
.dtr = snapshot_dtr,
|
2009-12-10 23:52:31 +00:00
|
|
|
.map = snapshot_merge_map,
|
2009-12-10 23:52:30 +00:00
|
|
|
.end_io = snapshot_end_io,
|
2009-12-10 23:52:32 +00:00
|
|
|
.presuspend = snapshot_merge_presuspend,
|
2009-12-10 23:52:30 +00:00
|
|
|
.preresume = snapshot_preresume,
|
2009-12-10 23:52:32 +00:00
|
|
|
.resume = snapshot_merge_resume,
|
2009-12-10 23:52:30 +00:00
|
|
|
.status = snapshot_status,
|
|
|
|
.iterate_devices = snapshot_iterate_devices,
|
2019-06-19 17:05:54 -04:00
|
|
|
.io_hints = snapshot_io_hints,
|
2009-12-10 23:52:30 +00:00
|
|
|
};
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
static int __init dm_snapshot_init(void)
|
|
|
|
{
|
|
|
|
int r;
|
|
|
|
|
2009-01-06 03:05:17 +00:00
|
|
|
r = dm_exception_store_init();
|
|
|
|
if (r) {
|
|
|
|
DMERR("Failed to initialize exception stores");
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
r = init_origin_hash();
|
|
|
|
if (r) {
|
|
|
|
DMERR("init_origin_hash failed.");
|
2009-12-10 23:52:30 +00:00
|
|
|
goto bad_origin_hash;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2009-12-10 23:52:10 +00:00
|
|
|
exception_cache = KMEM_CACHE(dm_exception, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!exception_cache) {
|
|
|
|
DMERR("Couldn't create exception cache.");
|
|
|
|
r = -ENOMEM;
|
2009-12-10 23:52:30 +00:00
|
|
|
goto bad_exception_cache;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2007-07-12 17:26:32 +01:00
|
|
|
pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!pending_cache) {
|
|
|
|
DMERR("Couldn't create pending cache.");
|
|
|
|
r = -ENOMEM;
|
2009-12-10 23:52:30 +00:00
|
|
|
goto bad_pending_cache;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2017-11-25 01:43:50 +08:00
|
|
|
r = dm_register_target(&snapshot_target);
|
|
|
|
if (r < 0) {
|
|
|
|
DMERR("snapshot target register failed %d", r);
|
|
|
|
goto bad_register_snapshot_target;
|
|
|
|
}
|
|
|
|
|
|
|
|
r = dm_register_target(&origin_target);
|
|
|
|
if (r < 0) {
|
|
|
|
DMERR("Origin target register failed %d", r);
|
|
|
|
goto bad_register_origin_target;
|
|
|
|
}
|
|
|
|
|
|
|
|
r = dm_register_target(&merge_target);
|
|
|
|
if (r < 0) {
|
|
|
|
DMERR("Merge target register failed %d", r);
|
|
|
|
goto bad_register_merge_target;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
|
|
|
|
2009-12-10 23:52:30 +00:00
|
|
|
bad_register_merge_target:
|
2005-04-16 15:20:36 -07:00
|
|
|
dm_unregister_target(&origin_target);
|
2009-12-10 23:52:30 +00:00
|
|
|
bad_register_origin_target:
|
2005-04-16 15:20:36 -07:00
|
|
|
dm_unregister_target(&snapshot_target);
|
2009-10-16 23:18:14 +01:00
|
|
|
bad_register_snapshot_target:
|
2017-11-25 01:43:50 +08:00
|
|
|
kmem_cache_destroy(pending_cache);
|
|
|
|
bad_pending_cache:
|
|
|
|
kmem_cache_destroy(exception_cache);
|
|
|
|
bad_exception_cache:
|
|
|
|
exit_origin_hash();
|
|
|
|
bad_origin_hash:
|
2009-10-16 23:18:14 +01:00
|
|
|
dm_exception_store_exit();
|
2009-12-10 23:52:30 +00:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __exit dm_snapshot_exit(void)
|
|
|
|
{
|
2009-01-06 03:04:58 +00:00
|
|
|
dm_unregister_target(&snapshot_target);
|
|
|
|
dm_unregister_target(&origin_target);
|
2009-12-10 23:52:30 +00:00
|
|
|
dm_unregister_target(&merge_target);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
exit_origin_hash();
|
|
|
|
kmem_cache_destroy(pending_cache);
|
|
|
|
kmem_cache_destroy(exception_cache);
|
2009-01-06 03:05:17 +00:00
|
|
|
|
|
|
|
dm_exception_store_exit();
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Module hooks */
|
|
|
|
module_init(dm_snapshot_init);
|
|
|
|
module_exit(dm_snapshot_exit);
|
|
|
|
|
|
|
|
MODULE_DESCRIPTION(DM_NAME " snapshot target");
|
|
|
|
MODULE_AUTHOR("Joe Thornber");
|
|
|
|
MODULE_LICENSE("GPL");
|
2013-03-01 22:45:47 +00:00
|
|
|
MODULE_ALIAS("dm-snapshot-origin");
|
|
|
|
MODULE_ALIAS("dm-snapshot-merge");
|