mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-04 04:04:19 +00:00
for-4.19/block-20180812
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAltwvasQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpv65EACTq5gSLnJBI6ZPr1RAHruVDnjfzO2Veitl tUtjm0XfWmnEiwQ3dYvnyhy99xbyaG3900d9BClCTlH6xaUdSiQkDpcKG/R2F36J 5mZitYukQcpFAQJWF8YKsTTE7JPl4VglCIDqYiC4+C3rOSVi8lrKn2qp4J4MMCFn thRg3jCcq7c5s9Eigsop1pXWQSasubkXfk55Krcp4oybKYpYRKXXf74Mj14QAbwJ QHN3VisyAUWoBRg7UQZo1Npe2oPk6bbnJypnjf8M0M2EnlvddEkIlHob91sodka8 6p4APOEu5cbyXOBCAQsw/koff14mb8aEadqeQA68WvXfIdX9ZjfxCX0OoC3sBEXk yqJhZ0C980AM13zIBD8ejv4uasGcPca8W+47mE5P8sRiI++5kBsFWDZPCtUBna0X 2Kh24NsmEya9XRR5vsB84dsIPQ3tLMkxg/IgQRVDaSnfJz0c/+zm54xDyKRaFT4l 5iERk2WSkm9+8jNfVmWG0edrv6nRAXjpGwFfOCPh6/LCSCi4xQRULYN7sVzsX8ZK FRjt24HftBI8mJbh4BtweJvg+ppVe1gAk3IO3HvxAQhv29Hz+uvFYe9kL+3N8LJA Qosr9n9O4+wKYizJcDnw+5iPqCHfAwOm9th4pyedR+R7SmNcP3yNC8AbbheNBiF5 Zolos5H+JA== =b9ib -----END PGP SIGNATURE----- Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block Pull block updates from Jens Axboe: "First pull request for this merge window, there will also be a followup request with some stragglers. This pull request contains: - Fix for a thundering herd issue in the wbt block code (Anchal Agarwal) - A few NVMe pull requests: * Improved tracepoints (Keith) * Larger inline data support for RDMA (Steve Wise) * RDMA setup/teardown fixes (Sagi) * Effects log support for NVMe target (Chaitanya Kulkarni) * Buffered IO support for NVMe target (Chaitanya Kulkarni) * TP4004 (ANA) support (Christoph) * Various NVMe fixes - Block io-latency controller support. Much needed support for properly containing block devices. 
(Josef) - Series improving how we handle sense information on the stack (Kees) - Lightnvm fixes and updates/improvements (Mathias/Javier et al) - Zoned device support for null_blk (Matias) - AIX partition fixes (Mauricio Faria de Oliveira) - DIF checksum code made generic (Max Gurtovoy) - Add support for discard in iostats (Michael Callahan / Tejun) - Set of updates for BFQ (Paolo) - Removal of async write support for bsg (Christoph) - Bio page dirtying and clone fixups (Christoph) - Set of bcache fix/changes (via Coly) - Series improving blk-mq queue setup/teardown speed (Ming) - Series improving merging performance on blk-mq (Ming) - Lots of other fixes and cleanups from a slew of folks" * tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block: (190 commits) blkcg: Make blkg_root_lookup() work for queues in bypass mode bcache: fix error setting writeback_rate through sysfs interface null_blk: add lock drop/acquire annotation Blk-throttle: reduce tail io latency when iops limit is enforced block: paride: pd: mark expected switch fall-throughs block: Ensure that a request queue is dissociated from the cgroup controller block: Introduce blk_exit_queue() blkcg: Introduce blkg_root_lookup() block: Remove two superfluous #include directives blk-mq: count the hctx as active before allocating tag block: bvec_nr_vecs() returns value for wrong slab bcache: trivial - remove tailing backslash in macro BTREE_FLAG bcache: make the pr_err statement used for ENOENT only in sysfs_attatch section bcache: set max writeback rate when I/O request is idle bcache: add code comments for bset.c bcache: fix mistaken comments in request.c bcache: fix mistaken code comments in bcache.h bcache: add a comment in super.c bcache: avoid unncessary cache prefetch bch_btree_node_get() bcache: display rate debug parameters to 0 when writeback is not running ...
This commit is contained in:
commit
73ba2fb33c
@ -5,6 +5,7 @@ Description:
|
||||
The /proc/diskstats file displays the I/O statistics
|
||||
of block devices. Each line contains the following 14
|
||||
fields:
|
||||
|
||||
1 - major number
|
||||
2 - minor number
|
||||
3 - device name
|
||||
@ -19,4 +20,13 @@ Description:
|
||||
12 - I/Os currently in progress
|
||||
13 - time spent doing I/Os (ms)
|
||||
14 - weighted time spent doing I/Os (ms)
|
||||
|
||||
Kernel 4.18+ appends four more fields for discard
|
||||
tracking, putting the total at 18:
|
||||
|
||||
15 - discards completed successfully
|
||||
16 - discards merged
|
||||
17 - sectors discarded
|
||||
18 - time spent discarding (ms)
|
||||
|
||||
For more details refer to Documentation/iostats.txt
|
||||
|
@ -51,6 +51,9 @@ v1 is available under Documentation/cgroup-v1/.
|
||||
5-3. IO
|
||||
5-3-1. IO Interface Files
|
||||
5-3-2. Writeback
|
||||
5-3-3. IO Latency
|
||||
5-3-3-1. How IO Latency Throttling Works
|
||||
5-3-3-2. IO Latency Interface Files
|
||||
5-4. PID
|
||||
5-4-1. PID Interface Files
|
||||
5-5. Device
|
||||
@ -1314,17 +1317,19 @@ IO Interface Files
|
||||
Lines are keyed by $MAJ:$MIN device numbers and not ordered.
|
||||
The following nested keys are defined.
|
||||
|
||||
====== ===================
|
||||
====== =====================
|
||||
rbytes Bytes read
|
||||
wbytes Bytes written
|
||||
rios Number of read IOs
|
||||
wios Number of write IOs
|
||||
====== ===================
|
||||
dbytes Bytes discarded
|
||||
dios Number of discard IOs
|
||||
====== =====================
|
||||
|
||||
An example read output follows:
|
||||
|
||||
8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
|
||||
8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
|
||||
8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
|
||||
8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
|
||||
|
||||
io.weight
|
||||
A read-write flat-keyed file which exists on non-root cgroups.
|
||||
@ -1446,6 +1451,85 @@ writeback as follows.
|
||||
vm.dirty[_background]_ratio.
|
||||
|
||||
|
||||
IO Latency
|
||||
~~~~~~~~~~
|
||||
|
||||
This is a cgroup v2 controller for IO workload protection. You provide a group
|
||||
with a latency target, and if the average latency exceeds that target the
|
||||
controller will throttle any peers that have a lower latency target than the
|
||||
protected workload.
|
||||
|
||||
The limits are only applied at the peer level in the hierarchy. This means that
|
||||
in the diagram below, only groups A, B, and C will influence each other, and
|
||||
groups D and F will influence each other. Group G will influence nobody.
|
||||
|
||||
[root]
|
||||
/ | \
|
||||
A B C
|
||||
/ \ |
|
||||
D F G
|
||||
|
||||
|
||||
So the ideal way to configure this is to set io.latency in groups A, B, and C.
|
||||
Generally you do not want to set a value lower than the latency your device
|
||||
supports. Experiment to find the value that works best for your workload.
|
||||
Start at higher than the expected latency for your device and watch the
|
||||
avg_lat value in io.stat for your workload group to get an idea of the
|
||||
latency you see during normal operation. Use the avg_lat value as a basis for
|
||||
your real setting, setting at 10-15% higher than the value in io.stat.
|
||||
|
||||
How IO Latency Throttling Works
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
io.latency is work conserving; so as long as everybody is meeting their latency
|
||||
target the controller doesn't do anything. Once a group starts missing its
|
||||
target it begins throttling any peer group that has a higher target than itself.
|
||||
This throttling takes 2 forms:
|
||||
|
||||
- Queue depth throttling. This is the number of outstanding IOs a group is
|
||||
allowed to have. We will clamp down relatively quickly, starting at no limit
|
||||
and going all the way down to 1 IO at a time.
|
||||
|
||||
- Artificial delay induction. There are certain types of IO that cannot be
|
||||
throttled without possibly adversely affecting higher priority groups. This
|
||||
includes swapping and metadata IO. These types of IO are allowed to occur
|
||||
normally, however they are "charged" to the originating group. If the
|
||||
originating group is being throttled you will see the use_delay and delay
|
||||
fields in io.stat increase. The delay value is the number of microseconds
|
||||
being added to any process that runs in this group. Because this number can
|
||||
grow quite large if there is a lot of swapping or metadata IO occurring we
|
||||
limit the individual delay events to 1 second at a time.
|
||||
|
||||
Once the victimized group starts meeting its latency target again it will start
|
||||
unthrottling any peer groups that were throttled previously. If the victimized
|
||||
group simply stops doing IO the global counter will unthrottle appropriately.
|
||||
|
||||
IO Latency Interface Files
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
io.latency
|
||||
This takes a similar format as the other controllers.
|
||||
|
||||
"MAJOR:MINOR target=<target time in microseconds>"
|
||||
|
||||
io.stat
|
||||
If the controller is enabled you will see extra stats in io.stat in
|
||||
addition to the normal ones.
|
||||
|
||||
depth
|
||||
This is the current queue depth for the group.
|
||||
|
||||
avg_lat
|
||||
This is an exponential moving average with a decay rate of 1/exp
|
||||
bound by the sampling interval. The decay rate interval can be
|
||||
calculated by multiplying the win value in io.stat by the
|
||||
corresponding number of samples based on the win value.
|
||||
|
||||
win
|
||||
The sampling window size in milliseconds. This is the minimum
|
||||
duration of time between evaluation events. Windows only elapse
|
||||
with IO activity. Idle periods extend the most recent window.
|
||||
|
||||
PID
|
||||
---
|
||||
|
||||
|
@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0
|
||||
0: Tag set is not shared.
|
||||
1: Tag set shared between devices for blk-mq. Only makes sense with
|
||||
nr_devices > 1, otherwise there's no tag set to share.
|
||||
|
||||
zoned=[0/1]: Default: 0
|
||||
0: Block device is exposed as a random-access block device.
|
||||
1: Block device is exposed as a host-managed zoned block device.
|
||||
|
||||
zone_size=[MB]: Default: 256
|
||||
Per zone size when exposed as a zoned block device. Must be a power of two.
|
||||
|
@ -31,28 +31,32 @@ write ticks milliseconds total wait time for write requests
|
||||
in_flight requests number of I/Os currently in flight
|
||||
io_ticks milliseconds total time this block device has been active
|
||||
time_in_queue milliseconds total wait time for all requests
|
||||
discard I/Os requests number of discard I/Os processed
|
||||
discard merges requests number of discard I/Os merged with in-queue I/O
|
||||
discard sectors sectors number of sectors discarded
|
||||
discard ticks milliseconds total wait time for discard requests
|
||||
|
||||
read I/Os, write I/Os
|
||||
=====================
|
||||
read I/Os, write I/Os, discard I/Os
|
||||
===================================
|
||||
|
||||
These values increment when an I/O request completes.
|
||||
|
||||
read merges, write merges
|
||||
=========================
|
||||
read merges, write merges, discard merges
|
||||
=========================================
|
||||
|
||||
These values increment when an I/O request is merged with an
|
||||
already-queued I/O request.
|
||||
|
||||
read sectors, write sectors
|
||||
===========================
|
||||
read sectors, write sectors, discard sectors
|
||||
============================================
|
||||
|
||||
These values count the number of sectors read from or written to this
|
||||
block device. The "sectors" in question are the standard UNIX 512-byte
|
||||
sectors, not any device- or filesystem-specific block size. The
|
||||
counters are incremented when the I/O completes.
|
||||
These values count the number of sectors read from, written to, or
|
||||
discarded from this block device. The "sectors" in question are the
|
||||
standard UNIX 512-byte sectors, not any device- or filesystem-specific
|
||||
block size. The counters are incremented when the I/O completes.
|
||||
|
||||
read ticks, write ticks
|
||||
=======================
|
||||
read ticks, write ticks, discard ticks
|
||||
======================================
|
||||
|
||||
These values count the number of milliseconds that I/O requests have
|
||||
waited on this block device. If there are multiple I/O requests waiting,
|
||||
|
@ -31,6 +31,9 @@ Here are examples of these different formats::
|
||||
3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
|
||||
3 1 hda1 35486 38030 38030 38030
|
||||
|
||||
4.18+ diskstats:
|
||||
3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
|
||||
|
||||
On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
|
||||
a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
|
||||
|
||||
@ -101,6 +104,18 @@ Field 11 -- weighted # of milliseconds spent doing I/Os
|
||||
last update of this field. This can provide an easy measure of both
|
||||
I/O completion time and the backlog that may be accumulating.
|
||||
|
||||
Field 12 -- # of discards completed
|
||||
This is the total number of discards completed successfully.
|
||||
|
||||
Field 13 -- # of discards merged
|
||||
See the description of field 2
|
||||
|
||||
Field 14 -- # of sectors discarded
|
||||
This is the total number of sectors discarded successfully.
|
||||
|
||||
Field 15 -- # of milliseconds spent discarding
|
||||
This is the total number of milliseconds spent by all discards (as
|
||||
measured from __make_request() to end_that_request_last()).
|
||||
|
||||
To avoid introducing performance bottlenecks, no locks are held while
|
||||
modifying these counters. This implies that minor inaccuracies may be
|
||||
|
@ -149,6 +149,18 @@ config BLK_WBT
|
||||
dynamically on an algorithm loosely based on CoDel, factoring in
|
||||
the realtime performance of the disk.
|
||||
|
||||
config BLK_CGROUP_IOLATENCY
|
||||
bool "Enable support for latency based cgroup IO protection"
|
||||
depends on BLK_CGROUP=y
|
||||
default n
|
||||
---help---
|
||||
Enabling this option enables the .latency interface for IO throttling.
|
||||
The IO controller will attempt to maintain average IO latencies below
|
||||
the configured latency target, throttling anybody with a higher latency
|
||||
target than the victimized group.
|
||||
|
||||
Note, this is an experimental interface and could be changed someday.
|
||||
|
||||
config BLK_WBT_SQ
|
||||
bool "Single queue writeback throttling"
|
||||
default n
|
||||
@ -177,6 +189,10 @@ config BLK_DEBUG_FS
|
||||
Unless you are building a kernel for a tiny system, you should
|
||||
say Y here.
|
||||
|
||||
config BLK_DEBUG_FS_ZONED
|
||||
bool
|
||||
default BLK_DEBUG_FS && BLK_DEV_ZONED
|
||||
|
||||
config BLK_SED_OPAL
|
||||
bool "Logic for interfacing with Opal enabled SEDs"
|
||||
---help---
|
||||
|
@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
|
||||
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
|
||||
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
|
||||
genhd.o partition-generic.o ioprio.o \
|
||||
badblocks.o partitions/
|
||||
badblocks.o partitions/ blk-rq-qos.o
|
||||
|
||||
obj-$(CONFIG_BOUNCE) += bounce.o
|
||||
obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
|
||||
@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
|
||||
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
|
||||
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
|
||||
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
|
||||
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
|
||||
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
|
||||
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
|
||||
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
|
||||
@ -34,4 +35,5 @@ obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o
|
||||
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
|
||||
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
|
||||
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
|
||||
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
|
||||
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
|
||||
|
@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
|
||||
* The following function returns true if every queue must receive the
|
||||
* same share of the throughput (this condition is used when deciding
|
||||
* whether idling may be disabled, see the comments in the function
|
||||
* bfq_bfqq_may_idle()).
|
||||
* bfq_better_to_idle()).
|
||||
*
|
||||
* Such a scenario occurs when:
|
||||
* 1) all active queues have the same weight,
|
||||
@ -742,7 +742,8 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
* See the comments to the function bfq_weights_tree_add() for considerations
|
||||
* about overhead.
|
||||
*/
|
||||
void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
void __bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_entity *entity,
|
||||
struct rb_root *root)
|
||||
{
|
||||
if (!entity->weight_counter)
|
||||
@ -759,6 +760,43 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
entity->weight_counter = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Invoke __bfq_weights_tree_remove on bfqq and all its inactive
|
||||
* parent entities.
|
||||
*/
|
||||
void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq)
|
||||
{
|
||||
struct bfq_entity *entity = bfqq->entity.parent;
|
||||
|
||||
__bfq_weights_tree_remove(bfqd, &bfqq->entity,
|
||||
&bfqd->queue_weights_tree);
|
||||
|
||||
for_each_entity(entity) {
|
||||
struct bfq_sched_data *sd = entity->my_sched_data;
|
||||
|
||||
if (sd->next_in_service || sd->in_service_entity) {
|
||||
/*
|
||||
* entity is still active, because either
|
||||
* next_in_service or in_service_entity is not
|
||||
* NULL (see the comments on the definition of
|
||||
* next_in_service for details on why
|
||||
* in_service_entity must be checked too).
|
||||
*
|
||||
* As a consequence, the weight of entity is
|
||||
* not to be removed. In addition, if entity
|
||||
* is active, then its parent entities are
|
||||
* active as well, and thus their weights are
|
||||
* not to be removed either. In the end, this
|
||||
* loop must stop here.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
__bfq_weights_tree_remove(bfqd, entity,
|
||||
&bfqd->group_weights_tree);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return expired entry, or NULL to just start from scratch in rbtree.
|
||||
*/
|
||||
@ -1344,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
|
||||
* remain unchanged after such an expiration, and the
|
||||
* following statement therefore assigns to
|
||||
* entity->budget the remaining budget on such an
|
||||
* expiration. For clarity, entity->service is not
|
||||
* updated on expiration in any case, and, in normal
|
||||
* operation, is reset only when bfqq is selected for
|
||||
* service (see bfq_get_next_queue).
|
||||
* expiration.
|
||||
*/
|
||||
entity->budget = min_t(unsigned long,
|
||||
bfq_bfqq_budget_left(bfqq),
|
||||
bfqq->max_budget);
|
||||
|
||||
/*
|
||||
* At this point, we have used entity->service to get
|
||||
* the budget left (needed for updating
|
||||
* entity->budget). Thus we finally can, and have to,
|
||||
* reset entity->service. The latter must be reset
|
||||
* because bfqq would otherwise be charged again for
|
||||
* the service it has received during its previous
|
||||
* service slot(s).
|
||||
*/
|
||||
entity->service = 0;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* We can finally complete expiration, by setting service to 0.
|
||||
*/
|
||||
entity->service = 0;
|
||||
entity->budget = max_t(unsigned long, bfqq->max_budget,
|
||||
bfq_serv_to_charge(bfqq->next_rq, bfqq));
|
||||
bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
|
||||
@ -3233,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
|
||||
ref = bfqq->ref;
|
||||
__bfq_bfqq_expire(bfqd, bfqq);
|
||||
|
||||
if (ref == 1) /* bfqq is gone, no more actions on it */
|
||||
return;
|
||||
|
||||
/* mark bfqq as waiting a request only if a bic still points to it */
|
||||
if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
|
||||
if (!bfq_bfqq_busy(bfqq) &&
|
||||
reason != BFQQE_BUDGET_TIMEOUT &&
|
||||
reason != BFQQE_BUDGET_EXHAUSTED)
|
||||
reason != BFQQE_BUDGET_EXHAUSTED) {
|
||||
bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
|
||||
/*
|
||||
* Not setting service to 0, because, if the next rq
|
||||
* arrives in time, the queue will go on receiving
|
||||
* service with this same budget (as if it never expired)
|
||||
*/
|
||||
} else
|
||||
entity->service = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3295,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
|
||||
* issues taken into account are not trivial. We discuss these issues
|
||||
* individually while introducing the variables.
|
||||
*/
|
||||
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
|
||||
static bool bfq_better_to_idle(struct bfq_queue *bfqq)
|
||||
{
|
||||
struct bfq_data *bfqd = bfqq->bfqd;
|
||||
bool rot_without_queueing =
|
||||
@ -3528,19 +3588,19 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
|
||||
}
|
||||
|
||||
/*
|
||||
* If the in-service queue is empty but the function bfq_bfqq_may_idle
|
||||
* If the in-service queue is empty but the function bfq_better_to_idle
|
||||
* returns true, then:
|
||||
* 1) the queue must remain in service and cannot be expired, and
|
||||
* 2) the device must be idled to wait for the possible arrival of a new
|
||||
* request for the queue.
|
||||
* See the comments on the function bfq_bfqq_may_idle for the reasons
|
||||
* See the comments on the function bfq_better_to_idle for the reasons
|
||||
* why performing device idling is the best choice to boost the throughput
|
||||
* and preserve service guarantees when bfq_bfqq_may_idle itself
|
||||
* and preserve service guarantees when bfq_better_to_idle itself
|
||||
* returns true.
|
||||
*/
|
||||
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
|
||||
{
|
||||
return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq);
|
||||
return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3559,8 +3619,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
|
||||
|
||||
bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
|
||||
|
||||
/*
|
||||
* Do not expire bfqq for budget timeout if bfqq may be about
|
||||
* to enjoy device idling. The reason why, in this case, we
|
||||
* prevent bfqq from expiring is the same as in the comments
|
||||
* on the case where bfq_bfqq_must_idle() returns true, in
|
||||
* bfq_completed_request().
|
||||
*/
|
||||
if (bfq_may_expire_for_budg_timeout(bfqq) &&
|
||||
!bfq_bfqq_wait_request(bfqq) &&
|
||||
!bfq_bfqq_must_idle(bfqq))
|
||||
goto expire;
|
||||
|
||||
@ -3620,7 +3686,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
|
||||
* may idle after their completion, then keep it anyway.
|
||||
*/
|
||||
if (bfq_bfqq_wait_request(bfqq) ||
|
||||
(bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
|
||||
(bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
|
||||
bfqq = NULL;
|
||||
goto keep_queue;
|
||||
}
|
||||
@ -4582,8 +4648,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
|
||||
*/
|
||||
bfqq->budget_timeout = jiffies;
|
||||
|
||||
bfq_weights_tree_remove(bfqd, &bfqq->entity,
|
||||
&bfqd->queue_weights_tree);
|
||||
bfq_weights_tree_remove(bfqd, bfqq);
|
||||
}
|
||||
|
||||
now_ns = ktime_get_ns();
|
||||
@ -4637,15 +4702,39 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
|
||||
* or if we want to idle in case it has no pending requests.
|
||||
*/
|
||||
if (bfqd->in_service_queue == bfqq) {
|
||||
if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
|
||||
if (bfq_bfqq_must_idle(bfqq)) {
|
||||
if (bfqq->dispatched == 0)
|
||||
bfq_arm_slice_timer(bfqd);
|
||||
/*
|
||||
* If we get here, we do not expire bfqq, even
|
||||
* if bfqq was in budget timeout or had no
|
||||
* more requests (as controlled in the next
|
||||
* conditional instructions). The reason for
|
||||
* not expiring bfqq is as follows.
|
||||
*
|
||||
* Here bfqq->dispatched > 0 holds, but
|
||||
* bfq_bfqq_must_idle() returned true. This
|
||||
* implies that, even if no request arrives
|
||||
* for bfqq before bfqq->dispatched reaches 0,
|
||||
* bfqq will, however, not be expired on the
|
||||
* completion event that causes bfqq->dispatched
|
||||
* to reach zero. In contrast, on this event,
|
||||
* bfqq will start enjoying device idling
|
||||
* (I/O-dispatch plugging).
|
||||
*
|
||||
* But, if we expired bfqq here, bfqq would
|
||||
* not have the chance to enjoy device idling
|
||||
* when bfqq->dispatched finally reaches
|
||||
* zero. This would expose bfqq to violation
|
||||
* of its reserved service guarantees.
|
||||
*/
|
||||
return;
|
||||
} else if (bfq_may_expire_for_budg_timeout(bfqq))
|
||||
bfq_bfqq_expire(bfqd, bfqq, false,
|
||||
BFQQE_BUDGET_TIMEOUT);
|
||||
else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
|
||||
(bfqq->dispatched == 0 ||
|
||||
!bfq_bfqq_may_idle(bfqq)))
|
||||
!bfq_better_to_idle(bfqq)))
|
||||
bfq_bfqq_expire(bfqd, bfqq, false,
|
||||
BFQQE_NO_MORE_REQUESTS);
|
||||
}
|
||||
|
@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
|
||||
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
|
||||
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
struct rb_root *root);
|
||||
void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
void __bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_entity *entity,
|
||||
struct rb_root *root);
|
||||
void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq);
|
||||
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
bool compensate, enum bfqq_expiration reason);
|
||||
void bfq_put_queue(struct bfq_queue *bfqq);
|
||||
|
@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
|
||||
if (bfqq)
|
||||
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
else /* bfq_group */
|
||||
bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
|
||||
|
||||
if (bfqg != bfqd->root_group)
|
||||
bfqg->active_entities++;
|
||||
#endif
|
||||
@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st,
|
||||
if (bfqq)
|
||||
list_del(&bfqq->bfqq_list);
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
else /* bfq_group */
|
||||
bfq_weights_tree_remove(bfqd, entity,
|
||||
&bfqd->group_weights_tree);
|
||||
|
||||
if (bfqg != bfqd->root_group)
|
||||
bfqg->active_entities--;
|
||||
#endif
|
||||
@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
|
||||
if (prev_weight != new_weight) {
|
||||
root = bfqq ? &bfqd->queue_weights_tree :
|
||||
&bfqd->group_weights_tree;
|
||||
bfq_weights_tree_remove(bfqd, entity, root);
|
||||
__bfq_weights_tree_remove(bfqd, entity, root);
|
||||
}
|
||||
entity->weight = new_weight;
|
||||
/*
|
||||
@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
|
||||
* one of its children receives a new request.
|
||||
*
|
||||
* Basically, this function updates the timestamps of entity and
|
||||
* inserts entity into its active tree, ater possibly extracting it
|
||||
* inserts entity into its active tree, after possibly extracting it
|
||||
* from its idle tree.
|
||||
*/
|
||||
static void __bfq_activate_entity(struct bfq_entity *entity,
|
||||
@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
|
||||
entity->on_st = true;
|
||||
}
|
||||
|
||||
#ifdef BFQ_GROUP_IOSCHED_ENABLED
|
||||
if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
|
||||
struct bfq_group *bfqg =
|
||||
container_of(entity, struct bfq_group, entity);
|
||||
|
||||
bfq_weights_tree_add(bfqg->bfqd, entity,
|
||||
&bfqd->group_weights_tree);
|
||||
}
|
||||
#endif
|
||||
|
||||
bfq_update_fin_time_enqueue(entity, st, backshifted);
|
||||
}
|
||||
|
||||
@ -1541,12 +1544,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
|
||||
entity = sd->next_in_service;
|
||||
sd->in_service_entity = entity;
|
||||
|
||||
/*
|
||||
* Reset the accumulator of the amount of service that
|
||||
* the entity is about to receive.
|
||||
*/
|
||||
entity->service = 0;
|
||||
|
||||
/*
|
||||
* If entity is no longer a candidate for next
|
||||
* service, then it must be extracted from its active
|
||||
@ -1664,8 +1661,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
bfqd->busy_queues--;
|
||||
|
||||
if (!bfqq->dispatched)
|
||||
bfq_weights_tree_remove(bfqd, &bfqq->entity,
|
||||
&bfqd->queue_weights_tree);
|
||||
bfq_weights_tree_remove(bfqd, bfqq);
|
||||
|
||||
if (bfqq->wr_coeff > 1)
|
||||
bfqd->wr_busy_queues--;
|
||||
|
@ -159,28 +159,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
|
||||
}
|
||||
EXPORT_SYMBOL(bio_integrity_add_page);
|
||||
|
||||
/**
|
||||
* bio_integrity_intervals - Return number of integrity intervals for a bio
|
||||
* @bi: blk_integrity profile for device
|
||||
* @sectors: Size of the bio in 512-byte sectors
|
||||
*
|
||||
* Description: The block layer calculates everything in 512 byte
|
||||
* sectors but integrity metadata is done in terms of the data integrity
|
||||
* interval size of the storage device. Convert the block layer sectors
|
||||
* to the appropriate number of integrity intervals.
|
||||
*/
|
||||
static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
|
||||
unsigned int sectors)
|
||||
{
|
||||
return sectors >> (bi->interval_exp - 9);
|
||||
}
|
||||
|
||||
static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
|
||||
unsigned int sectors)
|
||||
{
|
||||
return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_integrity_process - Process integrity metadata for a bio
|
||||
* @bio: bio to generate/verify integrity metadata for
|
||||
|
198
block/bio.c
198
block/bio.c
@ -28,9 +28,11 @@
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/blk-cgroup.h>
|
||||
|
||||
#include <trace/events/block.h>
|
||||
#include "blk.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
/*
|
||||
* Test patch to inline a certain number of bi_io_vec's inside the bio
|
||||
@ -156,7 +158,7 @@ static void bio_put_slab(struct bio_set *bs)
|
||||
|
||||
unsigned int bvec_nr_vecs(unsigned short idx)
|
||||
{
|
||||
return bvec_slabs[idx].nr_vecs;
|
||||
return bvec_slabs[--idx].nr_vecs;
|
||||
}
|
||||
|
||||
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
|
||||
@ -644,83 +646,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
|
||||
}
|
||||
EXPORT_SYMBOL(bio_clone_fast);
|
||||
|
||||
/**
|
||||
* bio_clone_bioset - clone a bio
|
||||
* @bio_src: bio to clone
|
||||
* @gfp_mask: allocation priority
|
||||
* @bs: bio_set to allocate from
|
||||
*
|
||||
* Clone bio. Caller will own the returned bio, but not the actual data it
|
||||
* points to. Reference count of returned bio will be one.
|
||||
*/
|
||||
struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
|
||||
struct bio_set *bs)
|
||||
{
|
||||
struct bvec_iter iter;
|
||||
struct bio_vec bv;
|
||||
struct bio *bio;
|
||||
|
||||
/*
|
||||
* Pre immutable biovecs, __bio_clone() used to just do a memcpy from
|
||||
* bio_src->bi_io_vec to bio->bi_io_vec.
|
||||
*
|
||||
* We can't do that anymore, because:
|
||||
*
|
||||
* - The point of cloning the biovec is to produce a bio with a biovec
|
||||
* the caller can modify: bi_idx and bi_bvec_done should be 0.
|
||||
*
|
||||
* - The original bio could've had more than BIO_MAX_PAGES biovecs; if
|
||||
* we tried to clone the whole thing bio_alloc_bioset() would fail.
|
||||
* But the clone should succeed as long as the number of biovecs we
|
||||
* actually need to allocate is fewer than BIO_MAX_PAGES.
|
||||
*
|
||||
* - Lastly, bi_vcnt should not be looked at or relied upon by code
|
||||
* that does not own the bio - reason being drivers don't use it for
|
||||
* iterating over the biovec anymore, so expecting it to be kept up
|
||||
* to date (i.e. for clones that share the parent biovec) is just
|
||||
* asking for trouble and would force extra work on
|
||||
* __bio_clone_fast() anyways.
|
||||
*/
|
||||
|
||||
bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
|
||||
if (!bio)
|
||||
return NULL;
|
||||
bio->bi_disk = bio_src->bi_disk;
|
||||
bio->bi_opf = bio_src->bi_opf;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
|
||||
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
|
||||
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_DISCARD:
|
||||
case REQ_OP_SECURE_ERASE:
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
break;
|
||||
case REQ_OP_WRITE_SAME:
|
||||
bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
|
||||
break;
|
||||
default:
|
||||
bio_for_each_segment(bv, bio_src, iter)
|
||||
bio->bi_io_vec[bio->bi_vcnt++] = bv;
|
||||
break;
|
||||
}
|
||||
|
||||
if (bio_integrity(bio_src)) {
|
||||
int ret;
|
||||
|
||||
ret = bio_integrity_clone(bio, bio_src, gfp_mask);
|
||||
if (ret < 0) {
|
||||
bio_put(bio);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
bio_clone_blkcg_association(bio, bio_src);
|
||||
|
||||
return bio;
|
||||
}
|
||||
EXPORT_SYMBOL(bio_clone_bioset);
|
||||
|
||||
/**
|
||||
* bio_add_pc_page - attempt to add page to bio
|
||||
* @q: the target queue
|
||||
@ -1661,10 +1586,8 @@ void bio_set_pages_dirty(struct bio *bio)
|
||||
int i;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
struct page *page = bvec->bv_page;
|
||||
|
||||
if (page && !PageCompound(page))
|
||||
set_page_dirty_lock(page);
|
||||
if (!PageCompound(bvec->bv_page))
|
||||
set_page_dirty_lock(bvec->bv_page);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
|
||||
@ -1674,19 +1597,15 @@ static void bio_release_pages(struct bio *bio)
|
||||
struct bio_vec *bvec;
|
||||
int i;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
struct page *page = bvec->bv_page;
|
||||
|
||||
if (page)
|
||||
put_page(page);
|
||||
}
|
||||
bio_for_each_segment_all(bvec, bio, i)
|
||||
put_page(bvec->bv_page);
|
||||
}
|
||||
|
||||
/*
|
||||
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
|
||||
* If they are, then fine. If, however, some pages are clean then they must
|
||||
* have been written out during the direct-IO read. So we take another ref on
|
||||
* the BIO and the offending pages and re-dirty the pages in process context.
|
||||
* the BIO and re-dirty the pages in process context.
|
||||
*
|
||||
* It is expected that bio_check_pages_dirty() will wholly own the BIO from
|
||||
* here on. It will run one put_page() against each page and will run one
|
||||
@ -1704,78 +1623,70 @@ static struct bio *bio_dirty_list;
|
||||
*/
|
||||
static void bio_dirty_fn(struct work_struct *work)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct bio *bio;
|
||||
struct bio *bio, *next;
|
||||
|
||||
spin_lock_irqsave(&bio_dirty_lock, flags);
|
||||
bio = bio_dirty_list;
|
||||
spin_lock_irq(&bio_dirty_lock);
|
||||
next = bio_dirty_list;
|
||||
bio_dirty_list = NULL;
|
||||
spin_unlock_irqrestore(&bio_dirty_lock, flags);
|
||||
spin_unlock_irq(&bio_dirty_lock);
|
||||
|
||||
while (bio) {
|
||||
struct bio *next = bio->bi_private;
|
||||
while ((bio = next) != NULL) {
|
||||
next = bio->bi_private;
|
||||
|
||||
bio_set_pages_dirty(bio);
|
||||
bio_release_pages(bio);
|
||||
bio_put(bio);
|
||||
bio = next;
|
||||
}
|
||||
}
|
||||
|
||||
void bio_check_pages_dirty(struct bio *bio)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
int nr_clean_pages = 0;
|
||||
unsigned long flags;
|
||||
int i;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, i) {
|
||||
struct page *page = bvec->bv_page;
|
||||
|
||||
if (PageDirty(page) || PageCompound(page)) {
|
||||
put_page(page);
|
||||
bvec->bv_page = NULL;
|
||||
} else {
|
||||
nr_clean_pages++;
|
||||
}
|
||||
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
|
||||
goto defer;
|
||||
}
|
||||
|
||||
if (nr_clean_pages) {
|
||||
unsigned long flags;
|
||||
|
||||
bio_release_pages(bio);
|
||||
bio_put(bio);
|
||||
return;
|
||||
defer:
|
||||
spin_lock_irqsave(&bio_dirty_lock, flags);
|
||||
bio->bi_private = bio_dirty_list;
|
||||
bio_dirty_list = bio;
|
||||
spin_unlock_irqrestore(&bio_dirty_lock, flags);
|
||||
schedule_work(&bio_dirty_work);
|
||||
} else {
|
||||
bio_put(bio);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
|
||||
|
||||
void generic_start_io_acct(struct request_queue *q, int rw,
|
||||
void generic_start_io_acct(struct request_queue *q, int op,
|
||||
unsigned long sectors, struct hd_struct *part)
|
||||
{
|
||||
const int sgrp = op_stat_group(op);
|
||||
int cpu = part_stat_lock();
|
||||
|
||||
part_round_stats(q, cpu, part);
|
||||
part_stat_inc(cpu, part, ios[rw]);
|
||||
part_stat_add(cpu, part, sectors[rw], sectors);
|
||||
part_inc_in_flight(q, part, rw);
|
||||
part_stat_inc(cpu, part, ios[sgrp]);
|
||||
part_stat_add(cpu, part, sectors[sgrp], sectors);
|
||||
part_inc_in_flight(q, part, op_is_write(op));
|
||||
|
||||
part_stat_unlock();
|
||||
}
|
||||
EXPORT_SYMBOL(generic_start_io_acct);
|
||||
|
||||
void generic_end_io_acct(struct request_queue *q, int rw,
|
||||
void generic_end_io_acct(struct request_queue *q, int req_op,
|
||||
struct hd_struct *part, unsigned long start_time)
|
||||
{
|
||||
unsigned long duration = jiffies - start_time;
|
||||
const int sgrp = op_stat_group(req_op);
|
||||
int cpu = part_stat_lock();
|
||||
|
||||
part_stat_add(cpu, part, ticks[rw], duration);
|
||||
part_stat_add(cpu, part, ticks[sgrp], duration);
|
||||
part_round_stats(q, cpu, part);
|
||||
part_dec_in_flight(q, part, rw);
|
||||
part_dec_in_flight(q, part, op_is_write(req_op));
|
||||
|
||||
part_stat_unlock();
|
||||
}
|
||||
@ -1834,6 +1745,9 @@ void bio_endio(struct bio *bio)
|
||||
if (!bio_integrity_endio(bio))
|
||||
return;
|
||||
|
||||
if (bio->bi_disk)
|
||||
rq_qos_done_bio(bio->bi_disk->queue, bio);
|
||||
|
||||
/*
|
||||
* Need to have a real endio function for chained bios, otherwise
|
||||
* various corner cases will break (like stacking block devices that
|
||||
@ -2042,6 +1956,30 @@ EXPORT_SYMBOL(bioset_init_from_src);
|
||||
|
||||
#ifdef CONFIG_BLK_CGROUP
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
/**
|
||||
* bio_associate_blkcg_from_page - associate a bio with the page's blkcg
|
||||
* @bio: target bio
|
||||
* @page: the page to lookup the blkcg from
|
||||
*
|
||||
* Associate @bio with the blkcg from @page's owning memcg. This works like
|
||||
* every other associate function wrt references.
|
||||
*/
|
||||
int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
|
||||
{
|
||||
struct cgroup_subsys_state *blkcg_css;
|
||||
|
||||
if (unlikely(bio->bi_css))
|
||||
return -EBUSY;
|
||||
if (!page->mem_cgroup)
|
||||
return 0;
|
||||
blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
|
||||
&io_cgrp_subsys);
|
||||
bio->bi_css = blkcg_css;
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
/**
|
||||
* bio_associate_blkcg - associate a bio with the specified blkcg
|
||||
* @bio: target bio
|
||||
@ -2064,6 +2002,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
|
||||
|
||||
/**
|
||||
* bio_associate_blkg - associate a bio with the specified blkg
|
||||
* @bio: target bio
|
||||
* @blkg: the blkg to associate
|
||||
*
|
||||
* Associate @bio with the blkg specified by @blkg. This is the queue specific
|
||||
* blkcg information associated with the @bio, a reference will be taken on the
|
||||
* @blkg and will be freed when the bio is freed.
|
||||
*/
|
||||
int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
|
||||
{
|
||||
if (unlikely(bio->bi_blkg))
|
||||
return -EBUSY;
|
||||
blkg_get(blkg);
|
||||
bio->bi_blkg = blkg;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_disassociate_task - undo bio_associate_current()
|
||||
* @bio: target bio
|
||||
@ -2078,6 +2034,10 @@ void bio_disassociate_task(struct bio *bio)
|
||||
css_put(bio->bi_css);
|
||||
bio->bi_css = NULL;
|
||||
}
|
||||
if (bio->bi_blkg) {
|
||||
blkg_put(bio->bi_blkg);
|
||||
bio->bi_blkg = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/blk-cgroup.h>
|
||||
#include <linux/tracehook.h>
|
||||
#include "blk.h"
|
||||
|
||||
#define MAX_KEY_LEN 100
|
||||
@ -50,6 +51,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
|
||||
|
||||
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
|
||||
|
||||
static bool blkcg_debug_stats = false;
|
||||
|
||||
static bool blkcg_policy_enabled(struct request_queue *q,
|
||||
const struct blkcg_policy *pol)
|
||||
{
|
||||
@ -564,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
|
||||
[BLKG_RWSTAT_WRITE] = "Write",
|
||||
[BLKG_RWSTAT_SYNC] = "Sync",
|
||||
[BLKG_RWSTAT_ASYNC] = "Async",
|
||||
[BLKG_RWSTAT_DISCARD] = "Discard",
|
||||
};
|
||||
const char *dname = blkg_dev_name(pd->blkg);
|
||||
u64 v;
|
||||
@ -577,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
|
||||
(unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
|
||||
|
||||
v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
|
||||
atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
|
||||
atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
|
||||
atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
|
||||
seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
|
||||
return v;
|
||||
}
|
||||
@ -954,30 +959,77 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
|
||||
|
||||
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
|
||||
const char *dname;
|
||||
char *buf;
|
||||
struct blkg_rwstat rwstat;
|
||||
u64 rbytes, wbytes, rios, wios;
|
||||
u64 rbytes, wbytes, rios, wios, dbytes, dios;
|
||||
size_t size = seq_get_buf(sf, &buf), off = 0;
|
||||
int i;
|
||||
bool has_stats = false;
|
||||
|
||||
dname = blkg_dev_name(blkg);
|
||||
if (!dname)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Hooray string manipulation, count is the size written NOT
|
||||
* INCLUDING THE \0, so size is now count+1 less than what we
|
||||
* had before, but we want to start writing the next bit from
|
||||
* the \0 so we only add count to buf.
|
||||
*/
|
||||
off += scnprintf(buf+off, size-off, "%s ", dname);
|
||||
|
||||
spin_lock_irq(blkg->q->queue_lock);
|
||||
|
||||
rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
|
||||
offsetof(struct blkcg_gq, stat_bytes));
|
||||
rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
|
||||
wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
|
||||
dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
|
||||
|
||||
rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
|
||||
offsetof(struct blkcg_gq, stat_ios));
|
||||
rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
|
||||
wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
|
||||
dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
|
||||
|
||||
spin_unlock_irq(blkg->q->queue_lock);
|
||||
|
||||
if (rbytes || wbytes || rios || wios)
|
||||
seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
|
||||
dname, rbytes, wbytes, rios, wios);
|
||||
if (rbytes || wbytes || rios || wios) {
|
||||
has_stats = true;
|
||||
off += scnprintf(buf+off, size-off,
|
||||
"rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
|
||||
rbytes, wbytes, rios, wios,
|
||||
dbytes, dios);
|
||||
}
|
||||
|
||||
if (!blkcg_debug_stats)
|
||||
goto next;
|
||||
|
||||
if (atomic_read(&blkg->use_delay)) {
|
||||
has_stats = true;
|
||||
off += scnprintf(buf+off, size-off,
|
||||
" use_delay=%d delay_nsec=%llu",
|
||||
atomic_read(&blkg->use_delay),
|
||||
(unsigned long long)atomic64_read(&blkg->delay_nsec));
|
||||
}
|
||||
|
||||
for (i = 0; i < BLKCG_MAX_POLS; i++) {
|
||||
struct blkcg_policy *pol = blkcg_policy[i];
|
||||
size_t written;
|
||||
|
||||
if (!blkg->pd[i] || !pol->pd_stat_fn)
|
||||
continue;
|
||||
|
||||
written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
|
||||
if (written)
|
||||
has_stats = true;
|
||||
off += written;
|
||||
}
|
||||
next:
|
||||
if (has_stats) {
|
||||
off += scnprintf(buf+off, size-off, "\n");
|
||||
seq_commit(sf, off);
|
||||
}
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
@ -1191,6 +1243,14 @@ int blkcg_init_queue(struct request_queue *q)
|
||||
if (preloaded)
|
||||
radix_tree_preload_end();
|
||||
|
||||
ret = blk_iolatency_init(q);
|
||||
if (ret) {
|
||||
spin_lock_irq(q->queue_lock);
|
||||
blkg_destroy_all(q);
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = blk_throtl_init(q);
|
||||
if (ret) {
|
||||
spin_lock_irq(q->queue_lock);
|
||||
@ -1288,6 +1348,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
|
||||
mutex_unlock(&blkcg_pol_mutex);
|
||||
}
|
||||
|
||||
static void blkcg_exit(struct task_struct *tsk)
|
||||
{
|
||||
if (tsk->throttle_queue)
|
||||
blk_put_queue(tsk->throttle_queue);
|
||||
tsk->throttle_queue = NULL;
|
||||
}
|
||||
|
||||
struct cgroup_subsys io_cgrp_subsys = {
|
||||
.css_alloc = blkcg_css_alloc,
|
||||
.css_offline = blkcg_css_offline,
|
||||
@ -1297,6 +1364,7 @@ struct cgroup_subsys io_cgrp_subsys = {
|
||||
.dfl_cftypes = blkcg_files,
|
||||
.legacy_cftypes = blkcg_legacy_files,
|
||||
.legacy_name = "blkio",
|
||||
.exit = blkcg_exit,
|
||||
#ifdef CONFIG_MEMCG
|
||||
/*
|
||||
* This ensures that, if available, memcg is automatically enabled
|
||||
@ -1547,3 +1615,209 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
|
||||
mutex_unlock(&blkcg_pol_register_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
|
||||
|
||||
/*
|
||||
* Scale the accumulated delay based on how long it has been since we updated
|
||||
* the delay. We only call this when we are adding delay, in case it's been a
|
||||
* while since we added delay, and when we are checking to see if we need to
|
||||
* delay a task, to account for any delays that may have occurred.
|
||||
*/
|
||||
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
|
||||
{
|
||||
u64 old = atomic64_read(&blkg->delay_start);
|
||||
|
||||
/*
|
||||
* We only want to scale down every second. The idea here is that we
|
||||
* want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
|
||||
* time window. We only want to throttle tasks for recent delay that
|
||||
* has occurred, in 1 second time windows since that's the maximum
|
||||
* things can be throttled. We save the current delay window in
|
||||
* blkg->last_delay so we know what amount is still left to be charged
|
||||
* to the blkg from this point onward. blkg->last_use keeps track of
|
||||
* the use_delay counter. The idea is if we're unthrottling the blkg we
|
||||
* are ok with whatever is happening now, and we can take away more of
|
||||
* the accumulated delay as we've already throttled enough that
|
||||
* everybody is happy with their IO latencies.
|
||||
*/
|
||||
if (time_before64(old + NSEC_PER_SEC, now) &&
|
||||
atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
|
||||
u64 cur = atomic64_read(&blkg->delay_nsec);
|
||||
u64 sub = min_t(u64, blkg->last_delay, now - old);
|
||||
int cur_use = atomic_read(&blkg->use_delay);
|
||||
|
||||
/*
|
||||
* We've been unthrottled, subtract a larger chunk of our
|
||||
* accumulated delay.
|
||||
*/
|
||||
if (cur_use < blkg->last_use)
|
||||
sub = max_t(u64, sub, blkg->last_delay >> 1);
|
||||
|
||||
/*
|
||||
* This shouldn't happen, but handle it anyway. Our delay_nsec
|
||||
* should only ever be growing except here where we subtract out
|
||||
* min(last_delay, 1 second), but lord knows bugs happen and I'd
|
||||
* rather not end up with negative numbers.
|
||||
*/
|
||||
if (unlikely(cur < sub)) {
|
||||
atomic64_set(&blkg->delay_nsec, 0);
|
||||
blkg->last_delay = 0;
|
||||
} else {
|
||||
atomic64_sub(sub, &blkg->delay_nsec);
|
||||
blkg->last_delay = cur - sub;
|
||||
}
|
||||
blkg->last_use = cur_use;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This is called when we want to actually walk up the hierarchy and check to
|
||||
* see if we need to throttle, and then actually throttle if there is some
|
||||
* accumulated delay. This should only be called upon return to user space so
|
||||
* we're not holding some lock that would induce a priority inversion.
|
||||
*/
|
||||
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
|
||||
{
|
||||
u64 now = ktime_to_ns(ktime_get());
|
||||
u64 exp;
|
||||
u64 delay_nsec = 0;
|
||||
int tok;
|
||||
|
||||
while (blkg->parent) {
|
||||
if (atomic_read(&blkg->use_delay)) {
|
||||
blkcg_scale_delay(blkg, now);
|
||||
delay_nsec = max_t(u64, delay_nsec,
|
||||
atomic64_read(&blkg->delay_nsec));
|
||||
}
|
||||
blkg = blkg->parent;
|
||||
}
|
||||
|
||||
if (!delay_nsec)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Let's not sleep for all eternity if we've amassed a huge delay.
|
||||
* Swapping or metadata IO can accumulate 10's of seconds worth of
|
||||
* delay, and we want userspace to be able to do _something_ so cap the
|
||||
* delays at 250ms. If there's 10's of seconds worth of delay then
|
||||
* the tasks will be delayed for 250ms for every syscall.
|
||||
*/
|
||||
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
|
||||
|
||||
/*
|
||||
* TODO: the use_memdelay flag is going to be for the upcoming psi stuff
|
||||
* that hasn't landed upstream yet. Once that stuff is in place we need
|
||||
* to do a psi_memstall_enter/leave if memdelay is set.
|
||||
*/
|
||||
|
||||
exp = ktime_add_ns(now, delay_nsec);
|
||||
tok = io_schedule_prepare();
|
||||
do {
|
||||
__set_current_state(TASK_KILLABLE);
|
||||
if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
|
||||
break;
|
||||
} while (!fatal_signal_pending(current));
|
||||
io_schedule_finish(tok);
|
||||
}
|
||||
|
||||
/**
|
||||
* blkcg_maybe_throttle_current - throttle the current task if it has been marked
|
||||
*
|
||||
* This is only called if we've been marked with set_notify_resume(). Obviously
|
||||
* we can be set_notify_resume() for reasons other than blkcg throttling, so we
|
||||
* check to see if current->throttle_queue is set and if not this doesn't do
|
||||
* anything. This should only ever be called by the resume code, it's not meant
|
||||
* to be called by people willy-nilly as it will actually do the work to
|
||||
* throttle the task if it is setup for throttling.
|
||||
*/
|
||||
void blkcg_maybe_throttle_current(void)
|
||||
{
|
||||
struct request_queue *q = current->throttle_queue;
|
||||
struct cgroup_subsys_state *css;
|
||||
struct blkcg *blkcg;
|
||||
struct blkcg_gq *blkg;
|
||||
bool use_memdelay = current->use_memdelay;
|
||||
|
||||
if (!q)
|
||||
return;
|
||||
|
||||
current->throttle_queue = NULL;
|
||||
current->use_memdelay = false;
|
||||
|
||||
rcu_read_lock();
|
||||
css = kthread_blkcg();
|
||||
if (css)
|
||||
blkcg = css_to_blkcg(css);
|
||||
else
|
||||
blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
|
||||
|
||||
if (!blkcg)
|
||||
goto out;
|
||||
blkg = blkg_lookup(blkcg, q);
|
||||
if (!blkg)
|
||||
goto out;
|
||||
blkg = blkg_try_get(blkg);
|
||||
if (!blkg)
|
||||
goto out;
|
||||
rcu_read_unlock();
|
||||
|
||||
blkcg_maybe_throttle_blkg(blkg, use_memdelay);
|
||||
blkg_put(blkg);
|
||||
blk_put_queue(q);
|
||||
return;
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
blk_put_queue(q);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
|
||||
|
||||
/**
|
||||
* blkcg_schedule_throttle - this task needs to check for throttling
|
||||
* @q: the request queue IO was submitted on
|
||||
* @use_memdelay: do we charge this to memory delay for PSI
|
||||
*
|
||||
* This is called by the IO controller when we know there's delay accumulated
|
||||
* for the blkg for this task. We do not pass the blkg because there are places
|
||||
* we call this that may not have that information, the swapping code for
|
||||
* instance will only have a request_queue at that point. This sets the
|
||||
* notify_resume for the task to check and see if it requires throttling before
|
||||
* returning to user space.
|
||||
*
|
||||
* We will only schedule once per syscall. You can call this over and over
|
||||
* again and it will only do the check once upon return to user space, and only
|
||||
* throttle once. If the task needs to be throttled again it'll need to be
|
||||
* re-set at the next time we see the task.
|
||||
*/
|
||||
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
|
||||
{
|
||||
if (unlikely(current->flags & PF_KTHREAD))
|
||||
return;
|
||||
|
||||
if (!blk_get_queue(q))
|
||||
return;
|
||||
|
||||
if (current->throttle_queue)
|
||||
blk_put_queue(current->throttle_queue);
|
||||
current->throttle_queue = q;
|
||||
if (use_memdelay)
|
||||
current->use_memdelay = use_memdelay;
|
||||
set_notify_resume(current);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
|
||||
|
||||
/**
|
||||
* blkcg_add_delay - add delay to this blkg
|
||||
* @now: the current time in nanoseconds
|
||||
* @delta: how many nanoseconds of delay to add
|
||||
*
|
||||
* Charge @delta to the blkg's current delay accumulation. This is used to
|
||||
* throttle tasks if an IO controller thinks we need more throttling.
|
||||
*/
|
||||
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
|
||||
{
|
||||
blkcg_scale_delay(blkg, now);
|
||||
atomic64_add(delta, &blkg->delay_nsec);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkcg_add_delay);
|
||||
|
||||
module_param(blkcg_debug_stats, bool, 0644);
|
||||
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
|
||||
|
106
block/blk-core.c
106
block/blk-core.c
@ -42,7 +42,7 @@
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-wbt.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
struct dentry *blk_debugfs_root;
|
||||
@ -715,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
|
||||
|
||||
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
|
||||
void blk_exit_queue(struct request_queue *q)
|
||||
{
|
||||
/*
|
||||
* Since the I/O scheduler exit code may access cgroup information,
|
||||
* perform I/O scheduler exit before disassociating from the block
|
||||
* cgroup controller.
|
||||
*/
|
||||
if (q->elevator) {
|
||||
ioc_clear_queue(q);
|
||||
elevator_exit(q, q->elevator);
|
||||
q->elevator = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove all references to @q from the block cgroup controller before
|
||||
* restoring @q->queue_lock to avoid that restoring this pointer causes
|
||||
* e.g. blkcg_print_blkgs() to crash.
|
||||
*/
|
||||
blkcg_exit_queue(q);
|
||||
|
||||
/*
|
||||
* Since the cgroup code may dereference the @q->backing_dev_info
|
||||
* pointer, only decrease its reference count after having removed the
|
||||
* association with the block cgroup controller.
|
||||
*/
|
||||
bdi_put(q->backing_dev_info);
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_cleanup_queue - shutdown a request queue
|
||||
* @q: request queue to shutdown
|
||||
@ -762,9 +791,13 @@ void blk_cleanup_queue(struct request_queue *q)
|
||||
* make sure all in-progress dispatch are completed because
|
||||
* blk_freeze_queue() can only complete all requests, and
|
||||
* dispatch may still be in-progress since we dispatch requests
|
||||
* from more than one contexts
|
||||
* from more than one contexts.
|
||||
*
|
||||
* No need to quiesce queue if it isn't initialized yet since
|
||||
* blk_freeze_queue() should be enough for cases of passthrough
|
||||
* request.
|
||||
*/
|
||||
if (q->mq_ops)
|
||||
if (q->mq_ops && blk_queue_init_done(q))
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
/* for synchronous bio-based driver finish in-flight integrity i/o */
|
||||
@ -780,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q)
|
||||
*/
|
||||
WARN_ON_ONCE(q->kobj.state_in_sysfs);
|
||||
|
||||
/*
|
||||
* Since the I/O scheduler exit code may access cgroup information,
|
||||
* perform I/O scheduler exit before disassociating from the block
|
||||
* cgroup controller.
|
||||
*/
|
||||
if (q->elevator) {
|
||||
ioc_clear_queue(q);
|
||||
elevator_exit(q, q->elevator);
|
||||
q->elevator = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove all references to @q from the block cgroup controller before
|
||||
* restoring @q->queue_lock to avoid that restoring this pointer causes
|
||||
* e.g. blkcg_print_blkgs() to crash.
|
||||
*/
|
||||
blkcg_exit_queue(q);
|
||||
|
||||
/*
|
||||
* Since the cgroup code may dereference the @q->backing_dev_info
|
||||
* pointer, only decrease its reference count after having removed the
|
||||
* association with the block cgroup controller.
|
||||
*/
|
||||
bdi_put(q->backing_dev_info);
|
||||
blk_exit_queue(q);
|
||||
|
||||
if (q->mq_ops)
|
||||
blk_mq_free_queue(q);
|
||||
@ -1180,6 +1190,7 @@ int blk_init_allocated_queue(struct request_queue *q)
|
||||
q->exit_rq_fn(q, q->fq->flush_rq);
|
||||
out_free_flush_queue:
|
||||
blk_free_flush_queue(q->fq);
|
||||
q->fq = NULL;
|
||||
return -ENOMEM;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_init_allocated_queue);
|
||||
@ -1641,7 +1652,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
|
||||
blk_delete_timer(rq);
|
||||
blk_clear_rq_complete(rq);
|
||||
trace_block_rq_requeue(q, rq);
|
||||
wbt_requeue(q->rq_wb, rq);
|
||||
rq_qos_requeue(q, rq);
|
||||
|
||||
if (rq->rq_flags & RQF_QUEUED)
|
||||
blk_queue_end_tag(q, rq);
|
||||
@ -1748,7 +1759,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
|
||||
/* this is a bio leak */
|
||||
WARN_ON(req->bio != NULL);
|
||||
|
||||
wbt_done(q->rq_wb, req);
|
||||
rq_qos_done(q, req);
|
||||
|
||||
/*
|
||||
* Request may not have originated from ll_rw_blk. if not,
|
||||
@ -1982,7 +1993,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
|
||||
int where = ELEVATOR_INSERT_SORT;
|
||||
struct request *req, *free;
|
||||
unsigned int request_count = 0;
|
||||
unsigned int wb_acct;
|
||||
|
||||
/*
|
||||
* low level driver can indicate that it wants pages above a
|
||||
@ -2040,7 +2050,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
|
||||
}
|
||||
|
||||
get_rq:
|
||||
wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
|
||||
rq_qos_throttle(q, bio, q->queue_lock);
|
||||
|
||||
/*
|
||||
* Grab a free request. This might sleep but cannot fail.
|
||||
@ -2050,7 +2060,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
|
||||
req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
|
||||
if (IS_ERR(req)) {
|
||||
blk_queue_exit(q);
|
||||
__wbt_done(q->rq_wb, wb_acct);
|
||||
rq_qos_cleanup(q, bio);
|
||||
if (PTR_ERR(req) == -ENOMEM)
|
||||
bio->bi_status = BLK_STS_RESOURCE;
|
||||
else
|
||||
@ -2059,7 +2069,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
wbt_track(req, wb_acct);
|
||||
rq_qos_track(q, req, bio);
|
||||
|
||||
/*
|
||||
* After dropping the lock and possibly sleeping here, our request
|
||||
@ -2700,13 +2710,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
|
||||
void blk_account_io_completion(struct request *req, unsigned int bytes)
|
||||
{
|
||||
if (blk_do_io_stat(req)) {
|
||||
const int rw = rq_data_dir(req);
|
||||
const int sgrp = op_stat_group(req_op(req));
|
||||
struct hd_struct *part;
|
||||
int cpu;
|
||||
|
||||
cpu = part_stat_lock();
|
||||
part = req->part;
|
||||
part_stat_add(cpu, part, sectors[rw], bytes >> 9);
|
||||
part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
|
||||
part_stat_unlock();
|
||||
}
|
||||
}
|
||||
@ -2720,7 +2730,7 @@ void blk_account_io_done(struct request *req, u64 now)
|
||||
*/
|
||||
if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
|
||||
unsigned long duration;
|
||||
const int rw = rq_data_dir(req);
|
||||
const int sgrp = op_stat_group(req_op(req));
|
||||
struct hd_struct *part;
|
||||
int cpu;
|
||||
|
||||
@ -2728,10 +2738,10 @@ void blk_account_io_done(struct request *req, u64 now)
|
||||
cpu = part_stat_lock();
|
||||
part = req->part;
|
||||
|
||||
part_stat_inc(cpu, part, ios[rw]);
|
||||
part_stat_add(cpu, part, ticks[rw], duration);
|
||||
part_stat_inc(cpu, part, ios[sgrp]);
|
||||
part_stat_add(cpu, part, ticks[sgrp], duration);
|
||||
part_round_stats(req->q, cpu, part);
|
||||
part_dec_in_flight(req->q, part, rw);
|
||||
part_dec_in_flight(req->q, part, rq_data_dir(req));
|
||||
|
||||
hd_struct_put(part);
|
||||
part_stat_unlock();
|
||||
@ -2751,9 +2761,9 @@ static bool blk_pm_allow_request(struct request *rq)
|
||||
return rq->rq_flags & RQF_PM;
|
||||
case RPM_SUSPENDED:
|
||||
return false;
|
||||
}
|
||||
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#else
|
||||
static bool blk_pm_allow_request(struct request *rq)
|
||||
@ -2980,7 +2990,7 @@ void blk_start_request(struct request *req)
|
||||
req->throtl_size = blk_rq_sectors(req);
|
||||
#endif
|
||||
req->rq_flags |= RQF_STATS;
|
||||
wbt_issue(req->q->rq_wb, req);
|
||||
rq_qos_issue(req->q, req);
|
||||
}
|
||||
|
||||
BUG_ON(blk_rq_is_complete(req));
|
||||
@ -3053,6 +3063,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
|
||||
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
|
||||
* %false return from this function.
|
||||
*
|
||||
* Note:
|
||||
* The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
|
||||
* blk_rq_bytes() and in blk_update_request().
|
||||
*
|
||||
* Return:
|
||||
* %false - this request doesn't have any more data
|
||||
* %true - this request has more data
|
||||
@ -3200,7 +3214,7 @@ void blk_finish_request(struct request *req, blk_status_t error)
|
||||
blk_account_io_done(req, now);
|
||||
|
||||
if (req->end_io) {
|
||||
wbt_done(req->q->rq_wb, req);
|
||||
rq_qos_done(q, req);
|
||||
req->end_io(req, error);
|
||||
} else {
|
||||
if (blk_bidi_rq(req))
|
||||
@ -3763,9 +3777,11 @@ EXPORT_SYMBOL(blk_finish_plug);
|
||||
*/
|
||||
void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
|
||||
{
|
||||
/* not support for RQF_PM and ->rpm_status in blk-mq yet */
|
||||
if (q->mq_ops)
|
||||
/* Don't enable runtime PM for blk-mq until it is ready */
|
||||
if (q->mq_ops) {
|
||||
pm_runtime_disable(dev);
|
||||
return;
|
||||
}
|
||||
|
||||
q->dev = dev;
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
|
@ -278,7 +278,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
|
||||
atomic_set(&ioc->nr_tasks, 1);
|
||||
atomic_set(&ioc->active_ref, 1);
|
||||
spin_lock_init(&ioc->lock);
|
||||
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
|
||||
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
|
||||
INIT_HLIST_HEAD(&ioc->icq_list);
|
||||
INIT_WORK(&ioc->release_work, ioc_release_fn);
|
||||
|
||||
|
955
block/blk-iolatency.c
Normal file
955
block/blk-iolatency.c
Normal file
@ -0,0 +1,955 @@
|
||||
/*
|
||||
* Block rq-qos base io controller
|
||||
*
|
||||
* This works similar to wbt with a few exceptions
|
||||
*
|
||||
* - It's bio based, so the latency covers the whole block layer in addition to
|
||||
* the actual io.
|
||||
* - We will throttle all IO that comes in here if we need to.
|
||||
* - We use the mean latency over the 100ms window. This is because writes can
|
||||
* be particularly fast, which could give us a false sense of the impact of
|
||||
* other workloads on our protected workload.
|
||||
* - By default there's no throttling, we set the queue_depth to UINT_MAX so
|
||||
* that we can have as many outstanding bio's as we're allowed to. Only at
|
||||
* throttle time do we pay attention to the actual queue depth.
|
||||
*
|
||||
* The hierarchy works like the cpu controller does, we track the latency at
|
||||
* every configured node, and each configured node has its own independent
|
||||
* queue depth. This means that we only care about our latency targets at the
|
||||
* peer level. Some group at the bottom of the hierarchy isn't going to affect
|
||||
* a group at the end of some other path if we're only configured at leaf level.
|
||||
*
|
||||
* Consider the following
|
||||
*
|
||||
* root blkg
|
||||
* / \
|
||||
* fast (target=5ms) slow (target=10ms)
|
||||
* / \ / \
|
||||
* a b normal(15ms) unloved
|
||||
*
|
||||
* "a" and "b" have no target, but their combined io under "fast" cannot exceed
|
||||
* an average latency of 5ms. If it does then we will throttle the "slow"
|
||||
* group. In the case of "normal", if it exceeds its 15ms target, we will
|
||||
* throttle "unloved", but nobody else.
|
||||
*
|
||||
* In this example "fast", "slow", and "normal" will be the only groups actually
|
||||
* accounting their io latencies. We have to walk up the hierarchy to the root
|
||||
* on every submit and complete so we can do the appropriate stat recording and
|
||||
* adjust the queue depth of ourselves if needed.
|
||||
*
|
||||
* There are 2 ways we throttle IO.
|
||||
*
|
||||
* 1) Queue depth throttling. As we throttle down we will adjust the maximum
|
||||
* number of IO's we're allowed to have in flight. This starts at UINT_MAX down
|
||||
* to 1. If the group is only ever submitting IO for itself then this is the
|
||||
* only way we throttle.
|
||||
*
|
||||
* 2) Induced delay throttling. This is for the case that a group is generating
|
||||
* IO that has to be issued by the root cg to avoid priority inversion. So think
|
||||
* REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot
|
||||
* of work done for us on behalf of the root cg and are being asked to scale
|
||||
* down more then we induce a latency at userspace return. We accumulate the
|
||||
* total amount of time we need to be punished by doing
|
||||
*
|
||||
* total_time += min_lat_nsec - actual_io_completion
|
||||
*
|
||||
* and then at throttle time will do
|
||||
*
|
||||
* throttle_time = min(total_time, NSEC_PER_SEC)
|
||||
*
|
||||
* This induced delay will throttle back the activity that is generating the
|
||||
* root cg issued io's, whether that's some metadata intensive operation or the
|
||||
* group is using so much memory that it is pushing us into swap.
|
||||
*
|
||||
* Copyright (C) 2018 Josef Bacik
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/blk_types.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/sched/loadavg.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <trace/events/block.h>
|
||||
#include "blk-rq-qos.h"
|
||||
#include "blk-stat.h"
|
||||
|
||||
#define DEFAULT_SCALE_COOKIE 1000000U
|
||||
|
||||
static struct blkcg_policy blkcg_policy_iolatency;
|
||||
struct iolatency_grp;
|
||||
|
||||
struct blk_iolatency {
|
||||
struct rq_qos rqos;
|
||||
struct timer_list timer;
|
||||
atomic_t enabled;
|
||||
};
|
||||
|
||||
static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
|
||||
{
|
||||
return container_of(rqos, struct blk_iolatency, rqos);
|
||||
}
|
||||
|
||||
static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
|
||||
{
|
||||
return atomic_read(&blkiolat->enabled) > 0;
|
||||
}
|
||||
|
||||
struct child_latency_info {
|
||||
spinlock_t lock;
|
||||
|
||||
/* Last time we adjusted the scale of everybody. */
|
||||
u64 last_scale_event;
|
||||
|
||||
/* The latency that we missed. */
|
||||
u64 scale_lat;
|
||||
|
||||
/* Total io's from all of our children for the last summation. */
|
||||
u64 nr_samples;
|
||||
|
||||
/* The guy who actually changed the latency numbers. */
|
||||
struct iolatency_grp *scale_grp;
|
||||
|
||||
/* Cookie to tell if we need to scale up or down. */
|
||||
atomic_t scale_cookie;
|
||||
};
|
||||
|
||||
struct iolatency_grp {
|
||||
struct blkg_policy_data pd;
|
||||
struct blk_rq_stat __percpu *stats;
|
||||
struct blk_iolatency *blkiolat;
|
||||
struct rq_depth rq_depth;
|
||||
struct rq_wait rq_wait;
|
||||
atomic64_t window_start;
|
||||
atomic_t scale_cookie;
|
||||
u64 min_lat_nsec;
|
||||
u64 cur_win_nsec;
|
||||
|
||||
/* total running average of our io latency. */
|
||||
u64 lat_avg;
|
||||
|
||||
/* Our current number of IO's for the last summation. */
|
||||
u64 nr_samples;
|
||||
|
||||
struct child_latency_info child_lat;
|
||||
};
|
||||
|
||||
/*
 * Lower and upper bound of iolatency_grp::cur_win_nsec, applied in
 * iolatency_set_min_lat_nsec().
 */
#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
|
||||
/*
|
||||
* These are the constants used to fake the fixed-point moving average
|
||||
* calculation just like load average. The call to CALC_LOAD folds
|
||||
* (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling
|
||||
* window size is bucketed to try to approximately calculate average
|
||||
* latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
|
||||
* elapse immediately. Note, windows only elapse with IO activity. Idle
|
||||
* periods extend the most recent window.
|
||||
*/
|
||||
#define BLKIOLATENCY_NR_EXP_FACTORS 5
|
||||
#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
|
||||
(BLKIOLATENCY_NR_EXP_FACTORS - 1))
|
||||
static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
|
||||
2045, // exp(1/600) - 600 samples
|
||||
2039, // exp(1/240) - 240 samples
|
||||
2031, // exp(1/120) - 120 samples
|
||||
2023, // exp(1/80) - 80 samples
|
||||
2014, // exp(1/60) - 60 samples
|
||||
};
|
||||
|
||||
static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
|
||||
{
|
||||
return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
|
||||
}
|
||||
|
||||
static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
|
||||
{
|
||||
return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
|
||||
}
|
||||
|
||||
static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
|
||||
{
|
||||
return pd_to_blkg(&iolat->pd);
|
||||
}
|
||||
|
||||
static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
|
||||
wait_queue_entry_t *wait,
|
||||
bool first_block)
|
||||
{
|
||||
struct rq_wait *rqw = &iolat->rq_wait;
|
||||
|
||||
if (first_block && waitqueue_active(&rqw->wait) &&
|
||||
rqw->wait.head.next != &wait->entry)
|
||||
return false;
|
||||
return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
|
||||
}
|
||||
|
||||
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
|
||||
struct iolatency_grp *iolat,
|
||||
spinlock_t *lock, bool issue_as_root,
|
||||
bool use_memdelay)
|
||||
__releases(lock)
|
||||
__acquires(lock)
|
||||
{
|
||||
struct rq_wait *rqw = &iolat->rq_wait;
|
||||
unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
|
||||
DEFINE_WAIT(wait);
|
||||
bool first_block = true;
|
||||
|
||||
if (use_delay)
|
||||
blkcg_schedule_throttle(rqos->q, use_memdelay);
|
||||
|
||||
/*
|
||||
* To avoid priority inversions we want to just take a slot if we are
|
||||
* issuing as root. If we're being killed off there's no point in
|
||||
* delaying things, we may have been killed by OOM so throttling may
|
||||
* make recovery take even longer, so just let the IO's through so the
|
||||
* task can go away.
|
||||
*/
|
||||
if (issue_as_root || fatal_signal_pending(current)) {
|
||||
atomic_inc(&rqw->inflight);
|
||||
return;
|
||||
}
|
||||
|
||||
if (iolatency_may_queue(iolat, &wait, first_block))
|
||||
return;
|
||||
|
||||
do {
|
||||
prepare_to_wait_exclusive(&rqw->wait, &wait,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
|
||||
if (iolatency_may_queue(iolat, &wait, first_block))
|
||||
break;
|
||||
first_block = false;
|
||||
|
||||
if (lock) {
|
||||
spin_unlock_irq(lock);
|
||||
io_schedule();
|
||||
spin_lock_irq(lock);
|
||||
} else {
|
||||
io_schedule();
|
||||
}
|
||||
} while (1);
|
||||
|
||||
finish_wait(&rqw->wait, &wait);
|
||||
}
|
||||
|
||||
/* Right-shift applied to the queue depth to get the scaling step. */
#define SCALE_DOWN_FACTOR 2
#define SCALE_UP_FACTOR 4

/*
 * Return the step used for one scale event: @qd >> SCALE_UP_FACTOR when
 * scaling up, @qd >> SCALE_DOWN_FACTOR when scaling down, and never less
 * than 1.
 */
static inline unsigned long scale_amount(unsigned long qd, bool up)
{
	unsigned long step;

	if (up)
		step = qd >> SCALE_UP_FACTOR;
	else
		step = qd >> SCALE_DOWN_FACTOR;

	return step ? step : 1UL;
}
|
||||
|
||||
/*
|
||||
* We scale the qd down faster than we scale up, so we need to use this helper
|
||||
* to adjust the scale_cookie accordingly so we don't prematurely get
|
||||
* scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
|
||||
*
|
||||
* Each group has their own local copy of the last scale cookie they saw, so if
|
||||
* the global scale cookie goes up or down they know which way they need to go
|
||||
* based on their last knowledge of it.
|
||||
*/
|
||||
static void scale_cookie_change(struct blk_iolatency *blkiolat,
|
||||
struct child_latency_info *lat_info,
|
||||
bool up)
|
||||
{
|
||||
unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
|
||||
unsigned long scale = scale_amount(qd, up);
|
||||
unsigned long old = atomic_read(&lat_info->scale_cookie);
|
||||
unsigned long max_scale = qd << 1;
|
||||
unsigned long diff = 0;
|
||||
|
||||
if (old < DEFAULT_SCALE_COOKIE)
|
||||
diff = DEFAULT_SCALE_COOKIE - old;
|
||||
|
||||
if (up) {
|
||||
if (scale + old > DEFAULT_SCALE_COOKIE)
|
||||
atomic_set(&lat_info->scale_cookie,
|
||||
DEFAULT_SCALE_COOKIE);
|
||||
else if (diff > qd)
|
||||
atomic_inc(&lat_info->scale_cookie);
|
||||
else
|
||||
atomic_add(scale, &lat_info->scale_cookie);
|
||||
} else {
|
||||
/*
|
||||
* We don't want to dig a hole so deep that it takes us hours to
|
||||
* dig out of it. Just enough that we don't throttle/unthrottle
|
||||
* with jagged workloads but can still unthrottle once pressure
|
||||
* has sufficiently dissipated.
|
||||
*/
|
||||
if (diff > qd) {
|
||||
if (diff < max_scale)
|
||||
atomic_dec(&lat_info->scale_cookie);
|
||||
} else {
|
||||
atomic_sub(scale, &lat_info->scale_cookie);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the
|
||||
* queue depth at a time so we don't get wild swings and hopefully dial in to
|
||||
* fairer distribution of the overall queue depth.
|
||||
*/
|
||||
static void scale_change(struct iolatency_grp *iolat, bool up)
|
||||
{
|
||||
unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
|
||||
unsigned long scale = scale_amount(qd, up);
|
||||
unsigned long old = iolat->rq_depth.max_depth;
|
||||
bool changed = false;
|
||||
|
||||
if (old > qd)
|
||||
old = qd;
|
||||
|
||||
if (up) {
|
||||
if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
|
||||
return;
|
||||
|
||||
if (old < qd) {
|
||||
changed = true;
|
||||
old += scale;
|
||||
old = min(old, qd);
|
||||
iolat->rq_depth.max_depth = old;
|
||||
wake_up_all(&iolat->rq_wait.wait);
|
||||
}
|
||||
} else if (old > 1) {
|
||||
old >>= 1;
|
||||
changed = true;
|
||||
iolat->rq_depth.max_depth = max(old, 1UL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Check our parent and see if the scale cookie has changed. */
|
||||
static void check_scale_change(struct iolatency_grp *iolat)
|
||||
{
|
||||
struct iolatency_grp *parent;
|
||||
struct child_latency_info *lat_info;
|
||||
unsigned int cur_cookie;
|
||||
unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
|
||||
u64 scale_lat;
|
||||
unsigned int old;
|
||||
int direction = 0;
|
||||
|
||||
if (lat_to_blkg(iolat)->parent == NULL)
|
||||
return;
|
||||
|
||||
parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
|
||||
if (!parent)
|
||||
return;
|
||||
|
||||
lat_info = &parent->child_lat;
|
||||
cur_cookie = atomic_read(&lat_info->scale_cookie);
|
||||
scale_lat = READ_ONCE(lat_info->scale_lat);
|
||||
|
||||
if (cur_cookie < our_cookie)
|
||||
direction = -1;
|
||||
else if (cur_cookie > our_cookie)
|
||||
direction = 1;
|
||||
else
|
||||
return;
|
||||
|
||||
old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
|
||||
|
||||
/* Somebody beat us to the punch, just bail. */
|
||||
if (old != our_cookie)
|
||||
return;
|
||||
|
||||
if (direction < 0 && iolat->min_lat_nsec) {
|
||||
u64 samples_thresh;
|
||||
|
||||
if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Sometimes high priority groups are their own worst enemy, so
|
||||
* instead of taking it out on some poor other group that did 5%
|
||||
* or less of the IO's for the last summation just skip this
|
||||
* scale down event.
|
||||
*/
|
||||
samples_thresh = lat_info->nr_samples * 5;
|
||||
samples_thresh = div64_u64(samples_thresh, 100);
|
||||
if (iolat->nr_samples <= samples_thresh)
|
||||
return;
|
||||
}
|
||||
|
||||
/* We're as low as we can go. */
|
||||
if (iolat->rq_depth.max_depth == 1 && direction < 0) {
|
||||
blkcg_use_delay(lat_to_blkg(iolat));
|
||||
return;
|
||||
}
|
||||
|
||||
/* We're back to the default cookie, unthrottle all the things. */
|
||||
if (cur_cookie == DEFAULT_SCALE_COOKIE) {
|
||||
blkcg_clear_delay(lat_to_blkg(iolat));
|
||||
iolat->rq_depth.max_depth = UINT_MAX;
|
||||
wake_up_all(&iolat->rq_wait.wait);
|
||||
return;
|
||||
}
|
||||
|
||||
scale_change(iolat, direction > 0);
|
||||
}
|
||||
|
||||
static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
|
||||
spinlock_t *lock)
|
||||
{
|
||||
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
|
||||
struct blkcg *blkcg;
|
||||
struct blkcg_gq *blkg;
|
||||
struct request_queue *q = rqos->q;
|
||||
bool issue_as_root = bio_issue_as_root_blkg(bio);
|
||||
|
||||
if (!blk_iolatency_enabled(blkiolat))
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
blkcg = bio_blkcg(bio);
|
||||
bio_associate_blkcg(bio, &blkcg->css);
|
||||
blkg = blkg_lookup(blkcg, q);
|
||||
if (unlikely(!blkg)) {
|
||||
if (!lock)
|
||||
spin_lock_irq(q->queue_lock);
|
||||
blkg = blkg_lookup_create(blkcg, q);
|
||||
if (IS_ERR(blkg))
|
||||
blkg = NULL;
|
||||
if (!lock)
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
if (!blkg)
|
||||
goto out;
|
||||
|
||||
bio_issue_init(&bio->bi_issue, bio_sectors(bio));
|
||||
bio_associate_blkg(bio, blkg);
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
while (blkg && blkg->parent) {
|
||||
struct iolatency_grp *iolat = blkg_to_lat(blkg);
|
||||
if (!iolat) {
|
||||
blkg = blkg->parent;
|
||||
continue;
|
||||
}
|
||||
|
||||
check_scale_change(iolat);
|
||||
__blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
|
||||
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
|
||||
blkg = blkg->parent;
|
||||
}
|
||||
if (!timer_pending(&blkiolat->timer))
|
||||
mod_timer(&blkiolat->timer, jiffies + HZ);
|
||||
}
|
||||
|
||||
static void iolatency_record_time(struct iolatency_grp *iolat,
|
||||
struct bio_issue *issue, u64 now,
|
||||
bool issue_as_root)
|
||||
{
|
||||
struct blk_rq_stat *rq_stat;
|
||||
u64 start = bio_issue_time(issue);
|
||||
u64 req_time;
|
||||
|
||||
/*
|
||||
* Have to do this so we are truncated to the correct time that our
|
||||
* issue is truncated to.
|
||||
*/
|
||||
now = __bio_issue_time(now);
|
||||
|
||||
if (now <= start)
|
||||
return;
|
||||
|
||||
req_time = now - start;
|
||||
|
||||
/*
|
||||
* We don't want to count issue_as_root bio's in the cgroups latency
|
||||
* statistics as it could skew the numbers downwards.
|
||||
*/
|
||||
if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
|
||||
u64 sub = iolat->min_lat_nsec;
|
||||
if (req_time < sub)
|
||||
blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
|
||||
return;
|
||||
}
|
||||
|
||||
rq_stat = get_cpu_ptr(iolat->stats);
|
||||
blk_rq_stat_add(rq_stat, req_time);
|
||||
put_cpu_ptr(rq_stat);
|
||||
}
|
||||
|
||||
#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
|
||||
#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5
|
||||
|
||||
static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
|
||||
{
|
||||
struct blkcg_gq *blkg = lat_to_blkg(iolat);
|
||||
struct iolatency_grp *parent;
|
||||
struct child_latency_info *lat_info;
|
||||
struct blk_rq_stat stat;
|
||||
unsigned long flags;
|
||||
int cpu, exp_idx;
|
||||
|
||||
blk_rq_stat_init(&stat);
|
||||
preempt_disable();
|
||||
for_each_online_cpu(cpu) {
|
||||
struct blk_rq_stat *s;
|
||||
s = per_cpu_ptr(iolat->stats, cpu);
|
||||
blk_rq_stat_sum(&stat, s);
|
||||
blk_rq_stat_init(s);
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
parent = blkg_to_lat(blkg->parent);
|
||||
if (!parent)
|
||||
return;
|
||||
|
||||
lat_info = &parent->child_lat;
|
||||
|
||||
/*
|
||||
* CALC_LOAD takes in a number stored in fixed point representation.
|
||||
* Because we are using this for IO time in ns, the values stored
|
||||
* are significantly larger than the FIXED_1 denominator (2048).
|
||||
* Therefore, rounding errors in the calculation are negligible and
|
||||
* can be ignored.
|
||||
*/
|
||||
exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
|
||||
div64_u64(iolat->cur_win_nsec,
|
||||
BLKIOLATENCY_EXP_BUCKET_SIZE));
|
||||
CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
|
||||
|
||||
/* Everything is ok and we don't need to adjust the scale. */
|
||||
if (stat.mean <= iolat->min_lat_nsec &&
|
||||
atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
|
||||
return;
|
||||
|
||||
/* Somebody beat us to the punch, just bail. */
|
||||
spin_lock_irqsave(&lat_info->lock, flags);
|
||||
lat_info->nr_samples -= iolat->nr_samples;
|
||||
lat_info->nr_samples += stat.nr_samples;
|
||||
iolat->nr_samples = stat.nr_samples;
|
||||
|
||||
if ((lat_info->last_scale_event >= now ||
|
||||
now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
|
||||
lat_info->scale_lat <= iolat->min_lat_nsec)
|
||||
goto out;
|
||||
|
||||
if (stat.mean <= iolat->min_lat_nsec &&
|
||||
stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
|
||||
if (lat_info->scale_grp == iolat) {
|
||||
lat_info->last_scale_event = now;
|
||||
scale_cookie_change(iolat->blkiolat, lat_info, true);
|
||||
}
|
||||
} else if (stat.mean > iolat->min_lat_nsec) {
|
||||
lat_info->last_scale_event = now;
|
||||
if (!lat_info->scale_grp ||
|
||||
lat_info->scale_lat > iolat->min_lat_nsec) {
|
||||
WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
|
||||
lat_info->scale_grp = iolat;
|
||||
}
|
||||
scale_cookie_change(iolat->blkiolat, lat_info, false);
|
||||
}
|
||||
out:
|
||||
spin_unlock_irqrestore(&lat_info->lock, flags);
|
||||
}
|
||||
|
||||
static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
|
||||
{
|
||||
struct blkcg_gq *blkg;
|
||||
struct rq_wait *rqw;
|
||||
struct iolatency_grp *iolat;
|
||||
u64 window_start;
|
||||
u64 now = ktime_to_ns(ktime_get());
|
||||
bool issue_as_root = bio_issue_as_root_blkg(bio);
|
||||
bool enabled = false;
|
||||
|
||||
blkg = bio->bi_blkg;
|
||||
if (!blkg)
|
||||
return;
|
||||
|
||||
iolat = blkg_to_lat(bio->bi_blkg);
|
||||
if (!iolat)
|
||||
return;
|
||||
|
||||
enabled = blk_iolatency_enabled(iolat->blkiolat);
|
||||
while (blkg && blkg->parent) {
|
||||
iolat = blkg_to_lat(blkg);
|
||||
if (!iolat) {
|
||||
blkg = blkg->parent;
|
||||
continue;
|
||||
}
|
||||
rqw = &iolat->rq_wait;
|
||||
|
||||
atomic_dec(&rqw->inflight);
|
||||
if (!enabled || iolat->min_lat_nsec == 0)
|
||||
goto next;
|
||||
iolatency_record_time(iolat, &bio->bi_issue, now,
|
||||
issue_as_root);
|
||||
window_start = atomic64_read(&iolat->window_start);
|
||||
if (now > window_start &&
|
||||
(now - window_start) >= iolat->cur_win_nsec) {
|
||||
if (atomic64_cmpxchg(&iolat->window_start,
|
||||
window_start, now) == window_start)
|
||||
iolatency_check_latencies(iolat, now);
|
||||
}
|
||||
next:
|
||||
wake_up(&rqw->wait);
|
||||
blkg = blkg->parent;
|
||||
}
|
||||
}
|
||||
|
||||
static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
|
||||
{
|
||||
struct blkcg_gq *blkg;
|
||||
|
||||
blkg = bio->bi_blkg;
|
||||
while (blkg && blkg->parent) {
|
||||
struct rq_wait *rqw;
|
||||
struct iolatency_grp *iolat;
|
||||
|
||||
iolat = blkg_to_lat(blkg);
|
||||
if (!iolat)
|
||||
goto next;
|
||||
|
||||
rqw = &iolat->rq_wait;
|
||||
atomic_dec(&rqw->inflight);
|
||||
wake_up(&rqw->wait);
|
||||
next:
|
||||
blkg = blkg->parent;
|
||||
}
|
||||
}
|
||||
|
||||
static void blkcg_iolatency_exit(struct rq_qos *rqos)
|
||||
{
|
||||
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
|
||||
|
||||
del_timer_sync(&blkiolat->timer);
|
||||
blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
|
||||
kfree(blkiolat);
|
||||
}
|
||||
|
||||
static struct rq_qos_ops blkcg_iolatency_ops = {
|
||||
.throttle = blkcg_iolatency_throttle,
|
||||
.cleanup = blkcg_iolatency_cleanup,
|
||||
.done_bio = blkcg_iolatency_done_bio,
|
||||
.exit = blkcg_iolatency_exit,
|
||||
};
|
||||
|
||||
static void blkiolatency_timer_fn(struct timer_list *t)
|
||||
{
|
||||
struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
|
||||
struct blkcg_gq *blkg;
|
||||
struct cgroup_subsys_state *pos_css;
|
||||
u64 now = ktime_to_ns(ktime_get());
|
||||
|
||||
rcu_read_lock();
|
||||
blkg_for_each_descendant_pre(blkg, pos_css,
|
||||
blkiolat->rqos.q->root_blkg) {
|
||||
struct iolatency_grp *iolat;
|
||||
struct child_latency_info *lat_info;
|
||||
unsigned long flags;
|
||||
u64 cookie;
|
||||
|
||||
/*
|
||||
* We could be exiting, don't access the pd unless we have a
|
||||
* ref on the blkg.
|
||||
*/
|
||||
if (!blkg_try_get(blkg))
|
||||
continue;
|
||||
|
||||
iolat = blkg_to_lat(blkg);
|
||||
if (!iolat)
|
||||
goto next;
|
||||
|
||||
lat_info = &iolat->child_lat;
|
||||
cookie = atomic_read(&lat_info->scale_cookie);
|
||||
|
||||
if (cookie >= DEFAULT_SCALE_COOKIE)
|
||||
goto next;
|
||||
|
||||
spin_lock_irqsave(&lat_info->lock, flags);
|
||||
if (lat_info->last_scale_event >= now)
|
||||
goto next_lock;
|
||||
|
||||
/*
|
||||
* We scaled down but don't have a scale_grp, scale up and carry
|
||||
* on.
|
||||
*/
|
||||
if (lat_info->scale_grp == NULL) {
|
||||
scale_cookie_change(iolat->blkiolat, lat_info, true);
|
||||
goto next_lock;
|
||||
}
|
||||
|
||||
/*
|
||||
* It's been 5 seconds since our last scale event, clear the
|
||||
* scale grp in case the group that needed the scale down isn't
|
||||
* doing any IO currently.
|
||||
*/
|
||||
if (now - lat_info->last_scale_event >=
|
||||
((u64)NSEC_PER_SEC * 5))
|
||||
lat_info->scale_grp = NULL;
|
||||
next_lock:
|
||||
spin_unlock_irqrestore(&lat_info->lock, flags);
|
||||
next:
|
||||
blkg_put(blkg);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
int blk_iolatency_init(struct request_queue *q)
|
||||
{
|
||||
struct blk_iolatency *blkiolat;
|
||||
struct rq_qos *rqos;
|
||||
int ret;
|
||||
|
||||
blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
|
||||
if (!blkiolat)
|
||||
return -ENOMEM;
|
||||
|
||||
rqos = &blkiolat->rqos;
|
||||
rqos->id = RQ_QOS_CGROUP;
|
||||
rqos->ops = &blkcg_iolatency_ops;
|
||||
rqos->q = q;
|
||||
|
||||
rq_qos_add(q, rqos);
|
||||
|
||||
ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
|
||||
if (ret) {
|
||||
rq_qos_del(q, rqos);
|
||||
kfree(blkiolat);
|
||||
return ret;
|
||||
}
|
||||
|
||||
timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Set the latency target of @blkg to @val ns (0 removes the target).
 *
 * The sampling window becomes 16 times the target, clamped to
 * [BLKIOLATENCY_MIN_WIN_SIZE, BLKIOLATENCY_MAX_WIN_SIZE].  The per-queue
 * "enabled" count is adjusted when the group gains or loses its target.
 */
static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
	struct iolatency_grp *iolat = blkg_to_lat(blkg);
	struct blk_iolatency *blkiolat = iolat->blkiolat;
	u64 prev = iolat->min_lat_nsec;
	u64 win;

	iolat->min_lat_nsec = val;

	win = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
	iolat->cur_win_nsec = win;
	win = min_t(u64, iolat->cur_win_nsec, BLKIOLATENCY_MAX_WIN_SIZE);
	iolat->cur_win_nsec = win;

	if (!prev && val)
		atomic_inc(&blkiolat->enabled);
	else if (prev && !val)
		atomic_dec(&blkiolat->enabled);
}
|
||||
|
||||
static void iolatency_clear_scaling(struct blkcg_gq *blkg)
|
||||
{
|
||||
if (blkg->parent) {
|
||||
struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
|
||||
struct child_latency_info *lat_info;
|
||||
if (!iolat)
|
||||
return;
|
||||
|
||||
lat_info = &iolat->child_lat;
|
||||
spin_lock(&lat_info->lock);
|
||||
atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
|
||||
lat_info->last_scale_event = 0;
|
||||
lat_info->scale_grp = NULL;
|
||||
lat_info->scale_lat = 0;
|
||||
spin_unlock(&lat_info->lock);
|
||||
}
|
||||
}
|
||||
|
||||
static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
|
||||
size_t nbytes, loff_t off)
|
||||
{
|
||||
struct blkcg *blkcg = css_to_blkcg(of_css(of));
|
||||
struct blkcg_gq *blkg;
|
||||
struct blk_iolatency *blkiolat;
|
||||
struct blkg_conf_ctx ctx;
|
||||
struct iolatency_grp *iolat;
|
||||
char *p, *tok;
|
||||
u64 lat_val = 0;
|
||||
u64 oldval;
|
||||
int ret;
|
||||
|
||||
ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
iolat = blkg_to_lat(ctx.blkg);
|
||||
blkiolat = iolat->blkiolat;
|
||||
p = ctx.body;
|
||||
|
||||
ret = -EINVAL;
|
||||
while ((tok = strsep(&p, " "))) {
|
||||
char key[16];
|
||||
char val[21]; /* 18446744073709551616 */
|
||||
|
||||
if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
|
||||
goto out;
|
||||
|
||||
if (!strcmp(key, "target")) {
|
||||
u64 v;
|
||||
|
||||
if (!strcmp(val, "max"))
|
||||
lat_val = 0;
|
||||
else if (sscanf(val, "%llu", &v) == 1)
|
||||
lat_val = v * NSEC_PER_USEC;
|
||||
else
|
||||
goto out;
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* Walk up the tree to see if our new val is lower than it should be. */
|
||||
blkg = ctx.blkg;
|
||||
oldval = iolat->min_lat_nsec;
|
||||
|
||||
iolatency_set_min_lat_nsec(blkg, lat_val);
|
||||
if (oldval != iolat->min_lat_nsec) {
|
||||
iolatency_clear_scaling(blkg);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
blkg_conf_finish(&ctx);
|
||||
return ret ?: nbytes;
|
||||
}
|
||||
|
||||
static u64 iolatency_prfill_limit(struct seq_file *sf,
|
||||
struct blkg_policy_data *pd, int off)
|
||||
{
|
||||
struct iolatency_grp *iolat = pd_to_lat(pd);
|
||||
const char *dname = blkg_dev_name(pd->blkg);
|
||||
|
||||
if (!dname || !iolat->min_lat_nsec)
|
||||
return 0;
|
||||
seq_printf(sf, "%s target=%llu\n",
|
||||
dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int iolatency_print_limit(struct seq_file *sf, void *v)
|
||||
{
|
||||
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
|
||||
iolatency_prfill_limit,
|
||||
&blkcg_policy_iolatency, seq_cft(sf)->private, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Format this group's statistics into @buf (at most @size bytes): current
 * depth ("max" while unthrottled), average latency in usec and window
 * length in msec.  Returns what scnprintf() returns.
 */
static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
				size_t size)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
	unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
	unsigned int depth = iolat->rq_depth.max_depth;

	if (depth != UINT_MAX)
		return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
				 depth, avg_lat, cur_win);

	return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
			 avg_lat, cur_win);
}
|
||||
|
||||
|
||||
/*
 * Allocate a zeroed iolatency_grp on @node together with its per-cpu
 * stats.  Returns the embedded policy data, or NULL if either allocation
 * fails.
 */
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
{
	struct iolatency_grp *iolat = kzalloc_node(sizeof(*iolat), gfp, node);

	if (!iolat)
		return NULL;

	iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
					  __alignof__(struct blk_rq_stat), gfp);
	if (iolat->stats)
		return &iolat->pd;

	kfree(iolat);
	return NULL;
}
|
||||
|
||||
static void iolatency_pd_init(struct blkg_policy_data *pd)
|
||||
{
|
||||
struct iolatency_grp *iolat = pd_to_lat(pd);
|
||||
struct blkcg_gq *blkg = lat_to_blkg(iolat);
|
||||
struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
|
||||
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
|
||||
u64 now = ktime_to_ns(ktime_get());
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct blk_rq_stat *stat;
|
||||
stat = per_cpu_ptr(iolat->stats, cpu);
|
||||
blk_rq_stat_init(stat);
|
||||
}
|
||||
|
||||
rq_wait_init(&iolat->rq_wait);
|
||||
spin_lock_init(&iolat->child_lat.lock);
|
||||
iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
|
||||
iolat->rq_depth.max_depth = UINT_MAX;
|
||||
iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
|
||||
iolat->blkiolat = blkiolat;
|
||||
iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
|
||||
atomic64_set(&iolat->window_start, now);
|
||||
|
||||
/*
|
||||
* We init things in list order, so the pd for the parent may not be
|
||||
* init'ed yet for whatever reason.
|
||||
*/
|
||||
if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
|
||||
struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
|
||||
atomic_set(&iolat->scale_cookie,
|
||||
atomic_read(&parent->child_lat.scale_cookie));
|
||||
} else {
|
||||
atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
|
||||
}
|
||||
|
||||
atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
|
||||
}
|
||||
|
||||
/*
 * Offline hook: drop the group's latency target and reset the scaling
 * state its parent keeps for its children.
 */
static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
	struct blkcg_gq *blkg = lat_to_blkg(pd_to_lat(pd));

	iolatency_set_min_lat_nsec(blkg, 0);
	iolatency_clear_scaling(blkg);
}
|
||||
|
||||
static void iolatency_pd_free(struct blkg_policy_data *pd)
|
||||
{
|
||||
struct iolatency_grp *iolat = pd_to_lat(pd);
|
||||
free_percpu(iolat->stats);
|
||||
kfree(iolat);
|
||||
}
|
||||
|
||||
static struct cftype iolatency_files[] = {
|
||||
{
|
||||
.name = "latency",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.seq_show = iolatency_print_limit,
|
||||
.write = iolatency_set_limit,
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
static struct blkcg_policy blkcg_policy_iolatency = {
|
||||
.dfl_cftypes = iolatency_files,
|
||||
.pd_alloc_fn = iolatency_pd_alloc,
|
||||
.pd_init_fn = iolatency_pd_init,
|
||||
.pd_offline_fn = iolatency_pd_offline,
|
||||
.pd_free_fn = iolatency_pd_free,
|
||||
.pd_stat_fn = iolatency_pd_stat,
|
||||
};
|
||||
|
||||
/* Module init: register the io.latency blkcg policy and return the result. */
static int __init iolatency_init(void)
{
	int ret = blkcg_policy_register(&blkcg_policy_iolatency);

	return ret;
}
|
||||
|
||||
/* Module exit: unregister the io.latency blkcg policy. */
static void __exit iolatency_exit(void)
{
	/*
	 * No "return <expr>;" here: this function returns void, and ISO C
	 * does not allow a return statement with an expression in it.
	 */
	blkcg_policy_unregister(&blkcg_policy_iolatency);
}
|
||||
|
||||
module_init(iolatency_init);
|
||||
module_exit(iolatency_exit);
|
@ -68,6 +68,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
|
||||
*/
|
||||
req_sects = min_t(sector_t, nr_sects,
|
||||
q->limits.max_discard_sectors);
|
||||
if (!req_sects)
|
||||
goto fail;
|
||||
if (req_sects > UINT_MAX >> 9)
|
||||
req_sects = UINT_MAX >> 9;
|
||||
|
||||
@ -105,6 +107,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
|
||||
|
||||
*biop = bio;
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
if (bio) {
|
||||
submit_bio_wait(bio);
|
||||
bio_put(bio);
|
||||
}
|
||||
*biop = NULL;
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
EXPORT_SYMBOL(__blkdev_issue_discard);
|
||||
|
||||
|
24
block/blk-mq-debugfs-zoned.c
Normal file
24
block/blk-mq-debugfs-zoned.c
Normal file
@ -0,0 +1,24 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include "blk-mq-debugfs.h"
|
||||
|
||||
int queue_zone_wlock_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct request_queue *q = data;
|
||||
unsigned int i;
|
||||
|
||||
if (!q->seq_zones_wlock)
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < q->nr_zones; i++)
|
||||
if (test_bit(i, q->seq_zones_wlock))
|
||||
seq_printf(m, "%u\n", i);
|
||||
|
||||
return 0;
|
||||
}
|
@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
|
||||
return count;
|
||||
}
|
||||
|
||||
static int queue_zone_wlock_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct request_queue *q = data;
|
||||
unsigned int i;
|
||||
|
||||
if (!q->seq_zones_wlock)
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < blk_queue_nr_zones(q); i++)
|
||||
if (test_bit(i, q->seq_zones_wlock))
|
||||
seq_printf(m, "%u\n", i);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
|
||||
{ "poll_stat", 0400, queue_poll_stat_show },
|
||||
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
|
||||
@ -637,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
|
||||
seq_printf(m, "%u\n", hctx->dispatch_busy);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
|
||||
__acquires(&ctx->lock)
|
||||
{
|
||||
@ -798,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
|
||||
{"queued", 0600, hctx_queued_show, hctx_queued_write},
|
||||
{"run", 0600, hctx_run_show, hctx_run_write},
|
||||
{"active", 0400, hctx_active_show},
|
||||
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
|
||||
{},
|
||||
};
|
||||
|
||||
|
@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * queue_zone_wlock_show() is only declared here when
 * CONFIG_BLK_DEBUG_FS_ZONED is defined; otherwise an inline stub that
 * prints nothing and returns 0 is provided in its place, so callers do
 * not need their own #ifdef.
 */
#ifdef CONFIG_BLK_DEBUG_FS_ZONED
|
||||
int queue_zone_wlock_show(void *data, struct seq_file *m);
|
||||
#else
|
||||
static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_BLK_DEBUG_FS_ZONED */
|
||||
|
||||
#endif
|
||||
|
@ -17,6 +17,8 @@
|
||||
#include <linux/pci.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include "blk-mq.h"
|
||||
|
||||
/**
|
||||
* blk_mq_pci_map_queues - provide a default queue mapping for PCI device
|
||||
* @set: tagset to provide the mapping for
|
||||
@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
|
||||
|
||||
fallback:
|
||||
WARN_ON_ONCE(set->nr_hw_queues > 1);
|
||||
for_each_possible_cpu(cpu)
|
||||
set->mq_map[cpu] = 0;
|
||||
blk_mq_clear_mq_map(set);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
|
||||
|
@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
|
||||
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
return;
|
||||
|
||||
if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
|
||||
struct request_queue *q = hctx->queue;
|
||||
|
||||
if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
atomic_inc(&q->shared_hctx_restart);
|
||||
} else
|
||||
set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
|
||||
}
|
||||
|
||||
static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
|
||||
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
return false;
|
||||
|
||||
if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
|
||||
struct request_queue *q = hctx->queue;
|
||||
|
||||
if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
atomic_dec(&q->shared_hctx_restart);
|
||||
} else
|
||||
return;
|
||||
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
|
||||
|
||||
return blk_mq_run_hw_queue(hctx, true);
|
||||
blk_mq_run_hw_queue(hctx, true);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -219,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
|
||||
}
|
||||
} else if (has_sched_dispatch) {
|
||||
blk_mq_do_dispatch_sched(hctx);
|
||||
} else if (q->mq_ops->get_budget) {
|
||||
/*
|
||||
* If we need to get budget before queuing request, we
|
||||
* dequeue request one by one from sw queue for avoiding
|
||||
* to mess up I/O merge when dispatch runs out of resource.
|
||||
*
|
||||
* TODO: get more budgets, and dequeue more requests in
|
||||
* one time.
|
||||
*/
|
||||
} else if (hctx->dispatch_busy) {
|
||||
/* dequeue request one by one from sw queue if queue is busy */
|
||||
blk_mq_do_dispatch_ctx(hctx);
|
||||
} else {
|
||||
blk_mq_flush_busy_ctxs(hctx, &rq_list);
|
||||
@ -339,7 +319,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
|
||||
return e->type->ops.mq.bio_merge(hctx, bio);
|
||||
}
|
||||
|
||||
if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
|
||||
if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
|
||||
!list_empty_careful(&ctx->rq_list)) {
|
||||
/* default per sw-queue merge */
|
||||
spin_lock(&ctx->lock);
|
||||
ret = blk_mq_attempt_merge(q, ctx, bio);
|
||||
@ -380,68 +361,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos:    loop cursor.
 * @skip:   the list element that will not be examined. Iteration starts at
 *          @skip->next.
 * @head:   head of the list to examine. This list must have at least one
 *          element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 *
 * The cursor is advanced inside the loop condition (there is no increment
 * clause). When the cursor's successor is @head itself, @head is stepped
 * over and iteration continues with the entry after it, which makes the
 * walk wrap around. The loop ends once the cursor arrives back at @skip,
 * so the loop body never runs for @skip.
 */
|
||||
#define list_for_each_entry_rcu_rr(pos, skip, head, member) \
|
||||
for ((pos) = (skip); \
|
||||
(pos = (pos)->member.next != (head) ? list_entry_rcu( \
|
||||
(pos)->member.next, typeof(*pos), member) : \
|
||||
list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
|
||||
(pos) != (skip); )
|
||||
|
||||
/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted.
 *
 * If the tag set of @hctx is not shared, this simply calls
 * blk_mq_sched_restart_hctx() on @hctx.
 *
 * If the tag set is shared, it returns early when the queue's
 * shared_hctx_restart counter reads 0. Otherwise, under rcu_read_lock(), it
 * looks for one hardware queue that uses the same tags as @hctx and for
 * which blk_mq_sched_restart_hctx() returns true:
 *  1. first among the other request queues on set->tag_list, starting with
 *     the queue after @hctx's own queue and wrapping around;
 *  2. then, if none was found, among the hardware queues of @hctx's own
 *     queue, starting at index hctx->queue_num + 1 and wrapping around.
 * The search stops at the first successful restart.
 */
|
||||
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
|
||||
{
|
||||
struct blk_mq_tags *const tags = hctx->tags;
|
||||
struct blk_mq_tag_set *const set = hctx->queue->tag_set;
|
||||
struct request_queue *const queue = hctx->queue, *q;
|
||||
struct blk_mq_hw_ctx *hctx2;
|
||||
unsigned int i, j;
|
||||
|
||||
if (set->flags & BLK_MQ_F_TAG_SHARED) {
|
||||
/*
|
||||
* If this is 0, then we know that no hardware queues
|
||||
* have RESTART marked. We're done.
|
||||
*/
|
||||
if (!atomic_read(&queue->shared_hctx_restart))
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
/* Step 1: the other queues of the tag set; @queue itself is skipped. */
list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
|
||||
tag_set_list) {
|
||||
queue_for_each_hw_ctx(q, hctx2, i)
|
||||
if (hctx2->tags == tags &&
|
||||
blk_mq_sched_restart_hctx(hctx2))
|
||||
goto done;
|
||||
}
|
||||
/* Step 2: this queue's own hardware queues, starting after @hctx. */
j = hctx->queue_num + 1;
|
||||
for (i = 0; i < queue->nr_hw_queues; i++, j++) {
|
||||
if (j == queue->nr_hw_queues)
|
||||
j = 0;
|
||||
hctx2 = queue->queue_hw_ctx[j];
|
||||
if (hctx2->tags == tags &&
|
||||
blk_mq_sched_restart_hctx(hctx2))
|
||||
break;
|
||||
}
|
||||
done:
|
||||
rcu_read_unlock();
|
||||
} else {
|
||||
blk_mq_sched_restart_hctx(hctx);
|
||||
}
|
||||
}
|
||||
|
||||
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
|
||||
bool run_queue, bool async)
|
||||
{
|
||||
@ -486,8 +405,19 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
|
||||
|
||||
if (e && e->type->ops.mq.insert_requests)
|
||||
e->type->ops.mq.insert_requests(hctx, list, false);
|
||||
else
|
||||
else {
|
||||
/*
|
||||
* try to issue requests directly if the hw queue isn't
|
||||
* busy in case of 'none' scheduler, and this way may save
|
||||
* us one extra enqueue & dequeue to sw queue.
|
||||
*/
|
||||
if (!hctx->dispatch_busy && !e && !run_queue_async) {
|
||||
blk_mq_try_issue_list_directly(hctx, list);
|
||||
if (list_empty(list))
|
||||
return;
|
||||
}
|
||||
blk_mq_insert_requests(hctx, ctx, list);
|
||||
}
|
||||
|
||||
blk_mq_run_hw_queue(hctx, run_queue_async);
|
||||
}
|
||||
|
@ -23,6 +23,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
|
||||
|
||||
/*
|
||||
* If a previously inactive queue goes active, bump the active user count.
|
||||
* This must be done before trying to allocate a driver tag, so that even if
|
||||
* the first allocation attempt fails, the other shared-tag users can reserve
|
||||
* budget for it.
|
||||
*/
|
||||
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
@ -399,8 +402,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
|
||||
if (tdepth <= tags->nr_reserved_tags)
|
||||
return -EINVAL;
|
||||
|
||||
tdepth -= tags->nr_reserved_tags;
|
||||
|
||||
/*
|
||||
* If we are allowed to grow beyond the original size, allocate
|
||||
* a new set of tags before freeing the old one.
|
||||
@ -420,7 +421,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
|
||||
if (tdepth > 16 * BLKDEV_MAX_RQ)
|
||||
return -EINVAL;
|
||||
|
||||
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
|
||||
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
|
||||
tags->nr_reserved_tags);
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
|
||||
@ -437,7 +439,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
|
||||
* Don't need (or can't) update reserved tags here, they
|
||||
* remain static and should never need resizing.
|
||||
*/
|
||||
sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
|
||||
sbitmap_queue_resize(&tags->bitmap_tags,
|
||||
tdepth - tags->nr_reserved_tags);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
173
block/blk-mq.c
173
block/blk-mq.c
@ -34,8 +34,8 @@
|
||||
#include "blk-mq-debugfs.h"
|
||||
#include "blk-mq-tag.h"
|
||||
#include "blk-stat.h"
|
||||
#include "blk-wbt.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
|
||||
static void blk_mq_poll_stats_start(struct request_queue *q);
|
||||
@ -285,7 +285,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
|
||||
rq->tag = -1;
|
||||
rq->internal_tag = tag;
|
||||
} else {
|
||||
if (blk_mq_tag_busy(data->hctx)) {
|
||||
if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
|
||||
rq_flags = RQF_MQ_INFLIGHT;
|
||||
atomic_inc(&data->hctx->nr_active);
|
||||
}
|
||||
@ -367,6 +367,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
|
||||
if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
|
||||
!(data->flags & BLK_MQ_REQ_RESERVED))
|
||||
e->type->ops.mq.limit_depth(op, data);
|
||||
} else {
|
||||
blk_mq_tag_busy(data->hctx);
|
||||
}
|
||||
|
||||
tag = blk_mq_get_tag(data);
|
||||
@ -504,7 +506,7 @@ void blk_mq_free_request(struct request *rq)
|
||||
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
|
||||
laptop_io_completion(q->backing_dev_info);
|
||||
|
||||
wbt_done(q->rq_wb, rq);
|
||||
rq_qos_done(q, rq);
|
||||
|
||||
if (blk_rq_rl(rq))
|
||||
blk_put_rl(blk_rq_rl(rq));
|
||||
@ -527,7 +529,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
|
||||
blk_account_io_done(rq, now);
|
||||
|
||||
if (rq->end_io) {
|
||||
wbt_done(rq->q->rq_wb, rq);
|
||||
rq_qos_done(rq->q, rq);
|
||||
rq->end_io(rq, error);
|
||||
} else {
|
||||
if (unlikely(blk_bidi_rq(rq)))
|
||||
@ -639,7 +641,7 @@ void blk_mq_start_request(struct request *rq)
|
||||
rq->throtl_size = blk_rq_sectors(rq);
|
||||
#endif
|
||||
rq->rq_flags |= RQF_STATS;
|
||||
wbt_issue(q->rq_wb, rq);
|
||||
rq_qos_issue(q, rq);
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
|
||||
@ -665,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq)
|
||||
blk_mq_put_driver_tag(rq);
|
||||
|
||||
trace_block_rq_requeue(q, rq);
|
||||
wbt_requeue(q->rq_wb, rq);
|
||||
rq_qos_requeue(q, rq);
|
||||
|
||||
if (blk_mq_request_started(rq)) {
|
||||
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
|
||||
@ -962,16 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued)
|
||||
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
|
||||
}
|
||||
|
||||
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
|
||||
bool wait)
|
||||
bool blk_mq_get_driver_tag(struct request *rq)
|
||||
{
|
||||
struct blk_mq_alloc_data data = {
|
||||
.q = rq->q,
|
||||
.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
|
||||
.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
|
||||
.flags = BLK_MQ_REQ_NOWAIT,
|
||||
};
|
||||
|
||||
might_sleep_if(wait);
|
||||
bool shared;
|
||||
|
||||
if (rq->tag != -1)
|
||||
goto done;
|
||||
@ -979,9 +979,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
|
||||
if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
|
||||
data.flags |= BLK_MQ_REQ_RESERVED;
|
||||
|
||||
shared = blk_mq_tag_busy(data.hctx);
|
||||
rq->tag = blk_mq_get_tag(&data);
|
||||
if (rq->tag >= 0) {
|
||||
if (blk_mq_tag_busy(data.hctx)) {
|
||||
if (shared) {
|
||||
rq->rq_flags |= RQF_MQ_INFLIGHT;
|
||||
atomic_inc(&data.hctx->nr_active);
|
||||
}
|
||||
@ -989,8 +990,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
|
||||
}
|
||||
|
||||
done:
|
||||
if (hctx)
|
||||
*hctx = data.hctx;
|
||||
return rq->tag != -1;
|
||||
}
|
||||
|
||||
@ -1001,7 +1000,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
|
||||
|
||||
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
|
||||
|
||||
spin_lock(&hctx->dispatch_wait_lock);
|
||||
list_del_init(&wait->entry);
|
||||
spin_unlock(&hctx->dispatch_wait_lock);
|
||||
|
||||
blk_mq_run_hw_queue(hctx, true);
|
||||
return 1;
|
||||
}
|
||||
@ -1012,17 +1014,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
|
||||
* restart. For both cases, take care to check the condition again after
|
||||
* marking us as waiting.
|
||||
*/
|
||||
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
|
||||
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
|
||||
struct request *rq)
|
||||
{
|
||||
struct blk_mq_hw_ctx *this_hctx = *hctx;
|
||||
struct sbq_wait_state *ws;
|
||||
struct wait_queue_head *wq;
|
||||
wait_queue_entry_t *wait;
|
||||
bool ret;
|
||||
|
||||
if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
|
||||
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
|
||||
set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
|
||||
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
|
||||
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
|
||||
|
||||
/*
|
||||
* It's possible that a tag was freed in the window between the
|
||||
@ -1032,30 +1033,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
|
||||
* Don't clear RESTART here, someone else could have set it.
|
||||
* At most this will cost an extra queue run.
|
||||
*/
|
||||
return blk_mq_get_driver_tag(rq, hctx, false);
|
||||
return blk_mq_get_driver_tag(rq);
|
||||
}
|
||||
|
||||
wait = &this_hctx->dispatch_wait;
|
||||
wait = &hctx->dispatch_wait;
|
||||
if (!list_empty_careful(&wait->entry))
|
||||
return false;
|
||||
|
||||
spin_lock(&this_hctx->lock);
|
||||
wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
|
||||
|
||||
spin_lock_irq(&wq->lock);
|
||||
spin_lock(&hctx->dispatch_wait_lock);
|
||||
if (!list_empty(&wait->entry)) {
|
||||
spin_unlock(&this_hctx->lock);
|
||||
spin_unlock(&hctx->dispatch_wait_lock);
|
||||
spin_unlock_irq(&wq->lock);
|
||||
return false;
|
||||
}
|
||||
|
||||
ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
|
||||
add_wait_queue(&ws->wait, wait);
|
||||
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
|
||||
__add_wait_queue(wq, wait);
|
||||
|
||||
/*
|
||||
* It's possible that a tag was freed in the window between the
|
||||
* allocation failure and adding the hardware queue to the wait
|
||||
* queue.
|
||||
*/
|
||||
ret = blk_mq_get_driver_tag(rq, hctx, false);
|
||||
ret = blk_mq_get_driver_tag(rq);
|
||||
if (!ret) {
|
||||
spin_unlock(&this_hctx->lock);
|
||||
spin_unlock(&hctx->dispatch_wait_lock);
|
||||
spin_unlock_irq(&wq->lock);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1063,14 +1069,42 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
|
||||
* We got a tag, remove ourselves from the wait queue to ensure
|
||||
* someone else gets the wakeup.
|
||||
*/
|
||||
spin_lock_irq(&ws->wait.lock);
|
||||
list_del_init(&wait->entry);
|
||||
spin_unlock_irq(&ws->wait.lock);
|
||||
spin_unlock(&this_hctx->lock);
|
||||
spin_unlock(&hctx->dispatch_wait_lock);
|
||||
spin_unlock_irq(&wq->lock);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
|
||||
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
|
||||
/*
|
||||
* Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
|
||||
* - EWMA is one simple way to compute running average value
|
||||
* - weight(7/8 and 1/8) is applied so that it can decrease exponentially
|
||||
* - take 4 as factor for avoiding to get too small(0) result, and this
|
||||
* factor doesn't matter because EWMA decreases exponentially
|
||||
*/
|
||||
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
|
||||
{
|
||||
unsigned int ewma;
|
||||
|
||||
if (hctx->queue->elevator)
|
||||
return;
|
||||
|
||||
ewma = hctx->dispatch_busy;
|
||||
|
||||
if (!ewma && !busy)
|
||||
return;
|
||||
|
||||
ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
|
||||
if (busy)
|
||||
ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
|
||||
ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
|
||||
|
||||
hctx->dispatch_busy = ewma;
|
||||
}
|
||||
|
||||
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
|
||||
|
||||
/*
|
||||
@ -1103,7 +1137,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
|
||||
if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
|
||||
break;
|
||||
|
||||
if (!blk_mq_get_driver_tag(rq, NULL, false)) {
|
||||
if (!blk_mq_get_driver_tag(rq)) {
|
||||
/*
|
||||
* The initial allocation attempt failed, so we need to
|
||||
* rerun the hardware queue when a tag is freed. The
|
||||
@ -1111,7 +1145,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
|
||||
* before we add this entry back on the dispatch list,
|
||||
* we'll re-run it below.
|
||||
*/
|
||||
if (!blk_mq_mark_tag_wait(&hctx, rq)) {
|
||||
if (!blk_mq_mark_tag_wait(hctx, rq)) {
|
||||
blk_mq_put_dispatch_budget(hctx);
|
||||
/*
|
||||
* For non-shared tags, the RESTART check
|
||||
@ -1135,7 +1169,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
|
||||
bd.last = true;
|
||||
else {
|
||||
nxt = list_first_entry(list, struct request, queuelist);
|
||||
bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
|
||||
bd.last = !blk_mq_get_driver_tag(nxt);
|
||||
}
|
||||
|
||||
ret = q->mq_ops->queue_rq(hctx, &bd);
|
||||
@ -1207,8 +1241,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
|
||||
else if (needs_restart && (ret == BLK_STS_RESOURCE))
|
||||
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
|
||||
|
||||
blk_mq_update_dispatch_busy(hctx, true);
|
||||
return false;
|
||||
}
|
||||
} else
|
||||
blk_mq_update_dispatch_busy(hctx, false);
|
||||
|
||||
/*
|
||||
* If the host/device is unable to accept more work, inform the
|
||||
@ -1542,19 +1578,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
|
||||
struct list_head *list)
|
||||
|
||||
{
|
||||
struct request *rq;
|
||||
|
||||
/*
|
||||
* preemption doesn't flush plug list, so it's possible ctx->cpu is
|
||||
* offline now
|
||||
*/
|
||||
spin_lock(&ctx->lock);
|
||||
while (!list_empty(list)) {
|
||||
struct request *rq;
|
||||
|
||||
rq = list_first_entry(list, struct request, queuelist);
|
||||
list_for_each_entry(rq, list, queuelist) {
|
||||
BUG_ON(rq->mq_ctx != ctx);
|
||||
list_del_init(&rq->queuelist);
|
||||
__blk_mq_insert_req_list(hctx, rq, false);
|
||||
trace_block_rq_insert(hctx->queue, rq);
|
||||
}
|
||||
|
||||
spin_lock(&ctx->lock);
|
||||
list_splice_tail_init(list, &ctx->rq_list);
|
||||
blk_mq_hctx_mark_pending(hctx, ctx);
|
||||
spin_unlock(&ctx->lock);
|
||||
}
|
||||
@ -1657,13 +1693,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
|
||||
ret = q->mq_ops->queue_rq(hctx, &bd);
|
||||
switch (ret) {
|
||||
case BLK_STS_OK:
|
||||
blk_mq_update_dispatch_busy(hctx, false);
|
||||
*cookie = new_cookie;
|
||||
break;
|
||||
case BLK_STS_RESOURCE:
|
||||
case BLK_STS_DEV_RESOURCE:
|
||||
blk_mq_update_dispatch_busy(hctx, true);
|
||||
__blk_mq_requeue_request(rq);
|
||||
break;
|
||||
default:
|
||||
blk_mq_update_dispatch_busy(hctx, false);
|
||||
*cookie = BLK_QC_T_NONE;
|
||||
break;
|
||||
}
|
||||
@ -1698,7 +1737,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
|
||||
if (!blk_mq_get_dispatch_budget(hctx))
|
||||
goto insert;
|
||||
|
||||
if (!blk_mq_get_driver_tag(rq, NULL, false)) {
|
||||
if (!blk_mq_get_driver_tag(rq)) {
|
||||
blk_mq_put_dispatch_budget(hctx);
|
||||
goto insert;
|
||||
}
|
||||
@ -1746,6 +1785,27 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
|
||||
return ret;
|
||||
}
|
||||
|
||||
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
|
||||
struct list_head *list)
|
||||
{
|
||||
while (!list_empty(list)) {
|
||||
blk_status_t ret;
|
||||
struct request *rq = list_first_entry(list, struct request,
|
||||
queuelist);
|
||||
|
||||
list_del_init(&rq->queuelist);
|
||||
ret = blk_mq_request_issue_directly(rq);
|
||||
if (ret != BLK_STS_OK) {
|
||||
if (ret == BLK_STS_RESOURCE ||
|
||||
ret == BLK_STS_DEV_RESOURCE) {
|
||||
list_add(&rq->queuelist, list);
|
||||
break;
|
||||
}
|
||||
blk_mq_end_request(rq, ret);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
const int is_sync = op_is_sync(bio->bi_opf);
|
||||
@ -1756,7 +1816,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
|
||||
struct blk_plug *plug;
|
||||
struct request *same_queue_rq = NULL;
|
||||
blk_qc_t cookie;
|
||||
unsigned int wb_acct;
|
||||
|
||||
blk_queue_bounce(q, &bio);
|
||||
|
||||
@ -1772,19 +1831,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
|
||||
if (blk_mq_sched_bio_merge(q, bio))
|
||||
return BLK_QC_T_NONE;
|
||||
|
||||
wb_acct = wbt_wait(q->rq_wb, bio, NULL);
|
||||
rq_qos_throttle(q, bio, NULL);
|
||||
|
||||
trace_block_getrq(q, bio, bio->bi_opf);
|
||||
|
||||
rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
|
||||
if (unlikely(!rq)) {
|
||||
__wbt_done(q->rq_wb, wb_acct);
|
||||
rq_qos_cleanup(q, bio);
|
||||
if (bio->bi_opf & REQ_NOWAIT)
|
||||
bio_wouldblock_error(bio);
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
|
||||
wbt_track(rq, wb_acct);
|
||||
rq_qos_track(q, rq, bio);
|
||||
|
||||
cookie = request_to_qc_t(data.hctx, rq);
|
||||
|
||||
@ -1847,7 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
|
||||
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
|
||||
&cookie);
|
||||
}
|
||||
} else if (q->nr_hw_queues > 1 && is_sync) {
|
||||
} else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
|
||||
!data.hctx->dispatch_busy)) {
|
||||
blk_mq_put_ctx(data.ctx);
|
||||
blk_mq_bio_to_request(rq, bio);
|
||||
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
|
||||
@ -2146,6 +2206,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
|
||||
|
||||
hctx->nr_ctx = 0;
|
||||
|
||||
spin_lock_init(&hctx->dispatch_wait_lock);
|
||||
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
|
||||
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
|
||||
|
||||
@ -2331,16 +2392,11 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
|
||||
int i;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (shared) {
|
||||
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
atomic_inc(&q->shared_hctx_restart);
|
||||
if (shared)
|
||||
hctx->flags |= BLK_MQ_F_TAG_SHARED;
|
||||
} else {
|
||||
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
atomic_dec(&q->shared_hctx_restart);
|
||||
else
|
||||
hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
|
||||
@ -2370,7 +2426,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
|
||||
blk_mq_update_tag_set_depth(set, false);
|
||||
}
|
||||
mutex_unlock(&set->tag_list_lock);
|
||||
synchronize_rcu();
|
||||
INIT_LIST_HEAD(&q->tag_set_list);
|
||||
}
|
||||
|
||||
@ -2685,7 +2740,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
|
||||
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
|
||||
{
|
||||
if (set->ops->map_queues) {
|
||||
int cpu;
|
||||
/*
|
||||
* transport .map_queues is usually done in the following
|
||||
* way:
|
||||
@ -2700,8 +2754,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
|
||||
* killing stale mapping since one CPU may not be mapped
|
||||
* to any hw queue.
|
||||
*/
|
||||
for_each_possible_cpu(cpu)
|
||||
set->mq_map[cpu] = 0;
|
||||
blk_mq_clear_mq_map(set);
|
||||
|
||||
return set->ops->map_queues(set);
|
||||
} else
|
||||
@ -2711,7 +2764,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
|
||||
/*
|
||||
* Alloc a tag set to be associated with one or more request queues.
|
||||
* May fail with EINVAL for various error conditions. May adjust the
|
||||
* requested depth down, if if it too large. In that case, the set
|
||||
* requested depth down, if it's too large. In that case, the set
|
||||
* value will be stored in set->queue_depth.
|
||||
*/
|
||||
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
|
||||
|
@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
|
||||
void blk_mq_wake_waiters(struct request_queue *q);
|
||||
bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
|
||||
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
|
||||
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
|
||||
bool wait);
|
||||
bool blk_mq_get_driver_tag(struct request *rq);
|
||||
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
|
||||
struct blk_mq_ctx *start);
|
||||
|
||||
@ -65,6 +64,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
|
||||
|
||||
/* Used by blk_insert_cloned_request() to issue request directly */
|
||||
blk_status_t blk_mq_request_issue_directly(struct request *rq);
|
||||
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
|
||||
struct list_head *list);
|
||||
|
||||
/*
|
||||
* CPU -> queue mappings
|
||||
@ -203,4 +204,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
|
||||
__blk_mq_put_driver_tag(hctx, rq);
|
||||
}
|
||||
|
||||
static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
set->mq_map[cpu] = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
194
block/blk-rq-qos.c
Normal file
194
block/blk-rq-qos.c
Normal file
@ -0,0 +1,194 @@
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
/*
|
||||
* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
|
||||
* false if 'v' + 1 would be bigger than 'below'.
|
||||
*/
|
||||
static bool atomic_inc_below(atomic_t *v, unsigned int below)
|
||||
{
|
||||
unsigned int cur = atomic_read(v);
|
||||
|
||||
for (;;) {
|
||||
unsigned int old;
|
||||
|
||||
if (cur >= below)
|
||||
return false;
|
||||
old = atomic_cmpxchg(v, cur, cur + 1);
|
||||
if (old == cur)
|
||||
break;
|
||||
cur = old;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
|
||||
{
|
||||
return atomic_inc_below(&rq_wait->inflight, limit);
|
||||
}
|
||||
|
||||
void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
struct rq_qos *rqos;
|
||||
|
||||
for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
||||
if (rqos->ops->cleanup)
|
||||
rqos->ops->cleanup(rqos, bio);
|
||||
}
|
||||
}
|
||||
|
||||
void rq_qos_done(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
struct rq_qos *rqos;
|
||||
|
||||
for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
||||
if (rqos->ops->done)
|
||||
rqos->ops->done(rqos, rq);
|
||||
}
|
||||
}
|
||||
|
||||
void rq_qos_issue(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
struct rq_qos *rqos;
|
||||
|
||||
for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
||||
if (rqos->ops->issue)
|
||||
rqos->ops->issue(rqos, rq);
|
||||
}
|
||||
}
|
||||
|
||||
void rq_qos_requeue(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
struct rq_qos *rqos;
|
||||
|
||||
for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
||||
if (rqos->ops->requeue)
|
||||
rqos->ops->requeue(rqos, rq);
|
||||
}
|
||||
}
|
||||
|
||||
void rq_qos_throttle(struct request_queue *q, struct bio *bio,
|
||||
spinlock_t *lock)
|
||||
{
|
||||
struct rq_qos *rqos;
|
||||
|
||||
for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
||||
if (rqos->ops->throttle)
|
||||
rqos->ops->throttle(rqos, bio, lock);
|
||||
}
|
||||
}
|
||||
|
||||
void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio)
|
||||
{
|
||||
struct rq_qos *rqos;
|
||||
|
||||
for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
||||
if (rqos->ops->track)
|
||||
rqos->ops->track(rqos, rq, bio);
|
||||
}
|
||||
}
|
||||
|
||||
void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
struct rq_qos *rqos;
|
||||
|
||||
for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
||||
if (rqos->ops->done_bio)
|
||||
rqos->ops->done_bio(rqos, bio);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true, if we can't increase the depth further by scaling
|
||||
*/
|
||||
bool rq_depth_calc_max_depth(struct rq_depth *rqd)
|
||||
{
|
||||
unsigned int depth;
|
||||
bool ret = false;
|
||||
|
||||
/*
|
||||
* For QD=1 devices, this is a special case. It's important for those
|
||||
* to have one request ready when one completes, so force a depth of
|
||||
* 2 for those devices. On the backend, it'll be a depth of 1 anyway,
|
||||
* since the device can't have more than that in flight. If we're
|
||||
* scaling down, then keep a setting of 1/1/1.
|
||||
*/
|
||||
if (rqd->queue_depth == 1) {
|
||||
if (rqd->scale_step > 0)
|
||||
rqd->max_depth = 1;
|
||||
else {
|
||||
rqd->max_depth = 2;
|
||||
ret = true;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* scale_step == 0 is our default state. If we have suffered
|
||||
* latency spikes, step will be > 0, and we shrink the
|
||||
* allowed write depths. If step is < 0, we're only doing
|
||||
* writes, and we allow a temporarily higher depth to
|
||||
* increase performance.
|
||||
*/
|
||||
depth = min_t(unsigned int, rqd->default_depth,
|
||||
rqd->queue_depth);
|
||||
if (rqd->scale_step > 0)
|
||||
depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
|
||||
else if (rqd->scale_step < 0) {
|
||||
unsigned int maxd = 3 * rqd->queue_depth / 4;
|
||||
|
||||
depth = 1 + ((depth - 1) << -rqd->scale_step);
|
||||
if (depth > maxd) {
|
||||
depth = maxd;
|
||||
ret = true;
|
||||
}
|
||||
}
|
||||
|
||||
rqd->max_depth = depth;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void rq_depth_scale_up(struct rq_depth *rqd)
|
||||
{
|
||||
/*
|
||||
* Hit max in previous round, stop here
|
||||
*/
|
||||
if (rqd->scaled_max)
|
||||
return;
|
||||
|
||||
rqd->scale_step--;
|
||||
|
||||
rqd->scaled_max = rq_depth_calc_max_depth(rqd);
|
||||
}
|
||||
|
||||
/*
|
||||
* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
|
||||
* had a latency violation.
|
||||
*/
|
||||
void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
|
||||
{
|
||||
/*
|
||||
* Stop scaling down when we've hit the limit. This also prevents
|
||||
* ->scale_step from going to crazy values, if the device can't
|
||||
* keep up.
|
||||
*/
|
||||
if (rqd->max_depth == 1)
|
||||
return;
|
||||
|
||||
if (rqd->scale_step < 0 && hard_throttle)
|
||||
rqd->scale_step = 0;
|
||||
else
|
||||
rqd->scale_step++;
|
||||
|
||||
rqd->scaled_max = false;
|
||||
rq_depth_calc_max_depth(rqd);
|
||||
}
|
||||
|
||||
void rq_qos_exit(struct request_queue *q)
|
||||
{
|
||||
while (q->rq_qos) {
|
||||
struct rq_qos *rqos = q->rq_qos;
|
||||
q->rq_qos = rqos->next;
|
||||
rqos->ops->exit(rqos);
|
||||
}
|
||||
}
|
109
block/blk-rq-qos.h
Normal file
109
block/blk-rq-qos.h
Normal file
@ -0,0 +1,109 @@
|
||||
#ifndef RQ_QOS_H
|
||||
#define RQ_QOS_H
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk_types.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
/* Identifies which policy an rq_qos list entry belongs to. */
enum rq_qos_id {
	RQ_QOS_WBT,	/* looked up by wbt_rq_qos() */
	RQ_QOS_CGROUP,	/* looked up by blkcg_rq_qos() */
};
|
||||
|
||||
struct rq_wait {
|
||||
wait_queue_head_t wait;
|
||||
atomic_t inflight;
|
||||
};
|
||||
|
||||
struct rq_qos {
|
||||
struct rq_qos_ops *ops;
|
||||
struct request_queue *q;
|
||||
enum rq_qos_id id;
|
||||
struct rq_qos *next;
|
||||
};
|
||||
|
||||
struct rq_qos_ops {
|
||||
void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *);
|
||||
void (*track)(struct rq_qos *, struct request *, struct bio *);
|
||||
void (*issue)(struct rq_qos *, struct request *);
|
||||
void (*requeue)(struct rq_qos *, struct request *);
|
||||
void (*done)(struct rq_qos *, struct request *);
|
||||
void (*done_bio)(struct rq_qos *, struct bio *);
|
||||
void (*cleanup)(struct rq_qos *, struct bio *);
|
||||
void (*exit)(struct rq_qos *);
|
||||
};
|
||||
|
||||
struct rq_depth {
|
||||
unsigned int max_depth;
|
||||
|
||||
int scale_step;
|
||||
bool scaled_max;
|
||||
|
||||
unsigned int queue_depth;
|
||||
unsigned int default_depth;
|
||||
};
|
||||
|
||||
static inline struct rq_qos *rq_qos_id(struct request_queue *q,
|
||||
enum rq_qos_id id)
|
||||
{
|
||||
struct rq_qos *rqos;
|
||||
for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
||||
if (rqos->id == id)
|
||||
break;
|
||||
}
|
||||
return rqos;
|
||||
}
|
||||
|
||||
static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
|
||||
{
|
||||
return rq_qos_id(q, RQ_QOS_WBT);
|
||||
}
|
||||
|
||||
static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
|
||||
{
|
||||
return rq_qos_id(q, RQ_QOS_CGROUP);
|
||||
}
|
||||
|
||||
static inline void rq_wait_init(struct rq_wait *rq_wait)
|
||||
{
|
||||
atomic_set(&rq_wait->inflight, 0);
|
||||
init_waitqueue_head(&rq_wait->wait);
|
||||
}
|
||||
|
||||
static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
|
||||
{
|
||||
rqos->next = q->rq_qos;
|
||||
q->rq_qos = rqos;
|
||||
}
|
||||
|
||||
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
|
||||
{
|
||||
struct rq_qos *cur, *prev = NULL;
|
||||
for (cur = q->rq_qos; cur; cur = cur->next) {
|
||||
if (cur == rqos) {
|
||||
if (prev)
|
||||
prev->next = rqos->next;
|
||||
else
|
||||
q->rq_qos = cur;
|
||||
break;
|
||||
}
|
||||
prev = cur;
|
||||
}
|
||||
}
|
||||
|
||||
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
|
||||
void rq_depth_scale_up(struct rq_depth *rqd);
|
||||
void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
|
||||
bool rq_depth_calc_max_depth(struct rq_depth *rqd);
|
||||
|
||||
void rq_qos_cleanup(struct request_queue *, struct bio *);
|
||||
void rq_qos_done(struct request_queue *, struct request *);
|
||||
void rq_qos_issue(struct request_queue *, struct request *);
|
||||
void rq_qos_requeue(struct request_queue *, struct request *);
|
||||
void rq_qos_done_bio(struct request_queue *q, struct bio *bio);
|
||||
void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *);
|
||||
void rq_qos_track(struct request_queue *q, struct request *, struct bio *);
|
||||
void rq_qos_exit(struct request_queue *);
|
||||
#endif
|
@ -128,7 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
|
||||
|
||||
/* Inherit limits from component devices */
|
||||
lim->max_segments = USHRT_MAX;
|
||||
lim->max_discard_segments = 1;
|
||||
lim->max_discard_segments = USHRT_MAX;
|
||||
lim->max_hw_sectors = UINT_MAX;
|
||||
lim->max_segment_size = UINT_MAX;
|
||||
lim->max_sectors = UINT_MAX;
|
||||
@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
|
||||
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
|
||||
{
|
||||
q->queue_depth = depth;
|
||||
wbt_set_queue_depth(q->rq_wb, depth);
|
||||
wbt_set_queue_depth(q, depth);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_set_queue_depth);
|
||||
|
||||
@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
|
||||
queue_flag_clear(QUEUE_FLAG_FUA, q);
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
|
||||
wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
|
||||
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
|
||||
|
||||
|
@ -17,7 +17,7 @@ struct blk_queue_stats {
|
||||
bool enable_accounting;
|
||||
};
|
||||
|
||||
static void blk_stat_init(struct blk_rq_stat *stat)
|
||||
void blk_rq_stat_init(struct blk_rq_stat *stat)
|
||||
{
|
||||
stat->min = -1ULL;
|
||||
stat->max = stat->nr_samples = stat->mean = 0;
|
||||
@ -25,7 +25,7 @@ static void blk_stat_init(struct blk_rq_stat *stat)
|
||||
}
|
||||
|
||||
/* src is a per-cpu stat, mean isn't initialized */
|
||||
static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
|
||||
void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
|
||||
{
|
||||
if (!src->nr_samples)
|
||||
return;
|
||||
@ -39,7 +39,7 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
|
||||
dst->nr_samples += src->nr_samples;
|
||||
}
|
||||
|
||||
static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
|
||||
void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
|
||||
{
|
||||
stat->min = min(stat->min, value);
|
||||
stat->max = max(stat->max, value);
|
||||
@ -69,7 +69,7 @@ void blk_stat_add(struct request *rq, u64 now)
|
||||
continue;
|
||||
|
||||
stat = &get_cpu_ptr(cb->cpu_stat)[bucket];
|
||||
__blk_stat_add(stat, value);
|
||||
blk_rq_stat_add(stat, value);
|
||||
put_cpu_ptr(cb->cpu_stat);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
@ -82,15 +82,15 @@ static void blk_stat_timer_fn(struct timer_list *t)
|
||||
int cpu;
|
||||
|
||||
for (bucket = 0; bucket < cb->buckets; bucket++)
|
||||
blk_stat_init(&cb->stat[bucket]);
|
||||
blk_rq_stat_init(&cb->stat[bucket]);
|
||||
|
||||
for_each_online_cpu(cpu) {
|
||||
struct blk_rq_stat *cpu_stat;
|
||||
|
||||
cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
|
||||
for (bucket = 0; bucket < cb->buckets; bucket++) {
|
||||
blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
|
||||
blk_stat_init(&cpu_stat[bucket]);
|
||||
blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
|
||||
blk_rq_stat_init(&cpu_stat[bucket]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -143,7 +143,7 @@ void blk_stat_add_callback(struct request_queue *q,
|
||||
|
||||
cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
|
||||
for (bucket = 0; bucket < cb->buckets; bucket++)
|
||||
blk_stat_init(&cpu_stat[bucket]);
|
||||
blk_rq_stat_init(&cpu_stat[bucket]);
|
||||
}
|
||||
|
||||
spin_lock(&q->stats->lock);
|
||||
|
@ -159,4 +159,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
|
||||
mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
|
||||
}
|
||||
|
||||
void blk_rq_stat_add(struct blk_rq_stat *, u64);
|
||||
void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
|
||||
void blk_rq_stat_init(struct blk_rq_stat *);
|
||||
|
||||
#endif
|
||||
|
@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
|
||||
|
||||
static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
|
||||
{
|
||||
if (!q->rq_wb)
|
||||
if (!wbt_rq_qos(q))
|
||||
return -EINVAL;
|
||||
|
||||
return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
|
||||
return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
|
||||
}
|
||||
|
||||
static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
|
||||
size_t count)
|
||||
{
|
||||
struct rq_wb *rwb;
|
||||
struct rq_qos *rqos;
|
||||
ssize_t ret;
|
||||
s64 val;
|
||||
|
||||
@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
|
||||
if (val < -1)
|
||||
return -EINVAL;
|
||||
|
||||
rwb = q->rq_wb;
|
||||
if (!rwb) {
|
||||
rqos = wbt_rq_qos(q);
|
||||
if (!rqos) {
|
||||
ret = wbt_init(q);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
rwb = q->rq_wb;
|
||||
if (val == -1)
|
||||
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
|
||||
val = wbt_default_latency_nsec(q);
|
||||
else if (val >= 0)
|
||||
rwb->min_lat_nsec = val * 1000ULL;
|
||||
val *= 1000ULL;
|
||||
|
||||
if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
|
||||
rwb->enable_state = WBT_STATE_ON_MANUAL;
|
||||
wbt_set_min_lat(q, val);
|
||||
|
||||
wbt_update_limits(rwb);
|
||||
wbt_update_limits(q);
|
||||
return count;
|
||||
}
|
||||
|
||||
@ -804,6 +802,21 @@ static void __blk_release_queue(struct work_struct *work)
|
||||
blk_stat_remove_callback(q, q->poll_cb);
|
||||
blk_stat_free_callback(q->poll_cb);
|
||||
|
||||
if (!blk_queue_dead(q)) {
|
||||
/*
|
||||
* Last reference was dropped without having called
|
||||
* blk_cleanup_queue().
|
||||
*/
|
||||
WARN_ONCE(blk_queue_init_done(q),
|
||||
"request queue %p has been registered but blk_cleanup_queue() has not been called for that queue\n",
|
||||
q);
|
||||
blk_exit_queue(q);
|
||||
}
|
||||
|
||||
WARN(blk_queue_root_blkg(q),
|
||||
"request queue %p is being released but it has not yet been removed from the blkcg controller\n",
|
||||
q);
|
||||
|
||||
blk_free_queue_stats(q->stats);
|
||||
|
||||
blk_exit_rl(q, &q->root_rl);
|
||||
@ -964,7 +977,7 @@ void blk_unregister_queue(struct gendisk *disk)
|
||||
kobject_del(&q->kobj);
|
||||
blk_trace_remove_sysfs(disk_to_dev(disk));
|
||||
|
||||
wbt_exit(q);
|
||||
rq_qos_exit(q);
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
if (q->request_fn || (q->mq_ops && q->elevator))
|
||||
|
@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
|
||||
struct throtl_grp *tg = blkg_to_tg(blkg);
|
||||
|
||||
if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
|
||||
tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
|
||||
tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
|
||||
low_valid = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
@ -920,12 +922,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
}
|
||||
|
||||
/* Calc approx time to dispatch */
|
||||
jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
|
||||
|
||||
if (jiffy_wait > jiffy_elapsed)
|
||||
jiffy_wait = jiffy_wait - jiffy_elapsed;
|
||||
else
|
||||
jiffy_wait = 1;
|
||||
jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
|
||||
|
||||
if (wait)
|
||||
*wait = jiffy_wait;
|
||||
@ -2132,12 +2129,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
|
||||
static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
|
||||
{
|
||||
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
|
||||
if (bio->bi_css) {
|
||||
if (bio->bi_cg_private)
|
||||
blkg_put(tg_to_blkg(bio->bi_cg_private));
|
||||
bio->bi_cg_private = tg;
|
||||
blkg_get(tg_to_blkg(tg));
|
||||
}
|
||||
if (bio->bi_css)
|
||||
bio_associate_blkg(bio, tg_to_blkg(tg));
|
||||
bio_issue_init(&bio->bi_issue, bio_sectors(bio));
|
||||
#endif
|
||||
}
|
||||
@ -2285,6 +2278,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
|
||||
|
||||
void blk_throtl_bio_endio(struct bio *bio)
|
||||
{
|
||||
struct blkcg_gq *blkg;
|
||||
struct throtl_grp *tg;
|
||||
u64 finish_time_ns;
|
||||
unsigned long finish_time;
|
||||
@ -2292,20 +2286,18 @@ void blk_throtl_bio_endio(struct bio *bio)
|
||||
unsigned long lat;
|
||||
int rw = bio_data_dir(bio);
|
||||
|
||||
tg = bio->bi_cg_private;
|
||||
if (!tg)
|
||||
blkg = bio->bi_blkg;
|
||||
if (!blkg)
|
||||
return;
|
||||
bio->bi_cg_private = NULL;
|
||||
tg = blkg_to_tg(blkg);
|
||||
|
||||
finish_time_ns = ktime_get_ns();
|
||||
tg->last_finish_time = finish_time_ns >> 10;
|
||||
|
||||
start_time = bio_issue_time(&bio->bi_issue) >> 10;
|
||||
finish_time = __bio_issue_time(finish_time_ns) >> 10;
|
||||
if (!start_time || finish_time <= start_time) {
|
||||
blkg_put(tg_to_blkg(tg));
|
||||
if (!start_time || finish_time <= start_time)
|
||||
return;
|
||||
}
|
||||
|
||||
lat = finish_time - start_time;
|
||||
/* this is only for bio based driver */
|
||||
@ -2334,8 +2326,6 @@ void blk_throtl_bio_endio(struct bio *bio)
|
||||
tg->bio_cnt /= 2;
|
||||
tg->bad_bio_cnt /= 2;
|
||||
}
|
||||
|
||||
blkg_put(tg_to_blkg(tg));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
427
block/blk-wbt.c
427
block/blk-wbt.c
@ -25,6 +25,7 @@
|
||||
#include <linux/swap.h>
|
||||
|
||||
#include "blk-wbt.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/wbt.h>
|
||||
@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb)
|
||||
return rwb && rwb->wb_normal != 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
|
||||
* false if 'v' + 1 would be bigger than 'below'.
|
||||
*/
|
||||
static bool atomic_inc_below(atomic_t *v, int below)
|
||||
{
|
||||
int cur = atomic_read(v);
|
||||
|
||||
for (;;) {
|
||||
int old;
|
||||
|
||||
if (cur >= below)
|
||||
return false;
|
||||
old = atomic_cmpxchg(v, cur, cur + 1);
|
||||
if (old == cur)
|
||||
break;
|
||||
cur = old;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
|
||||
{
|
||||
if (rwb_enabled(rwb)) {
|
||||
@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
|
||||
*/
|
||||
static bool wb_recent_wait(struct rq_wb *rwb)
|
||||
{
|
||||
struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
|
||||
struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
|
||||
|
||||
return time_before(jiffies, wb->dirty_sleep + HZ);
|
||||
}
|
||||
@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb)
|
||||
}
|
||||
}
|
||||
|
||||
void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
|
||||
static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
|
||||
{
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
struct rq_wait *rqw;
|
||||
int inflight, limit;
|
||||
|
||||
@ -186,7 +166,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
|
||||
int diff = limit - inflight;
|
||||
|
||||
if (!inflight || diff >= rwb->wb_background / 2)
|
||||
wake_up_all(&rqw->wait);
|
||||
wake_up(&rqw->wait);
|
||||
}
|
||||
}
|
||||
|
||||
@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
|
||||
* Called on completion of a request. Note that it's also called when
|
||||
* a request is merged, when the request gets freed.
|
||||
*/
|
||||
void wbt_done(struct rq_wb *rwb, struct request *rq)
|
||||
static void wbt_done(struct rq_qos *rqos, struct request *rq)
|
||||
{
|
||||
if (!rwb)
|
||||
return;
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
|
||||
if (!wbt_is_tracked(rq)) {
|
||||
if (rwb->sync_cookie == rq) {
|
||||
@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq)
|
||||
wb_timestamp(rwb, &rwb->last_comp);
|
||||
} else {
|
||||
WARN_ON_ONCE(rq == rwb->sync_cookie);
|
||||
__wbt_done(rwb, wbt_flags(rq));
|
||||
__wbt_done(rqos, wbt_flags(rq));
|
||||
}
|
||||
wbt_clear_state(rq);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true, if we can't increase the depth further by scaling
|
||||
*/
|
||||
static bool calc_wb_limits(struct rq_wb *rwb)
|
||||
{
|
||||
unsigned int depth;
|
||||
bool ret = false;
|
||||
|
||||
if (!rwb->min_lat_nsec) {
|
||||
rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* For QD=1 devices, this is a special case. It's important for those
|
||||
* to have one request ready when one completes, so force a depth of
|
||||
* 2 for those devices. On the backend, it'll be a depth of 1 anyway,
|
||||
* since the device can't have more than that in flight. If we're
|
||||
* scaling down, then keep a setting of 1/1/1.
|
||||
*/
|
||||
if (rwb->queue_depth == 1) {
|
||||
if (rwb->scale_step > 0)
|
||||
rwb->wb_max = rwb->wb_normal = 1;
|
||||
else {
|
||||
rwb->wb_max = rwb->wb_normal = 2;
|
||||
ret = true;
|
||||
}
|
||||
rwb->wb_background = 1;
|
||||
} else {
|
||||
/*
|
||||
* scale_step == 0 is our default state. If we have suffered
|
||||
* latency spikes, step will be > 0, and we shrink the
|
||||
* allowed write depths. If step is < 0, we're only doing
|
||||
* writes, and we allow a temporarily higher depth to
|
||||
* increase performance.
|
||||
*/
|
||||
depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
|
||||
if (rwb->scale_step > 0)
|
||||
depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
|
||||
else if (rwb->scale_step < 0) {
|
||||
unsigned int maxd = 3 * rwb->queue_depth / 4;
|
||||
|
||||
depth = 1 + ((depth - 1) << -rwb->scale_step);
|
||||
if (depth > maxd) {
|
||||
depth = maxd;
|
||||
ret = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set our max/normal/bg queue depths based on how far
|
||||
* we have scaled down (->scale_step).
|
||||
*/
|
||||
rwb->wb_max = depth;
|
||||
rwb->wb_normal = (rwb->wb_max + 1) / 2;
|
||||
rwb->wb_background = (rwb->wb_max + 3) / 4;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool stat_sample_valid(struct blk_rq_stat *stat)
|
||||
{
|
||||
/*
|
||||
@ -307,7 +225,8 @@ enum {
|
||||
|
||||
static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
|
||||
{
|
||||
struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
|
||||
struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
|
||||
struct rq_depth *rqd = &rwb->rq_depth;
|
||||
u64 thislat;
|
||||
|
||||
/*
|
||||
@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
|
||||
return LAT_EXCEEDED;
|
||||
}
|
||||
|
||||
if (rwb->scale_step)
|
||||
if (rqd->scale_step)
|
||||
trace_wbt_stat(bdi, stat);
|
||||
|
||||
return LAT_OK;
|
||||
@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
|
||||
|
||||
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
|
||||
{
|
||||
struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
|
||||
struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
|
||||
struct rq_depth *rqd = &rwb->rq_depth;
|
||||
|
||||
trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
|
||||
rwb->wb_background, rwb->wb_normal, rwb->wb_max);
|
||||
trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
|
||||
rwb->wb_background, rwb->wb_normal, rqd->max_depth);
|
||||
}
|
||||
|
||||
static void calc_wb_limits(struct rq_wb *rwb)
|
||||
{
|
||||
if (rwb->min_lat_nsec == 0) {
|
||||
rwb->wb_normal = rwb->wb_background = 0;
|
||||
} else if (rwb->rq_depth.max_depth <= 2) {
|
||||
rwb->wb_normal = rwb->rq_depth.max_depth;
|
||||
rwb->wb_background = 1;
|
||||
} else {
|
||||
rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
|
||||
rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
|
||||
}
|
||||
}
|
||||
|
||||
static void scale_up(struct rq_wb *rwb)
|
||||
{
|
||||
/*
|
||||
* Hit max in previous round, stop here
|
||||
*/
|
||||
if (rwb->scaled_max)
|
||||
return;
|
||||
|
||||
rwb->scale_step--;
|
||||
rq_depth_scale_up(&rwb->rq_depth);
|
||||
calc_wb_limits(rwb);
|
||||
rwb->unknown_cnt = 0;
|
||||
|
||||
rwb->scaled_max = calc_wb_limits(rwb);
|
||||
|
||||
rwb_wake_all(rwb);
|
||||
|
||||
rwb_trace_step(rwb, "step up");
|
||||
rwb_trace_step(rwb, "scale up");
|
||||
}
|
||||
|
||||
/*
|
||||
* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
|
||||
* had a latency violation.
|
||||
*/
|
||||
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
|
||||
{
|
||||
/*
|
||||
* Stop scaling down when we've hit the limit. This also prevents
|
||||
* ->scale_step from going to crazy values, if the device can't
|
||||
* keep up.
|
||||
*/
|
||||
if (rwb->wb_max == 1)
|
||||
return;
|
||||
|
||||
if (rwb->scale_step < 0 && hard_throttle)
|
||||
rwb->scale_step = 0;
|
||||
else
|
||||
rwb->scale_step++;
|
||||
|
||||
rwb->scaled_max = false;
|
||||
rwb->unknown_cnt = 0;
|
||||
rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
|
||||
calc_wb_limits(rwb);
|
||||
rwb_trace_step(rwb, "step down");
|
||||
rwb->unknown_cnt = 0;
|
||||
rwb_wake_all(rwb);
|
||||
rwb_trace_step(rwb, "scale down");
|
||||
}
|
||||
|
||||
static void rwb_arm_timer(struct rq_wb *rwb)
|
||||
{
|
||||
if (rwb->scale_step > 0) {
|
||||
struct rq_depth *rqd = &rwb->rq_depth;
|
||||
|
||||
if (rqd->scale_step > 0) {
|
||||
/*
|
||||
* We should speed this up, using some variant of a fast
|
||||
* integer inverse square root calculation. Since we only do
|
||||
@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb)
|
||||
* though.
|
||||
*/
|
||||
rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
|
||||
int_sqrt((rwb->scale_step + 1) << 8));
|
||||
int_sqrt((rqd->scale_step + 1) << 8));
|
||||
} else {
|
||||
/*
|
||||
* For step < 0, we don't want to increase/decrease the
|
||||
@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb)
|
||||
static void wb_timer_fn(struct blk_stat_callback *cb)
|
||||
{
|
||||
struct rq_wb *rwb = cb->data;
|
||||
struct rq_depth *rqd = &rwb->rq_depth;
|
||||
unsigned int inflight = wbt_inflight(rwb);
|
||||
int status;
|
||||
|
||||
status = latency_exceeded(rwb, cb->stat);
|
||||
|
||||
trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
|
||||
trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
|
||||
inflight);
|
||||
|
||||
/*
|
||||
@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
|
||||
* currently don't have a valid read/write sample. For that
|
||||
* case, slowly return to center state (step == 0).
|
||||
*/
|
||||
if (rwb->scale_step > 0)
|
||||
if (rqd->scale_step > 0)
|
||||
scale_up(rwb);
|
||||
else if (rwb->scale_step < 0)
|
||||
else if (rqd->scale_step < 0)
|
||||
scale_down(rwb, false);
|
||||
break;
|
||||
default:
|
||||
@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
|
||||
/*
|
||||
* Re-arm timer, if we have IO in flight
|
||||
*/
|
||||
if (rwb->scale_step || inflight)
|
||||
if (rqd->scale_step || inflight)
|
||||
rwb_arm_timer(rwb);
|
||||
}
|
||||
|
||||
void wbt_update_limits(struct rq_wb *rwb)
|
||||
static void __wbt_update_limits(struct rq_wb *rwb)
|
||||
{
|
||||
rwb->scale_step = 0;
|
||||
rwb->scaled_max = false;
|
||||
struct rq_depth *rqd = &rwb->rq_depth;
|
||||
|
||||
rqd->scale_step = 0;
|
||||
rqd->scaled_max = false;
|
||||
|
||||
rq_depth_calc_max_depth(rqd);
|
||||
calc_wb_limits(rwb);
|
||||
|
||||
rwb_wake_all(rwb);
|
||||
}
|
||||
|
||||
/*
 * Reset and recompute the wbt limits for @q.
 * Does nothing if no wbt policy is attached to the queue.
 */
void wbt_update_limits(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);

	if (rqos)
		__wbt_update_limits(RQWB(rqos));
}
|
||||
|
||||
u64 wbt_get_min_lat(struct request_queue *q)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
if (!rqos)
|
||||
return 0;
|
||||
return RQWB(rqos)->min_lat_nsec;
|
||||
}
|
||||
|
||||
/*
 * Set min_lat_nsec of the wbt policy on @q to @val, mark the policy as
 * WBT_STATE_ON_MANUAL and recompute its limits.
 * Does nothing if no wbt policy is attached.
 */
void wbt_set_min_lat(struct request_queue *q, u64 val)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	struct rq_wb *rwb;

	if (!rqos)
		return;

	rwb = RQWB(rqos);
	rwb->min_lat_nsec = val;
	rwb->enable_state = WBT_STATE_ON_MANUAL;
	__wbt_update_limits(rwb);
}
|
||||
|
||||
|
||||
static bool close_io(struct rq_wb *rwb)
|
||||
{
|
||||
const unsigned long now = jiffies;
|
||||
@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
|
||||
* IO for a bit.
|
||||
*/
|
||||
if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
|
||||
limit = rwb->wb_max;
|
||||
limit = rwb->rq_depth.max_depth;
|
||||
else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
|
||||
/*
|
||||
* If less than 100ms since we completed unrelated IO,
|
||||
@ -533,30 +474,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
|
||||
return limit;
|
||||
}
|
||||
|
||||
static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
|
||||
wait_queue_entry_t *wait, unsigned long rw)
|
||||
{
|
||||
/*
|
||||
* inc it here even if disabled, since we'll dec it at completion.
|
||||
* this only happens if the task was sleeping in __wbt_wait(),
|
||||
* and someone turned it off at the same time.
|
||||
*/
|
||||
if (!rwb_enabled(rwb)) {
|
||||
atomic_inc(&rqw->inflight);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the waitqueue is already active and we are not the next
|
||||
* in line to be woken up, wait for our turn.
|
||||
*/
|
||||
if (waitqueue_active(&rqw->wait) &&
|
||||
rqw->wait.head.next != &wait->entry)
|
||||
return false;
|
||||
|
||||
return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
|
||||
}
|
||||
|
||||
/*
|
||||
* Block if we will exceed our limit, or if we are currently waiting for
|
||||
* the timer to kick off queuing again.
|
||||
@ -567,16 +484,32 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
|
||||
__acquires(lock)
|
||||
{
|
||||
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
|
||||
DEFINE_WAIT(wait);
|
||||
DECLARE_WAITQUEUE(wait, current);
|
||||
|
||||
if (may_queue(rwb, rqw, &wait, rw))
|
||||
/*
|
||||
* inc it here even if disabled, since we'll dec it at completion.
|
||||
* this only happens if the task was sleeping in __wbt_wait(),
|
||||
* and someone turned it off at the same time.
|
||||
*/
|
||||
if (!rwb_enabled(rwb)) {
|
||||
atomic_inc(&rqw->inflight);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!waitqueue_active(&rqw->wait)
|
||||
&& rq_wait_inc_below(rqw, get_limit(rwb, rw)))
|
||||
return;
|
||||
|
||||
add_wait_queue_exclusive(&rqw->wait, &wait);
|
||||
do {
|
||||
prepare_to_wait_exclusive(&rqw->wait, &wait,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
|
||||
if (may_queue(rwb, rqw, &wait, rw))
|
||||
if (!rwb_enabled(rwb)) {
|
||||
atomic_inc(&rqw->inflight);
|
||||
break;
|
||||
}
|
||||
|
||||
if (rq_wait_inc_below(rqw, get_limit(rwb, rw)))
|
||||
break;
|
||||
|
||||
if (lock) {
|
||||
@ -587,7 +520,8 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
|
||||
io_schedule();
|
||||
} while (1);
|
||||
|
||||
finish_wait(&rqw->wait, &wait);
|
||||
__set_current_state(TASK_RUNNING);
|
||||
remove_wait_queue(&rqw->wait, &wait);
|
||||
}
|
||||
|
||||
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
|
||||
@ -608,43 +542,72 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
|
||||
}
|
||||
}
|
||||
|
||||
static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
|
||||
{
|
||||
enum wbt_flags flags = 0;
|
||||
|
||||
if (bio_op(bio) == REQ_OP_READ) {
|
||||
flags = WBT_READ;
|
||||
} else if (wbt_should_throttle(rwb, bio)) {
|
||||
if (current_is_kswapd())
|
||||
flags |= WBT_KSWAPD;
|
||||
if (bio_op(bio) == REQ_OP_DISCARD)
|
||||
flags |= WBT_DISCARD;
|
||||
flags |= WBT_TRACKED;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
|
||||
{
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
|
||||
__wbt_done(rqos, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the IO request should be accounted, false if not.
|
||||
* May sleep, if we have exceeded the writeback limits. Caller can pass
|
||||
* in an irq held spinlock, if it holds one when calling this function.
|
||||
* If we do sleep, we'll release and re-grab it.
|
||||
*/
|
||||
enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
|
||||
static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
|
||||
{
|
||||
enum wbt_flags ret = 0;
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
enum wbt_flags flags;
|
||||
|
||||
if (!rwb_enabled(rwb))
|
||||
return 0;
|
||||
return;
|
||||
|
||||
if (bio_op(bio) == REQ_OP_READ)
|
||||
ret = WBT_READ;
|
||||
flags = bio_to_wbt_flags(rwb, bio);
|
||||
|
||||
if (!wbt_should_throttle(rwb, bio)) {
|
||||
if (ret & WBT_READ)
|
||||
if (flags & WBT_READ)
|
||||
wb_timestamp(rwb, &rwb->last_issue);
|
||||
return ret;
|
||||
return;
|
||||
}
|
||||
|
||||
if (current_is_kswapd())
|
||||
ret |= WBT_KSWAPD;
|
||||
flags |= WBT_KSWAPD;
|
||||
if (bio_op(bio) == REQ_OP_DISCARD)
|
||||
ret |= WBT_DISCARD;
|
||||
flags |= WBT_DISCARD;
|
||||
|
||||
__wbt_wait(rwb, ret, bio->bi_opf, lock);
|
||||
__wbt_wait(rwb, flags, bio->bi_opf, lock);
|
||||
|
||||
if (!blk_stat_is_active(rwb->cb))
|
||||
rwb_arm_timer(rwb);
|
||||
|
||||
return ret | WBT_TRACKED;
|
||||
}
|
||||
|
||||
void wbt_issue(struct rq_wb *rwb, struct request *rq)
|
||||
static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
|
||||
{
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
|
||||
}
|
||||
|
||||
void wbt_issue(struct rq_qos *rqos, struct request *rq)
|
||||
{
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
|
||||
if (!rwb_enabled(rwb))
|
||||
return;
|
||||
|
||||
@ -661,8 +624,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq)
|
||||
}
|
||||
}
|
||||
|
||||
void wbt_requeue(struct rq_wb *rwb, struct request *rq)
|
||||
void wbt_requeue(struct rq_qos *rqos, struct request *rq)
|
||||
{
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
if (!rwb_enabled(rwb))
|
||||
return;
|
||||
if (rq == rwb->sync_cookie) {
|
||||
@ -671,39 +635,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq)
|
||||
}
|
||||
}
|
||||
|
||||
void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
|
||||
void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
|
||||
{
|
||||
if (rwb) {
|
||||
rwb->queue_depth = depth;
|
||||
wbt_update_limits(rwb);
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
if (rqos) {
|
||||
RQWB(rqos)->rq_depth.queue_depth = depth;
|
||||
__wbt_update_limits(RQWB(rqos));
|
||||
}
|
||||
}
|
||||
|
||||
void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
|
||||
void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
|
||||
{
|
||||
if (rwb)
|
||||
rwb->wc = write_cache_on;
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
if (rqos)
|
||||
RQWB(rqos)->wc = write_cache_on;
|
||||
}
|
||||
|
||||
/*
|
||||
* Disable wbt, if enabled by default.
|
||||
*/
|
||||
void wbt_disable_default(struct request_queue *q)
|
||||
{
|
||||
struct rq_wb *rwb = q->rq_wb;
|
||||
|
||||
if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
|
||||
wbt_exit(q);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(wbt_disable_default);
|
||||
|
||||
/*
|
||||
* Enable wbt if defaults are configured that way
|
||||
*/
|
||||
void wbt_enable_default(struct request_queue *q)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
/* Throttling already enabled? */
|
||||
if (q->rq_wb)
|
||||
if (rqos)
|
||||
return;
|
||||
|
||||
/* Queue not registered? Maybe shutting down... */
|
||||
@ -741,6 +696,42 @@ static int wbt_data_dir(const struct request *rq)
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void wbt_exit(struct rq_qos *rqos)
|
||||
{
|
||||
struct rq_wb *rwb = RQWB(rqos);
|
||||
struct request_queue *q = rqos->q;
|
||||
|
||||
blk_stat_remove_callback(q, rwb->cb);
|
||||
blk_stat_free_callback(rwb->cb);
|
||||
kfree(rwb);
|
||||
}
|
||||
|
||||
/*
|
||||
* Disable wbt, if enabled by default.
|
||||
*/
|
||||
void wbt_disable_default(struct request_queue *q)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
struct rq_wb *rwb;
|
||||
if (!rqos)
|
||||
return;
|
||||
rwb = RQWB(rqos);
|
||||
if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
|
||||
rwb->wb_normal = 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(wbt_disable_default);
|
||||
|
||||
|
||||
static struct rq_qos_ops wbt_rqos_ops = {
|
||||
.throttle = wbt_wait,
|
||||
.issue = wbt_issue,
|
||||
.track = wbt_track,
|
||||
.requeue = wbt_requeue,
|
||||
.done = wbt_done,
|
||||
.cleanup = wbt_cleanup,
|
||||
.exit = wbt_exit,
|
||||
};
|
||||
|
||||
int wbt_init(struct request_queue *q)
|
||||
{
|
||||
struct rq_wb *rwb;
|
||||
@ -756,39 +747,29 @@ int wbt_init(struct request_queue *q)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (i = 0; i < WBT_NUM_RWQ; i++) {
|
||||
atomic_set(&rwb->rq_wait[i].inflight, 0);
|
||||
init_waitqueue_head(&rwb->rq_wait[i].wait);
|
||||
}
|
||||
for (i = 0; i < WBT_NUM_RWQ; i++)
|
||||
rq_wait_init(&rwb->rq_wait[i]);
|
||||
|
||||
rwb->rqos.id = RQ_QOS_WBT;
|
||||
rwb->rqos.ops = &wbt_rqos_ops;
|
||||
rwb->rqos.q = q;
|
||||
rwb->last_comp = rwb->last_issue = jiffies;
|
||||
rwb->queue = q;
|
||||
rwb->win_nsec = RWB_WINDOW_NSEC;
|
||||
rwb->enable_state = WBT_STATE_ON_DEFAULT;
|
||||
wbt_update_limits(rwb);
|
||||
rwb->wc = 1;
|
||||
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
|
||||
__wbt_update_limits(rwb);
|
||||
|
||||
/*
|
||||
* Assign rwb and add the stats callback.
|
||||
*/
|
||||
q->rq_wb = rwb;
|
||||
rq_qos_add(q, &rwb->rqos);
|
||||
blk_stat_add_callback(q, rwb->cb);
|
||||
|
||||
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
|
||||
|
||||
wbt_set_queue_depth(rwb, blk_queue_depth(q));
|
||||
wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
|
||||
wbt_set_queue_depth(q, blk_queue_depth(q));
|
||||
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void wbt_exit(struct request_queue *q)
|
||||
{
|
||||
struct rq_wb *rwb = q->rq_wb;
|
||||
|
||||
if (rwb) {
|
||||
blk_stat_remove_callback(q, rwb->cb);
|
||||
blk_stat_free_callback(rwb->cb);
|
||||
q->rq_wb = NULL;
|
||||
kfree(rwb);
|
||||
}
|
||||
}
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <linux/ktime.h>
|
||||
|
||||
#include "blk-stat.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
enum wbt_flags {
|
||||
WBT_TRACKED = 1, /* write, tracked for throttling */
|
||||
@ -35,20 +36,12 @@ enum {
|
||||
WBT_STATE_ON_MANUAL = 2,
|
||||
};
|
||||
|
||||
struct rq_wait {
|
||||
wait_queue_head_t wait;
|
||||
atomic_t inflight;
|
||||
};
|
||||
|
||||
struct rq_wb {
|
||||
/*
|
||||
* Settings that govern how we throttle
|
||||
*/
|
||||
unsigned int wb_background; /* background writeback */
|
||||
unsigned int wb_normal; /* normal writeback */
|
||||
unsigned int wb_max; /* max throughput writeback */
|
||||
int scale_step;
|
||||
bool scaled_max;
|
||||
|
||||
short enable_state; /* WBT_STATE_* */
|
||||
|
||||
@ -67,15 +60,20 @@ struct rq_wb {
|
||||
void *sync_cookie;
|
||||
|
||||
unsigned int wc;
|
||||
unsigned int queue_depth;
|
||||
|
||||
unsigned long last_issue; /* last non-throttled issue */
|
||||
unsigned long last_comp; /* last non-throttled comp */
|
||||
unsigned long min_lat_nsec;
|
||||
struct request_queue *queue;
|
||||
struct rq_qos rqos;
|
||||
struct rq_wait rq_wait[WBT_NUM_RWQ];
|
||||
struct rq_depth rq_depth;
|
||||
};
|
||||
|
||||
static inline struct rq_wb *RQWB(struct rq_qos *rqos)
|
||||
{
|
||||
return container_of(rqos, struct rq_wb, rqos);
|
||||
}
|
||||
|
||||
static inline unsigned int wbt_inflight(struct rq_wb *rwb)
|
||||
{
|
||||
unsigned int i, ret = 0;
|
||||
@ -86,26 +84,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
#ifdef CONFIG_BLK_WBT
|
||||
|
||||
static inline void wbt_track(struct request *rq, enum wbt_flags flags)
|
||||
{
|
||||
rq->wbt_flags |= flags;
|
||||
}
|
||||
|
||||
void __wbt_done(struct rq_wb *, enum wbt_flags);
|
||||
void wbt_done(struct rq_wb *, struct request *);
|
||||
enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
|
||||
int wbt_init(struct request_queue *);
|
||||
void wbt_exit(struct request_queue *);
|
||||
void wbt_update_limits(struct rq_wb *);
|
||||
void wbt_requeue(struct rq_wb *, struct request *);
|
||||
void wbt_issue(struct rq_wb *, struct request *);
|
||||
void wbt_update_limits(struct request_queue *);
|
||||
void wbt_disable_default(struct request_queue *);
|
||||
void wbt_enable_default(struct request_queue *);
|
||||
|
||||
void wbt_set_queue_depth(struct rq_wb *, unsigned int);
|
||||
void wbt_set_write_cache(struct rq_wb *, bool);
|
||||
u64 wbt_get_min_lat(struct request_queue *q);
|
||||
void wbt_set_min_lat(struct request_queue *q, u64 val);
|
||||
|
||||
void wbt_set_queue_depth(struct request_queue *, unsigned int);
|
||||
void wbt_set_write_cache(struct request_queue *, bool);
|
||||
|
||||
u64 wbt_default_latency_nsec(struct request_queue *);
|
||||
|
||||
@ -114,31 +105,11 @@ u64 wbt_default_latency_nsec(struct request_queue *);
|
||||
static inline void wbt_track(struct request *rq, enum wbt_flags flags)
|
||||
{
|
||||
}
|
||||
static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
|
||||
{
|
||||
}
|
||||
static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
|
||||
{
|
||||
}
|
||||
static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
|
||||
spinlock_t *lock)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline int wbt_init(struct request_queue *q)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
static inline void wbt_exit(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
static inline void wbt_update_limits(struct rq_wb *rwb)
|
||||
{
|
||||
}
|
||||
static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq)
|
||||
{
|
||||
}
|
||||
static inline void wbt_issue(struct rq_wb *rwb, struct request *rq)
|
||||
static inline void wbt_update_limits(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
static inline void wbt_disable_default(struct request_queue *q)
|
||||
@ -147,10 +118,17 @@ static inline void wbt_disable_default(struct request_queue *q)
|
||||
static inline void wbt_enable_default(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
|
||||
static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
|
||||
{
|
||||
}
|
||||
static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
|
||||
static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
|
||||
{
|
||||
}
|
||||
static inline u64 wbt_get_min_lat(struct request_queue *q)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void wbt_set_min_lat(struct request_queue *q, u64 val)
|
||||
{
|
||||
}
|
||||
static inline u64 wbt_default_latency_nsec(struct request_queue *q)
|
||||
|
@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev,
|
||||
/* Get header in the first page */
|
||||
ofst = 0;
|
||||
if (!nr_rep) {
|
||||
hdr = (struct blk_zone_report_hdr *) addr;
|
||||
hdr = addr;
|
||||
nr_rep = hdr->nr_zones;
|
||||
ofst = sizeof(struct blk_zone_report_hdr);
|
||||
}
|
||||
|
@ -130,6 +130,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
|
||||
int blk_init_rl(struct request_list *rl, struct request_queue *q,
|
||||
gfp_t gfp_mask);
|
||||
void blk_exit_rl(struct request_queue *q, struct request_list *rl);
|
||||
void blk_exit_queue(struct request_queue *q);
|
||||
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
|
||||
struct bio *bio);
|
||||
void blk_queue_bypass_start(struct request_queue *q);
|
||||
@ -412,4 +413,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
|
||||
|
||||
extern void blk_drain_queue(struct request_queue *q);
|
||||
|
||||
#ifdef CONFIG_BLK_CGROUP_IOLATENCY
|
||||
extern int blk_iolatency_init(struct request_queue *q);
|
||||
#else
|
||||
static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
|
||||
#endif
|
||||
|
||||
#endif /* BLK_INTERNAL_H */
|
||||
|
@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio)
|
||||
__bounce_end_io_read(bio, &isa_page_pool);
|
||||
}
|
||||
|
||||
static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
|
||||
struct bio_set *bs)
|
||||
{
|
||||
struct bvec_iter iter;
|
||||
struct bio_vec bv;
|
||||
struct bio *bio;
|
||||
|
||||
/*
|
||||
* Pre immutable biovecs, __bio_clone() used to just do a memcpy from
|
||||
* bio_src->bi_io_vec to bio->bi_io_vec.
|
||||
*
|
||||
* We can't do that anymore, because:
|
||||
*
|
||||
* - The point of cloning the biovec is to produce a bio with a biovec
|
||||
* the caller can modify: bi_idx and bi_bvec_done should be 0.
|
||||
*
|
||||
* - The original bio could've had more than BIO_MAX_PAGES biovecs; if
|
||||
* we tried to clone the whole thing bio_alloc_bioset() would fail.
|
||||
* But the clone should succeed as long as the number of biovecs we
|
||||
* actually need to allocate is fewer than BIO_MAX_PAGES.
|
||||
*
|
||||
* - Lastly, bi_vcnt should not be looked at or relied upon by code
|
||||
* that does not own the bio - reason being drivers don't use it for
|
||||
* iterating over the biovec anymore, so expecting it to be kept up
|
||||
* to date (i.e. for clones that share the parent biovec) is just
|
||||
* asking for trouble and would force extra work on
|
||||
* __bio_clone_fast() anyways.
|
||||
*/
|
||||
|
||||
bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
|
||||
if (!bio)
|
||||
return NULL;
|
||||
bio->bi_disk = bio_src->bi_disk;
|
||||
bio->bi_opf = bio_src->bi_opf;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
|
||||
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
|
||||
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_DISCARD:
|
||||
case REQ_OP_SECURE_ERASE:
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
break;
|
||||
case REQ_OP_WRITE_SAME:
|
||||
bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
|
||||
break;
|
||||
default:
|
||||
bio_for_each_segment(bv, bio_src, iter)
|
||||
bio->bi_io_vec[bio->bi_vcnt++] = bv;
|
||||
break;
|
||||
}
|
||||
|
||||
if (bio_integrity(bio_src)) {
|
||||
int ret;
|
||||
|
||||
ret = bio_integrity_clone(bio, bio_src, gfp_mask);
|
||||
if (ret < 0) {
|
||||
bio_put(bio);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
bio_clone_blkcg_association(bio, bio_src);
|
||||
|
||||
return bio;
|
||||
}
|
||||
|
||||
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
|
||||
mempool_t *pool)
|
||||
{
|
||||
@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
|
||||
generic_make_request(*bio_orig);
|
||||
*bio_orig = bio;
|
||||
}
|
||||
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
|
||||
bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
|
||||
&bounce_bio_set);
|
||||
|
||||
bio_for_each_segment_all(to, bio, i) {
|
||||
|
@ -48,9 +48,8 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
|
||||
|
||||
job->request_len = hdr->request_len;
|
||||
job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
|
||||
if (IS_ERR(job->request))
|
||||
return PTR_ERR(job->request);
|
||||
return 0;
|
||||
|
||||
return PTR_ERR_OR_ZERO(job->request);
|
||||
}
|
||||
|
||||
static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
|
||||
|
458
block/bsg.c
458
block/bsg.c
@ -13,11 +13,9 @@
|
||||
#include <linux/init.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/poll.h>
|
||||
#include <linux/cdev.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/idr.h>
|
||||
#include <linux/bsg.h>
|
||||
#include <linux/slab.h>
|
||||
@ -38,21 +36,10 @@
|
||||
struct bsg_device {
|
||||
struct request_queue *queue;
|
||||
spinlock_t lock;
|
||||
struct list_head busy_list;
|
||||
struct list_head done_list;
|
||||
struct hlist_node dev_list;
|
||||
atomic_t ref_count;
|
||||
int queued_cmds;
|
||||
int done_cmds;
|
||||
wait_queue_head_t wq_done;
|
||||
wait_queue_head_t wq_free;
|
||||
char name[20];
|
||||
int max_queue;
|
||||
unsigned long flags;
|
||||
};
|
||||
|
||||
enum {
|
||||
BSG_F_BLOCK = 1,
|
||||
};
|
||||
|
||||
#define BSG_DEFAULT_CMDS 64
|
||||
@ -67,64 +54,6 @@ static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE];
|
||||
static struct class *bsg_class;
|
||||
static int bsg_major;
|
||||
|
||||
static struct kmem_cache *bsg_cmd_cachep;
|
||||
|
||||
/*
|
||||
* our internal command type
|
||||
*/
|
||||
struct bsg_command {
|
||||
struct bsg_device *bd;
|
||||
struct list_head list;
|
||||
struct request *rq;
|
||||
struct bio *bio;
|
||||
struct bio *bidi_bio;
|
||||
int err;
|
||||
struct sg_io_v4 hdr;
|
||||
};
|
||||
|
||||
static void bsg_free_command(struct bsg_command *bc)
|
||||
{
|
||||
struct bsg_device *bd = bc->bd;
|
||||
unsigned long flags;
|
||||
|
||||
kmem_cache_free(bsg_cmd_cachep, bc);
|
||||
|
||||
spin_lock_irqsave(&bd->lock, flags);
|
||||
bd->queued_cmds--;
|
||||
spin_unlock_irqrestore(&bd->lock, flags);
|
||||
|
||||
wake_up(&bd->wq_free);
|
||||
}
|
||||
|
||||
static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
|
||||
{
|
||||
struct bsg_command *bc = ERR_PTR(-EINVAL);
|
||||
|
||||
spin_lock_irq(&bd->lock);
|
||||
|
||||
if (bd->queued_cmds >= bd->max_queue)
|
||||
goto out;
|
||||
|
||||
bd->queued_cmds++;
|
||||
spin_unlock_irq(&bd->lock);
|
||||
|
||||
bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL);
|
||||
if (unlikely(!bc)) {
|
||||
spin_lock_irq(&bd->lock);
|
||||
bd->queued_cmds--;
|
||||
bc = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
bc->bd = bd;
|
||||
INIT_LIST_HEAD(&bc->list);
|
||||
bsg_dbg(bd, "returning free cmd %p\n", bc);
|
||||
return bc;
|
||||
out:
|
||||
spin_unlock_irq(&bd->lock);
|
||||
return bc;
|
||||
}
|
||||
|
||||
static inline struct hlist_head *bsg_dev_idx_hash(int index)
|
||||
{
|
||||
return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
|
||||
@ -285,101 +214,6 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* async completion call-back from the block layer, when scsi/ide/whatever
|
||||
* calls end_that_request_last() on a request
|
||||
*/
|
||||
static void bsg_rq_end_io(struct request *rq, blk_status_t status)
|
||||
{
|
||||
struct bsg_command *bc = rq->end_io_data;
|
||||
struct bsg_device *bd = bc->bd;
|
||||
unsigned long flags;
|
||||
|
||||
bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
|
||||
rq, bc, bc->bio);
|
||||
|
||||
bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
|
||||
|
||||
spin_lock_irqsave(&bd->lock, flags);
|
||||
list_move_tail(&bc->list, &bd->done_list);
|
||||
bd->done_cmds++;
|
||||
spin_unlock_irqrestore(&bd->lock, flags);
|
||||
|
||||
wake_up(&bd->wq_done);
|
||||
}
|
||||
|
||||
/*
|
||||
* do final setup of a 'bc' and submit the matching 'rq' to the block
|
||||
* layer for io
|
||||
*/
|
||||
static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
|
||||
struct bsg_command *bc, struct request *rq)
|
||||
{
|
||||
int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL));
|
||||
|
||||
/*
|
||||
* add bc command to busy queue and submit rq for io
|
||||
*/
|
||||
bc->rq = rq;
|
||||
bc->bio = rq->bio;
|
||||
if (rq->next_rq)
|
||||
bc->bidi_bio = rq->next_rq->bio;
|
||||
bc->hdr.duration = jiffies;
|
||||
spin_lock_irq(&bd->lock);
|
||||
list_add_tail(&bc->list, &bd->busy_list);
|
||||
spin_unlock_irq(&bd->lock);
|
||||
|
||||
bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
|
||||
|
||||
rq->end_io_data = bc;
|
||||
blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
|
||||
}
|
||||
|
||||
static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd)
|
||||
{
|
||||
struct bsg_command *bc = NULL;
|
||||
|
||||
spin_lock_irq(&bd->lock);
|
||||
if (bd->done_cmds) {
|
||||
bc = list_first_entry(&bd->done_list, struct bsg_command, list);
|
||||
list_del(&bc->list);
|
||||
bd->done_cmds--;
|
||||
}
|
||||
spin_unlock_irq(&bd->lock);
|
||||
|
||||
return bc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get a finished command from the done list
|
||||
*/
|
||||
static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
|
||||
{
|
||||
struct bsg_command *bc;
|
||||
int ret;
|
||||
|
||||
do {
|
||||
bc = bsg_next_done_cmd(bd);
|
||||
if (bc)
|
||||
break;
|
||||
|
||||
if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
|
||||
bc = ERR_PTR(-EAGAIN);
|
||||
break;
|
||||
}
|
||||
|
||||
ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
|
||||
if (ret) {
|
||||
bc = ERR_PTR(-ERESTARTSYS);
|
||||
break;
|
||||
}
|
||||
} while (1);
|
||||
|
||||
bsg_dbg(bd, "returning done %p\n", bc);
|
||||
|
||||
return bc;
|
||||
}
|
||||
|
||||
static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
|
||||
struct bio *bio, struct bio *bidi_bio)
|
||||
{
|
||||
@ -398,234 +232,6 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool bsg_complete(struct bsg_device *bd)
|
||||
{
|
||||
bool ret = false;
|
||||
bool spin;
|
||||
|
||||
do {
|
||||
spin_lock_irq(&bd->lock);
|
||||
|
||||
BUG_ON(bd->done_cmds > bd->queued_cmds);
|
||||
|
||||
/*
|
||||
* All commands consumed.
|
||||
*/
|
||||
if (bd->done_cmds == bd->queued_cmds)
|
||||
ret = true;
|
||||
|
||||
spin = !test_bit(BSG_F_BLOCK, &bd->flags);
|
||||
|
||||
spin_unlock_irq(&bd->lock);
|
||||
} while (!ret && spin);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bsg_complete_all_commands(struct bsg_device *bd)
|
||||
{
|
||||
struct bsg_command *bc;
|
||||
int ret, tret;
|
||||
|
||||
bsg_dbg(bd, "entered\n");
|
||||
|
||||
/*
|
||||
* wait for all commands to complete
|
||||
*/
|
||||
io_wait_event(bd->wq_done, bsg_complete(bd));
|
||||
|
||||
/*
|
||||
* discard done commands
|
||||
*/
|
||||
ret = 0;
|
||||
do {
|
||||
spin_lock_irq(&bd->lock);
|
||||
if (!bd->queued_cmds) {
|
||||
spin_unlock_irq(&bd->lock);
|
||||
break;
|
||||
}
|
||||
spin_unlock_irq(&bd->lock);
|
||||
|
||||
bc = bsg_get_done_cmd(bd);
|
||||
if (IS_ERR(bc))
|
||||
break;
|
||||
|
||||
tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
|
||||
bc->bidi_bio);
|
||||
if (!ret)
|
||||
ret = tret;
|
||||
|
||||
bsg_free_command(bc);
|
||||
} while (1);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int
|
||||
__bsg_read(char __user *buf, size_t count, struct bsg_device *bd,
|
||||
const struct iovec *iov, ssize_t *bytes_read)
|
||||
{
|
||||
struct bsg_command *bc;
|
||||
int nr_commands, ret;
|
||||
|
||||
if (count % sizeof(struct sg_io_v4))
|
||||
return -EINVAL;
|
||||
|
||||
ret = 0;
|
||||
nr_commands = count / sizeof(struct sg_io_v4);
|
||||
while (nr_commands) {
|
||||
bc = bsg_get_done_cmd(bd);
|
||||
if (IS_ERR(bc)) {
|
||||
ret = PTR_ERR(bc);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* this is the only case where we need to copy data back
|
||||
* after completing the request. so do that here,
|
||||
* bsg_complete_work() cannot do that for us
|
||||
*/
|
||||
ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
|
||||
bc->bidi_bio);
|
||||
|
||||
if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr)))
|
||||
ret = -EFAULT;
|
||||
|
||||
bsg_free_command(bc);
|
||||
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
buf += sizeof(struct sg_io_v4);
|
||||
*bytes_read += sizeof(struct sg_io_v4);
|
||||
nr_commands--;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void bsg_set_block(struct bsg_device *bd, struct file *file)
|
||||
{
|
||||
if (file->f_flags & O_NONBLOCK)
|
||||
clear_bit(BSG_F_BLOCK, &bd->flags);
|
||||
else
|
||||
set_bit(BSG_F_BLOCK, &bd->flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the error is a "real" error that we should return.
|
||||
*/
|
||||
static inline int err_block_err(int ret)
|
||||
{
|
||||
if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
|
||||
{
|
||||
struct bsg_device *bd = file->private_data;
|
||||
int ret;
|
||||
ssize_t bytes_read;
|
||||
|
||||
bsg_dbg(bd, "read %zd bytes\n", count);
|
||||
|
||||
bsg_set_block(bd, file);
|
||||
|
||||
bytes_read = 0;
|
||||
ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
|
||||
*ppos = bytes_read;
|
||||
|
||||
if (!bytes_read || err_block_err(ret))
|
||||
bytes_read = ret;
|
||||
|
||||
return bytes_read;
|
||||
}
|
||||
|
||||
static int __bsg_write(struct bsg_device *bd, const char __user *buf,
|
||||
size_t count, ssize_t *bytes_written, fmode_t mode)
|
||||
{
|
||||
struct bsg_command *bc;
|
||||
struct request *rq;
|
||||
int ret, nr_commands;
|
||||
|
||||
if (count % sizeof(struct sg_io_v4))
|
||||
return -EINVAL;
|
||||
|
||||
nr_commands = count / sizeof(struct sg_io_v4);
|
||||
rq = NULL;
|
||||
bc = NULL;
|
||||
ret = 0;
|
||||
while (nr_commands) {
|
||||
struct request_queue *q = bd->queue;
|
||||
|
||||
bc = bsg_alloc_command(bd);
|
||||
if (IS_ERR(bc)) {
|
||||
ret = PTR_ERR(bc);
|
||||
bc = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* get a request, fill in the blanks, and add to request queue
|
||||
*/
|
||||
rq = bsg_map_hdr(bd->queue, &bc->hdr, mode);
|
||||
if (IS_ERR(rq)) {
|
||||
ret = PTR_ERR(rq);
|
||||
rq = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
bsg_add_command(bd, q, bc, rq);
|
||||
bc = NULL;
|
||||
rq = NULL;
|
||||
nr_commands--;
|
||||
buf += sizeof(struct sg_io_v4);
|
||||
*bytes_written += sizeof(struct sg_io_v4);
|
||||
}
|
||||
|
||||
if (bc)
|
||||
bsg_free_command(bc);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
|
||||
{
|
||||
struct bsg_device *bd = file->private_data;
|
||||
ssize_t bytes_written;
|
||||
int ret;
|
||||
|
||||
bsg_dbg(bd, "write %zd bytes\n", count);
|
||||
|
||||
if (unlikely(uaccess_kernel()))
|
||||
return -EINVAL;
|
||||
|
||||
bsg_set_block(bd, file);
|
||||
|
||||
bytes_written = 0;
|
||||
ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode);
|
||||
|
||||
*ppos = bytes_written;
|
||||
|
||||
/*
|
||||
* return bytes written on non-fatal errors
|
||||
*/
|
||||
if (!bytes_written || err_block_err(ret))
|
||||
bytes_written = ret;
|
||||
|
||||
bsg_dbg(bd, "returning %zd\n", bytes_written);
|
||||
return bytes_written;
|
||||
}
|
||||
|
||||
static struct bsg_device *bsg_alloc_device(void)
|
||||
{
|
||||
struct bsg_device *bd;
|
||||
@ -635,29 +241,20 @@ static struct bsg_device *bsg_alloc_device(void)
|
||||
return NULL;
|
||||
|
||||
spin_lock_init(&bd->lock);
|
||||
|
||||
bd->max_queue = BSG_DEFAULT_CMDS;
|
||||
|
||||
INIT_LIST_HEAD(&bd->busy_list);
|
||||
INIT_LIST_HEAD(&bd->done_list);
|
||||
INIT_HLIST_NODE(&bd->dev_list);
|
||||
|
||||
init_waitqueue_head(&bd->wq_free);
|
||||
init_waitqueue_head(&bd->wq_done);
|
||||
return bd;
|
||||
}
|
||||
|
||||
static int bsg_put_device(struct bsg_device *bd)
|
||||
{
|
||||
int ret = 0, do_free;
|
||||
struct request_queue *q = bd->queue;
|
||||
|
||||
mutex_lock(&bsg_mutex);
|
||||
|
||||
do_free = atomic_dec_and_test(&bd->ref_count);
|
||||
if (!do_free) {
|
||||
if (!atomic_dec_and_test(&bd->ref_count)) {
|
||||
mutex_unlock(&bsg_mutex);
|
||||
goto out;
|
||||
return 0;
|
||||
}
|
||||
|
||||
hlist_del(&bd->dev_list);
|
||||
@ -668,20 +265,9 @@ static int bsg_put_device(struct bsg_device *bd)
|
||||
/*
|
||||
* close can always block
|
||||
*/
|
||||
set_bit(BSG_F_BLOCK, &bd->flags);
|
||||
|
||||
/*
|
||||
* correct error detection baddies here again. it's the responsibility
|
||||
* of the app to properly reap commands before close() if it wants
|
||||
* fool-proof error detection
|
||||
*/
|
||||
ret = bsg_complete_all_commands(bd);
|
||||
|
||||
kfree(bd);
|
||||
out:
|
||||
if (do_free)
|
||||
blk_put_queue(q);
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct bsg_device *bsg_add_device(struct inode *inode,
|
||||
@ -704,8 +290,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
|
||||
|
||||
bd->queue = rq;
|
||||
|
||||
bsg_set_block(bd, file);
|
||||
|
||||
atomic_set(&bd->ref_count, 1);
|
||||
hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
|
||||
|
||||
@ -779,24 +363,6 @@ static int bsg_release(struct inode *inode, struct file *file)
|
||||
return bsg_put_device(bd);
|
||||
}
|
||||
|
||||
static __poll_t bsg_poll(struct file *file, poll_table *wait)
|
||||
{
|
||||
struct bsg_device *bd = file->private_data;
|
||||
__poll_t mask = 0;
|
||||
|
||||
poll_wait(file, &bd->wq_done, wait);
|
||||
poll_wait(file, &bd->wq_free, wait);
|
||||
|
||||
spin_lock_irq(&bd->lock);
|
||||
if (!list_empty(&bd->done_list))
|
||||
mask |= EPOLLIN | EPOLLRDNORM;
|
||||
if (bd->queued_cmds < bd->max_queue)
|
||||
mask |= EPOLLOUT;
|
||||
spin_unlock_irq(&bd->lock);
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
struct bsg_device *bd = file->private_data;
|
||||
@ -870,9 +436,6 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
}
|
||||
|
||||
static const struct file_operations bsg_fops = {
|
||||
.read = bsg_read,
|
||||
.write = bsg_write,
|
||||
.poll = bsg_poll,
|
||||
.open = bsg_open,
|
||||
.release = bsg_release,
|
||||
.unlocked_ioctl = bsg_ioctl,
|
||||
@ -977,21 +540,12 @@ static int __init bsg_init(void)
|
||||
int ret, i;
|
||||
dev_t devid;
|
||||
|
||||
bsg_cmd_cachep = kmem_cache_create("bsg_cmd",
|
||||
sizeof(struct bsg_command), 0, 0, NULL);
|
||||
if (!bsg_cmd_cachep) {
|
||||
printk(KERN_ERR "bsg: failed creating slab cache\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++)
|
||||
INIT_HLIST_HEAD(&bsg_device_list[i]);
|
||||
|
||||
bsg_class = class_create(THIS_MODULE, "bsg");
|
||||
if (IS_ERR(bsg_class)) {
|
||||
ret = PTR_ERR(bsg_class);
|
||||
goto destroy_kmemcache;
|
||||
}
|
||||
if (IS_ERR(bsg_class))
|
||||
return PTR_ERR(bsg_class);
|
||||
bsg_class->devnode = bsg_devnode;
|
||||
|
||||
ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
|
||||
@ -1012,8 +566,6 @@ static int __init bsg_init(void)
|
||||
unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS);
|
||||
destroy_bsg_class:
|
||||
class_destroy(bsg_class);
|
||||
destroy_kmemcache:
|
||||
kmem_cache_destroy(bsg_cmd_cachep);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -3666,6 +3666,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
|
||||
switch (ioprio_class) {
|
||||
default:
|
||||
printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
|
||||
/* fall through */
|
||||
case IOPRIO_CLASS_NONE:
|
||||
/*
|
||||
* no prio set, inherit CPU scheduling settings
|
||||
@ -4735,12 +4736,13 @@ USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
|
||||
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
|
||||
{ \
|
||||
struct cfq_data *cfqd = e->elevator_data; \
|
||||
unsigned int __data; \
|
||||
unsigned int __data, __min = (MIN), __max = (MAX); \
|
||||
\
|
||||
cfq_var_store(&__data, (page)); \
|
||||
if (__data < (MIN)) \
|
||||
__data = (MIN); \
|
||||
else if (__data > (MAX)) \
|
||||
__data = (MAX); \
|
||||
if (__data < __min) \
|
||||
__data = __min; \
|
||||
else if (__data > __max) \
|
||||
__data = __max; \
|
||||
if (__CONV) \
|
||||
*(__PTR) = (u64)__data * NSEC_PER_MSEC; \
|
||||
else \
|
||||
@ -4769,12 +4771,13 @@ STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX,
|
||||
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
|
||||
{ \
|
||||
struct cfq_data *cfqd = e->elevator_data; \
|
||||
unsigned int __data; \
|
||||
unsigned int __data, __min = (MIN), __max = (MAX); \
|
||||
\
|
||||
cfq_var_store(&__data, (page)); \
|
||||
if (__data < (MIN)) \
|
||||
__data = (MIN); \
|
||||
else if (__data > (MAX)) \
|
||||
__data = (MAX); \
|
||||
if (__data < __min) \
|
||||
__data = __min; \
|
||||
else if (__data > __max) \
|
||||
__data = __max; \
|
||||
*(__PTR) = (u64)__data * NSEC_PER_USEC; \
|
||||
return count; \
|
||||
}
|
||||
|
@ -1333,21 +1333,28 @@ static int diskstats_show(struct seq_file *seqf, void *v)
|
||||
part_round_stats(gp->queue, cpu, hd);
|
||||
part_stat_unlock();
|
||||
part_in_flight(gp->queue, hd, inflight);
|
||||
seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
|
||||
"%u %lu %lu %lu %u %u %u %u\n",
|
||||
seq_printf(seqf, "%4d %7d %s "
|
||||
"%lu %lu %lu %u "
|
||||
"%lu %lu %lu %u "
|
||||
"%u %u %u "
|
||||
"%lu %lu %lu %u\n",
|
||||
MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
|
||||
disk_name(gp, hd->partno, buf),
|
||||
part_stat_read(hd, ios[READ]),
|
||||
part_stat_read(hd, merges[READ]),
|
||||
part_stat_read(hd, sectors[READ]),
|
||||
jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
|
||||
part_stat_read(hd, ios[WRITE]),
|
||||
part_stat_read(hd, merges[WRITE]),
|
||||
part_stat_read(hd, sectors[WRITE]),
|
||||
jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
|
||||
part_stat_read(hd, ios[STAT_READ]),
|
||||
part_stat_read(hd, merges[STAT_READ]),
|
||||
part_stat_read(hd, sectors[STAT_READ]),
|
||||
jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])),
|
||||
part_stat_read(hd, ios[STAT_WRITE]),
|
||||
part_stat_read(hd, merges[STAT_WRITE]),
|
||||
part_stat_read(hd, sectors[STAT_WRITE]),
|
||||
jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])),
|
||||
inflight[0],
|
||||
jiffies_to_msecs(part_stat_read(hd, io_ticks)),
|
||||
jiffies_to_msecs(part_stat_read(hd, time_in_queue))
|
||||
jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
|
||||
part_stat_read(hd, ios[STAT_DISCARD]),
|
||||
part_stat_read(hd, merges[STAT_DISCARD]),
|
||||
part_stat_read(hd, sectors[STAT_DISCARD]),
|
||||
jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD]))
|
||||
);
|
||||
}
|
||||
disk_part_iter_exit(&piter);
|
||||
|
@ -130,19 +130,24 @@ ssize_t part_stat_show(struct device *dev,
|
||||
return sprintf(buf,
|
||||
"%8lu %8lu %8llu %8u "
|
||||
"%8lu %8lu %8llu %8u "
|
||||
"%8u %8u %8u"
|
||||
"%8u %8u %8u "
|
||||
"%8lu %8lu %8llu %8u"
|
||||
"\n",
|
||||
part_stat_read(p, ios[READ]),
|
||||
part_stat_read(p, merges[READ]),
|
||||
(unsigned long long)part_stat_read(p, sectors[READ]),
|
||||
jiffies_to_msecs(part_stat_read(p, ticks[READ])),
|
||||
part_stat_read(p, ios[WRITE]),
|
||||
part_stat_read(p, merges[WRITE]),
|
||||
(unsigned long long)part_stat_read(p, sectors[WRITE]),
|
||||
jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
|
||||
part_stat_read(p, ios[STAT_READ]),
|
||||
part_stat_read(p, merges[STAT_READ]),
|
||||
(unsigned long long)part_stat_read(p, sectors[STAT_READ]),
|
||||
jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])),
|
||||
part_stat_read(p, ios[STAT_WRITE]),
|
||||
part_stat_read(p, merges[STAT_WRITE]),
|
||||
(unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
|
||||
jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])),
|
||||
inflight[0],
|
||||
jiffies_to_msecs(part_stat_read(p, io_ticks)),
|
||||
jiffies_to_msecs(part_stat_read(p, time_in_queue)));
|
||||
jiffies_to_msecs(part_stat_read(p, time_in_queue)),
|
||||
part_stat_read(p, ios[STAT_DISCARD]),
|
||||
part_stat_read(p, merges[STAT_DISCARD]),
|
||||
(unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]),
|
||||
jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD])));
|
||||
}
|
||||
|
||||
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
|
||||
|
@ -178,7 +178,7 @@ int aix_partition(struct parsed_partitions *state)
|
||||
u32 vgda_sector = 0;
|
||||
u32 vgda_len = 0;
|
||||
int numlvs = 0;
|
||||
struct pvd *pvd;
|
||||
struct pvd *pvd = NULL;
|
||||
struct lv_info {
|
||||
unsigned short pps_per_lv;
|
||||
unsigned short pps_found;
|
||||
@ -232,10 +232,11 @@ int aix_partition(struct parsed_partitions *state)
|
||||
if (lvip[i].pps_per_lv)
|
||||
foundlvs += 1;
|
||||
}
|
||||
/* pvd loops depend on n[].name and lvip[].pps_per_lv */
|
||||
pvd = alloc_pvd(state, vgda_sector + 17);
|
||||
}
|
||||
put_dev_sector(sect);
|
||||
}
|
||||
pvd = alloc_pvd(state, vgda_sector + 17);
|
||||
if (pvd) {
|
||||
int numpps = be16_to_cpu(pvd->pp_count);
|
||||
int psn_part1 = be32_to_cpu(pvd->psn_part1);
|
||||
@ -282,10 +283,14 @@ int aix_partition(struct parsed_partitions *state)
|
||||
next_lp_ix += 1;
|
||||
}
|
||||
for (i = 0; i < state->limit; i += 1)
|
||||
if (lvip[i].pps_found && !lvip[i].lv_is_contiguous)
|
||||
if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) {
|
||||
char tmp[sizeof(n[i].name) + 1]; // null char
|
||||
|
||||
snprintf(tmp, sizeof(tmp), "%s", n[i].name);
|
||||
pr_warn("partition %s (%u pp's found) is "
|
||||
"not contiguous\n",
|
||||
n[i].name, lvip[i].pps_found);
|
||||
tmp, lvip[i].pps_found);
|
||||
}
|
||||
kfree(pvd);
|
||||
}
|
||||
kfree(n);
|
||||
|
@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
|
||||
{
|
||||
char buf[64];
|
||||
int r_objid, r_name, r_id1, r_id2, len;
|
||||
struct vblk_dgrp *dgrp;
|
||||
|
||||
BUG_ON (!buffer || !vb);
|
||||
|
||||
@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
|
||||
if (len != get_unaligned_be32(buffer + 0x14))
|
||||
return false;
|
||||
|
||||
dgrp = &vb->vblk.dgrp;
|
||||
|
||||
ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
|
||||
return true;
|
||||
}
|
||||
|
110
block/t10-pi.c
110
block/t10-pi.c
@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
|
||||
.verify_fn = t10_pi_type3_verify_ip,
|
||||
};
|
||||
EXPORT_SYMBOL(t10_pi_type3_ip);
|
||||
|
||||
/**
|
||||
* t10_pi_prepare - prepare PI prior submitting request to device
|
||||
* @rq: request with PI that should be prepared
|
||||
* @protection_type: PI type (Type 1/Type 2/Type 3)
|
||||
*
|
||||
* For Type 1/Type 2, the virtual start sector is the one that was
|
||||
* originally submitted by the block layer for the ref_tag usage. Due to
|
||||
* partitioning, MD/DM cloning, etc. the actual physical start sector is
|
||||
* likely to be different. Remap protection information to match the
|
||||
* physical LBA.
|
||||
*
|
||||
* Type 3 does not have a reference tag so no remapping is required.
|
||||
*/
|
||||
void t10_pi_prepare(struct request *rq, u8 protection_type)
|
||||
{
|
||||
const int tuple_sz = rq->q->integrity.tuple_size;
|
||||
u32 ref_tag = t10_pi_ref_tag(rq);
|
||||
struct bio *bio;
|
||||
|
||||
if (protection_type == T10_PI_TYPE3_PROTECTION)
|
||||
return;
|
||||
|
||||
__rq_for_each_bio(bio, rq) {
|
||||
struct bio_integrity_payload *bip = bio_integrity(bio);
|
||||
u32 virt = bip_get_seed(bip) & 0xffffffff;
|
||||
struct bio_vec iv;
|
||||
struct bvec_iter iter;
|
||||
|
||||
/* Already remapped? */
|
||||
if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
|
||||
break;
|
||||
|
||||
bip_for_each_vec(iv, bip, iter) {
|
||||
void *p, *pmap;
|
||||
unsigned int j;
|
||||
|
||||
pmap = kmap_atomic(iv.bv_page);
|
||||
p = pmap + iv.bv_offset;
|
||||
for (j = 0; j < iv.bv_len; j += tuple_sz) {
|
||||
struct t10_pi_tuple *pi = p;
|
||||
|
||||
if (be32_to_cpu(pi->ref_tag) == virt)
|
||||
pi->ref_tag = cpu_to_be32(ref_tag);
|
||||
virt++;
|
||||
ref_tag++;
|
||||
p += tuple_sz;
|
||||
}
|
||||
|
||||
kunmap_atomic(pmap);
|
||||
}
|
||||
|
||||
bip->bip_flags |= BIP_MAPPED_INTEGRITY;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(t10_pi_prepare);
|
||||
|
||||
/**
|
||||
* t10_pi_complete - prepare PI prior returning request to the block layer
|
||||
* @rq: request with PI that should be prepared
|
||||
* @protection_type: PI type (Type 1/Type 2/Type 3)
|
||||
* @intervals: total elements to prepare
|
||||
*
|
||||
* For Type 1/Type 2, the virtual start sector is the one that was
|
||||
* originally submitted by the block layer for the ref_tag usage. Due to
|
||||
* partitioning, MD/DM cloning, etc. the actual physical start sector is
|
||||
* likely to be different. Since the physical start sector was submitted
|
||||
* to the device, we should remap it back to virtual values expected by the
|
||||
* block layer.
|
||||
*
|
||||
* Type 3 does not have a reference tag so no remapping is required.
|
||||
*/
|
||||
void t10_pi_complete(struct request *rq, u8 protection_type,
|
||||
unsigned int intervals)
|
||||
{
|
||||
const int tuple_sz = rq->q->integrity.tuple_size;
|
||||
u32 ref_tag = t10_pi_ref_tag(rq);
|
||||
struct bio *bio;
|
||||
|
||||
if (protection_type == T10_PI_TYPE3_PROTECTION)
|
||||
return;
|
||||
|
||||
__rq_for_each_bio(bio, rq) {
|
||||
struct bio_integrity_payload *bip = bio_integrity(bio);
|
||||
u32 virt = bip_get_seed(bip) & 0xffffffff;
|
||||
struct bio_vec iv;
|
||||
struct bvec_iter iter;
|
||||
|
||||
bip_for_each_vec(iv, bip, iter) {
|
||||
void *p, *pmap;
|
||||
unsigned int j;
|
||||
|
||||
pmap = kmap_atomic(iv.bv_page);
|
||||
p = pmap + iv.bv_offset;
|
||||
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
|
||||
struct t10_pi_tuple *pi = p;
|
||||
|
||||
if (be32_to_cpu(pi->ref_tag) == ref_tag)
|
||||
pi->ref_tag = cpu_to_be32(virt);
|
||||
virt++;
|
||||
ref_tag++;
|
||||
intervals--;
|
||||
p += tuple_sz;
|
||||
}
|
||||
|
||||
kunmap_atomic(pmap);
|
||||
}
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(t10_pi_complete);
|
||||
|
@ -76,7 +76,7 @@ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
|
||||
obj-$(CONFIG_NUBUS) += nubus/
|
||||
obj-y += macintosh/
|
||||
obj-$(CONFIG_IDE) += ide/
|
||||
obj-$(CONFIG_SCSI) += scsi/
|
||||
obj-y += scsi/
|
||||
obj-y += nvme/
|
||||
obj-$(CONFIG_ATA) += ata/
|
||||
obj-$(CONFIG_TARGET_CORE) += target/
|
||||
|
@ -597,8 +597,9 @@ static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev,
|
||||
int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
|
||||
{
|
||||
int rc = 0;
|
||||
u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
|
||||
u8 scsi_cmd[MAX_COMMAND_SIZE];
|
||||
u8 args[4], *argbuf = NULL, *sensebuf = NULL;
|
||||
u8 args[4], *argbuf = NULL;
|
||||
int argsize = 0;
|
||||
enum dma_data_direction data_dir;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
@ -610,10 +611,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
|
||||
if (copy_from_user(args, arg, sizeof(args)))
|
||||
return -EFAULT;
|
||||
|
||||
sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);
|
||||
if (!sensebuf)
|
||||
return -ENOMEM;
|
||||
|
||||
memset(sensebuf, 0, sizeof(sensebuf));
|
||||
memset(scsi_cmd, 0, sizeof(scsi_cmd));
|
||||
|
||||
if (args[3]) {
|
||||
@ -685,7 +683,6 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
|
||||
&& copy_to_user(arg + sizeof(args), argbuf, argsize))
|
||||
rc = -EFAULT;
|
||||
error:
|
||||
kfree(sensebuf);
|
||||
kfree(argbuf);
|
||||
return rc;
|
||||
}
|
||||
@ -704,8 +701,9 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
|
||||
int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
|
||||
{
|
||||
int rc = 0;
|
||||
u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
|
||||
u8 scsi_cmd[MAX_COMMAND_SIZE];
|
||||
u8 args[7], *sensebuf = NULL;
|
||||
u8 args[7];
|
||||
struct scsi_sense_hdr sshdr;
|
||||
int cmd_result;
|
||||
|
||||
@ -715,10 +713,7 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
|
||||
if (copy_from_user(args, arg, sizeof(args)))
|
||||
return -EFAULT;
|
||||
|
||||
sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);
|
||||
if (!sensebuf)
|
||||
return -ENOMEM;
|
||||
|
||||
memset(sensebuf, 0, sizeof(sensebuf));
|
||||
memset(scsi_cmd, 0, sizeof(scsi_cmd));
|
||||
scsi_cmd[0] = ATA_16;
|
||||
scsi_cmd[1] = (3 << 1); /* Non-data */
|
||||
@ -769,7 +764,6 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
|
||||
}
|
||||
|
||||
error:
|
||||
kfree(sensebuf);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
#define DAC960_DriverDate "21 Aug 2007"
|
||||
|
||||
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/miscdevice.h>
|
||||
@ -6426,7 +6427,7 @@ static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller,
|
||||
return true;
|
||||
}
|
||||
|
||||
static int dac960_proc_show(struct seq_file *m, void *v)
|
||||
static int __maybe_unused dac960_proc_show(struct seq_file *m, void *v)
|
||||
{
|
||||
unsigned char *StatusMessage = "OK\n";
|
||||
int ControllerNumber;
|
||||
@ -6446,14 +6447,16 @@ static int dac960_proc_show(struct seq_file *m, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dac960_initial_status_proc_show(struct seq_file *m, void *v)
|
||||
static int __maybe_unused dac960_initial_status_proc_show(struct seq_file *m,
|
||||
void *v)
|
||||
{
|
||||
DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
|
||||
seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dac960_current_status_proc_show(struct seq_file *m, void *v)
|
||||
static int __maybe_unused dac960_current_status_proc_show(struct seq_file *m,
|
||||
void *v)
|
||||
{
|
||||
DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private;
|
||||
unsigned char *StatusMessage =
|
||||
|
@ -74,12 +74,12 @@ config AMIGA_Z2RAM
|
||||
|
||||
config CDROM
|
||||
tristate
|
||||
select BLK_SCSI_REQUEST
|
||||
|
||||
config GDROM
|
||||
tristate "SEGA Dreamcast GD-ROM drive"
|
||||
depends on SH_DREAMCAST
|
||||
select CDROM
|
||||
select BLK_SCSI_REQUEST # only for the generic cdrom code
|
||||
help
|
||||
A standard SEGA Dreamcast comes with a modified CD ROM drive called a
|
||||
"GD-ROM" by SEGA to signify it is capable of reading special disks
|
||||
|
@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
|
||||
obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
|
||||
|
||||
obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
|
||||
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
|
||||
obj-$(CONFIG_ZRAM) += zram/
|
||||
|
||||
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
|
||||
null_blk-objs := null_blk_main.o
|
||||
null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o
|
||||
|
||||
skd-y := skd_main.o
|
||||
swim_mod-y := swim.o swim_asm.o
|
||||
|
@ -1137,6 +1137,7 @@ noskb: if (buf)
|
||||
break;
|
||||
}
|
||||
bvcpy(skb, f->buf->bio, f->iter, n);
|
||||
/* fall through */
|
||||
case ATA_CMD_PIO_WRITE:
|
||||
case ATA_CMD_PIO_WRITE_EXT:
|
||||
spin_lock_irq(&d->lock);
|
||||
|
@ -284,7 +284,7 @@ freedev(struct aoedev *d)
|
||||
e = t + d->ntargets;
|
||||
for (; t < e && *t; t++)
|
||||
freetgt(d, *t);
|
||||
if (d->bufpool)
|
||||
|
||||
mempool_destroy(d->bufpool);
|
||||
skbpoolfree(d);
|
||||
minor_free(d->sysminor);
|
||||
|
@ -254,20 +254,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
|
||||
* Process a single bvec of a bio.
|
||||
*/
|
||||
static int brd_do_bvec(struct brd_device *brd, struct page *page,
|
||||
unsigned int len, unsigned int off, bool is_write,
|
||||
unsigned int len, unsigned int off, unsigned int op,
|
||||
sector_t sector)
|
||||
{
|
||||
void *mem;
|
||||
int err = 0;
|
||||
|
||||
if (is_write) {
|
||||
if (op_is_write(op)) {
|
||||
err = copy_to_brd_setup(brd, sector, len);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
|
||||
mem = kmap_atomic(page);
|
||||
if (!is_write) {
|
||||
if (!op_is_write(op)) {
|
||||
copy_from_brd(mem + off, brd, sector, len);
|
||||
flush_dcache_page(page);
|
||||
} else {
|
||||
@ -296,7 +296,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
|
||||
int err;
|
||||
|
||||
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
|
||||
op_is_write(bio_op(bio)), sector);
|
||||
bio_op(bio), sector);
|
||||
if (err)
|
||||
goto io_error;
|
||||
sector += len >> SECTOR_SHIFT;
|
||||
@ -310,15 +310,15 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
|
||||
}
|
||||
|
||||
static int brd_rw_page(struct block_device *bdev, sector_t sector,
|
||||
struct page *page, bool is_write)
|
||||
struct page *page, unsigned int op)
|
||||
{
|
||||
struct brd_device *brd = bdev->bd_disk->private_data;
|
||||
int err;
|
||||
|
||||
if (PageTransHuge(page))
|
||||
return -ENOTSUPP;
|
||||
err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
|
||||
page_endio(page, is_write, err);
|
||||
err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
|
||||
page_endio(page, op_is_write(op), err);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -55,12 +55,10 @@
|
||||
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
|
||||
# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
|
||||
# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
|
||||
# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
|
||||
#else
|
||||
# define __protected_by(x)
|
||||
# define __protected_read_by(x)
|
||||
# define __protected_write_by(x)
|
||||
# define __must_hold(x)
|
||||
#endif
|
||||
|
||||
/* shared module parameters, defined in drbd_main.c */
|
||||
|
@ -2103,13 +2103,9 @@ static void drbd_destroy_mempools(void)
|
||||
mempool_exit(&drbd_md_io_page_pool);
|
||||
mempool_exit(&drbd_ee_mempool);
|
||||
mempool_exit(&drbd_request_mempool);
|
||||
if (drbd_ee_cache)
|
||||
kmem_cache_destroy(drbd_ee_cache);
|
||||
if (drbd_request_cache)
|
||||
kmem_cache_destroy(drbd_request_cache);
|
||||
if (drbd_bm_ext_cache)
|
||||
kmem_cache_destroy(drbd_bm_ext_cache);
|
||||
if (drbd_al_ext_cache)
|
||||
kmem_cache_destroy(drbd_al_ext_cache);
|
||||
|
||||
drbd_ee_cache = NULL;
|
||||
|
@ -2674,8 +2674,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
|
||||
if (c_min_rate == 0)
|
||||
return false;
|
||||
|
||||
curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
|
||||
(int)part_stat_read(&disk->part0, sectors[1]) -
|
||||
curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
|
||||
atomic_read(&device->rs_sect_ev);
|
||||
|
||||
if (atomic_read(&device->ap_actlog_cnt)
|
||||
@ -2790,6 +2789,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
|
||||
then we would do something smarter here than reading
|
||||
the block... */
|
||||
peer_req->flags |= EE_RS_THIN_REQ;
|
||||
/* fall through */
|
||||
case P_RS_DATA_REQUEST:
|
||||
peer_req->w.cb = w_e_end_rsdata_req;
|
||||
fault_type = DRBD_FAULT_RS_RD;
|
||||
@ -2968,6 +2968,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
|
||||
/* Else fall through to one of the other strategies... */
|
||||
drbd_warn(device, "Discard younger/older primary did not find a decision\n"
|
||||
"Using discard-least-changes instead\n");
|
||||
/* fall through */
|
||||
case ASB_DISCARD_ZERO_CHG:
|
||||
if (ch_peer == 0 && ch_self == 0) {
|
||||
rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
|
||||
@ -2979,6 +2980,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
|
||||
}
|
||||
if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
|
||||
break;
|
||||
/* else: fall through */
|
||||
case ASB_DISCARD_LEAST_CHG:
|
||||
if (ch_self < ch_peer)
|
||||
rv = -1;
|
||||
|
@ -38,7 +38,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
|
||||
{
|
||||
struct request_queue *q = device->rq_queue;
|
||||
|
||||
generic_start_io_acct(q, bio_data_dir(req->master_bio),
|
||||
generic_start_io_acct(q, bio_op(req->master_bio),
|
||||
req->i.size >> 9, &device->vdisk->part0);
|
||||
}
|
||||
|
||||
@ -47,7 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
|
||||
{
|
||||
struct request_queue *q = device->rq_queue;
|
||||
|
||||
generic_end_io_acct(q, bio_data_dir(req->master_bio),
|
||||
generic_end_io_acct(q, bio_op(req->master_bio),
|
||||
&device->vdisk->part0, req->start_jif);
|
||||
}
|
||||
|
||||
|
@ -1690,9 +1690,7 @@ void drbd_rs_controller_reset(struct drbd_device *device)
|
||||
atomic_set(&device->rs_sect_in, 0);
|
||||
atomic_set(&device->rs_sect_ev, 0);
|
||||
device->rs_in_flight = 0;
|
||||
device->rs_last_events =
|
||||
(int)part_stat_read(&disk->part0, sectors[0]) +
|
||||
(int)part_stat_read(&disk->part0, sectors[1]);
|
||||
device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors);
|
||||
|
||||
/* Updating the RCU protected object in place is necessary since
|
||||
this function gets called from atomic context.
|
||||
|
@ -1461,7 +1461,6 @@ static void setup_rw_floppy(void)
|
||||
int i;
|
||||
int r;
|
||||
int flags;
|
||||
int dflags;
|
||||
unsigned long ready_date;
|
||||
void (*function)(void);
|
||||
|
||||
@ -1485,8 +1484,6 @@ static void setup_rw_floppy(void)
|
||||
if (fd_wait_for_completion(ready_date, function))
|
||||
return;
|
||||
}
|
||||
dflags = DRS->flags;
|
||||
|
||||
if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE))
|
||||
setup_DMA();
|
||||
|
||||
|
@ -690,7 +690,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
|
||||
unsigned int arg)
|
||||
{
|
||||
struct file *file, *old_file;
|
||||
struct inode *inode;
|
||||
int error;
|
||||
|
||||
error = -ENXIO;
|
||||
@ -711,7 +710,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
|
||||
if (error)
|
||||
goto out_putf;
|
||||
|
||||
inode = file->f_mapping->host;
|
||||
old_file = lo->lo_backing_file;
|
||||
|
||||
error = -EINVAL;
|
||||
@ -1611,6 +1609,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
case LOOP_GET_STATUS64:
|
||||
case LOOP_SET_STATUS64:
|
||||
arg = (unsigned long) compat_ptr(arg);
|
||||
/* fall through */
|
||||
case LOOP_SET_FD:
|
||||
case LOOP_CHANGE_FD:
|
||||
case LOOP_SET_BLOCK_SIZE:
|
||||
|
@ -2575,7 +2575,6 @@ static int mtip_hw_debugfs_init(struct driver_data *dd)
|
||||
|
||||
static void mtip_hw_debugfs_exit(struct driver_data *dd)
|
||||
{
|
||||
if (dd->dfs_node)
|
||||
debugfs_remove_recursive(dd->dfs_node);
|
||||
}
|
||||
|
||||
|
108
drivers/block/null_blk.h
Normal file
108
drivers/block/null_blk.h
Normal file
@ -0,0 +1,108 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef __BLK_NULL_BLK_H
|
||||
#define __BLK_NULL_BLK_H
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/hrtimer.h>
|
||||
#include <linux/configfs.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/fault-inject.h>
|
||||
|
||||
struct nullb_cmd {
|
||||
struct list_head list;
|
||||
struct llist_node ll_list;
|
||||
struct __call_single_data csd;
|
||||
struct request *rq;
|
||||
struct bio *bio;
|
||||
unsigned int tag;
|
||||
blk_status_t error;
|
||||
struct nullb_queue *nq;
|
||||
struct hrtimer timer;
|
||||
};
|
||||
|
||||
struct nullb_queue {
|
||||
unsigned long *tag_map;
|
||||
wait_queue_head_t wait;
|
||||
unsigned int queue_depth;
|
||||
struct nullb_device *dev;
|
||||
unsigned int requeue_selection;
|
||||
|
||||
struct nullb_cmd *cmds;
|
||||
};
|
||||
|
||||
struct nullb_device {
|
||||
struct nullb *nullb;
|
||||
struct config_item item;
|
||||
struct radix_tree_root data; /* data stored in the disk */
|
||||
struct radix_tree_root cache; /* disk cache data */
|
||||
unsigned long flags; /* device flags */
|
||||
unsigned int curr_cache;
|
||||
struct badblocks badblocks;
|
||||
|
||||
unsigned int nr_zones;
|
||||
struct blk_zone *zones;
|
||||
sector_t zone_size_sects;
|
||||
|
||||
unsigned long size; /* device size in MB */
|
||||
unsigned long completion_nsec; /* time in ns to complete a request */
|
||||
unsigned long cache_size; /* disk cache size in MB */
|
||||
unsigned long zone_size; /* zone size in MB if device is zoned */
|
||||
unsigned int submit_queues; /* number of submission queues */
|
||||
unsigned int home_node; /* home node for the device */
|
||||
unsigned int queue_mode; /* block interface */
|
||||
unsigned int blocksize; /* block size */
|
||||
unsigned int irqmode; /* IRQ completion handler */
|
||||
unsigned int hw_queue_depth; /* queue depth */
|
||||
unsigned int index; /* index of the disk, only valid with a disk */
|
||||
unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
|
||||
bool blocking; /* blocking blk-mq device */
|
||||
bool use_per_node_hctx; /* use per-node allocation for hardware context */
|
||||
bool power; /* power on/off the device */
|
||||
bool memory_backed; /* if data is stored in memory */
|
||||
bool discard; /* if support discard */
|
||||
bool zoned; /* if device is zoned */
|
||||
};
|
||||
|
||||
struct nullb {
|
||||
struct nullb_device *dev;
|
||||
struct list_head list;
|
||||
unsigned int index;
|
||||
struct request_queue *q;
|
||||
struct gendisk *disk;
|
||||
struct blk_mq_tag_set *tag_set;
|
||||
struct blk_mq_tag_set __tag_set;
|
||||
unsigned int queue_depth;
|
||||
atomic_long_t cur_bytes;
|
||||
struct hrtimer bw_timer;
|
||||
unsigned long cache_flush_pos;
|
||||
spinlock_t lock;
|
||||
|
||||
struct nullb_queue *queues;
|
||||
unsigned int nr_queues;
|
||||
char disk_name[DISK_NAME_LEN];
|
||||
};
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
int null_zone_init(struct nullb_device *dev);
|
||||
void null_zone_exit(struct nullb_device *dev);
|
||||
blk_status_t null_zone_report(struct nullb *nullb,
|
||||
struct nullb_cmd *cmd);
|
||||
void null_zone_write(struct nullb_cmd *cmd);
|
||||
void null_zone_reset(struct nullb_cmd *cmd);
|
||||
#else
|
||||
static inline int null_zone_init(struct nullb_device *dev)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
static inline void null_zone_exit(struct nullb_device *dev) {}
|
||||
static inline blk_status_t null_zone_report(struct nullb *nullb,
|
||||
struct nullb_cmd *cmd)
|
||||
{
|
||||
return BLK_STS_NOTSUPP;
|
||||
}
|
||||
static inline void null_zone_write(struct nullb_cmd *cmd) {}
|
||||
static inline void null_zone_reset(struct nullb_cmd *cmd) {}
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
#endif /* __BLK_NULL_BLK_H */
|
@ -7,14 +7,8 @@
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/hrtimer.h>
|
||||
#include <linux/configfs.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/fault-inject.h>
|
||||
#include "null_blk.h"
|
||||
|
||||
#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
|
||||
#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
|
||||
@ -35,28 +29,6 @@ static inline u64 mb_per_tick(int mbps)
|
||||
return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
|
||||
}
|
||||
|
||||
struct nullb_cmd {
|
||||
struct list_head list;
|
||||
struct llist_node ll_list;
|
||||
struct __call_single_data csd;
|
||||
struct request *rq;
|
||||
struct bio *bio;
|
||||
unsigned int tag;
|
||||
blk_status_t error;
|
||||
struct nullb_queue *nq;
|
||||
struct hrtimer timer;
|
||||
};
|
||||
|
||||
struct nullb_queue {
|
||||
unsigned long *tag_map;
|
||||
wait_queue_head_t wait;
|
||||
unsigned int queue_depth;
|
||||
struct nullb_device *dev;
|
||||
unsigned int requeue_selection;
|
||||
|
||||
struct nullb_cmd *cmds;
|
||||
};
|
||||
|
||||
/*
|
||||
* Status flags for nullb_device.
|
||||
*
|
||||
@ -92,52 +64,6 @@ struct nullb_page {
|
||||
#define NULLB_PAGE_LOCK (MAP_SZ - 1)
|
||||
#define NULLB_PAGE_FREE (MAP_SZ - 2)
|
||||
|
||||
struct nullb_device {
|
||||
struct nullb *nullb;
|
||||
struct config_item item;
|
||||
struct radix_tree_root data; /* data stored in the disk */
|
||||
struct radix_tree_root cache; /* disk cache data */
|
||||
unsigned long flags; /* device flags */
|
||||
unsigned int curr_cache;
|
||||
struct badblocks badblocks;
|
||||
|
||||
unsigned long size; /* device size in MB */
|
||||
unsigned long completion_nsec; /* time in ns to complete a request */
|
||||
unsigned long cache_size; /* disk cache size in MB */
|
||||
unsigned int submit_queues; /* number of submission queues */
|
||||
unsigned int home_node; /* home node for the device */
|
||||
unsigned int queue_mode; /* block interface */
|
||||
unsigned int blocksize; /* block size */
|
||||
unsigned int irqmode; /* IRQ completion handler */
|
||||
unsigned int hw_queue_depth; /* queue depth */
|
||||
unsigned int index; /* index of the disk, only valid with a disk */
|
||||
unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
|
||||
bool blocking; /* blocking blk-mq device */
|
||||
bool use_per_node_hctx; /* use per-node allocation for hardware context */
|
||||
bool power; /* power on/off the device */
|
||||
bool memory_backed; /* if data is stored in memory */
|
||||
bool discard; /* if support discard */
|
||||
};
|
||||
|
||||
struct nullb {
|
||||
struct nullb_device *dev;
|
||||
struct list_head list;
|
||||
unsigned int index;
|
||||
struct request_queue *q;
|
||||
struct gendisk *disk;
|
||||
struct blk_mq_tag_set *tag_set;
|
||||
struct blk_mq_tag_set __tag_set;
|
||||
unsigned int queue_depth;
|
||||
atomic_long_t cur_bytes;
|
||||
struct hrtimer bw_timer;
|
||||
unsigned long cache_flush_pos;
|
||||
spinlock_t lock;
|
||||
|
||||
struct nullb_queue *queues;
|
||||
unsigned int nr_queues;
|
||||
char disk_name[DISK_NAME_LEN];
|
||||
};
|
||||
|
||||
static LIST_HEAD(nullb_list);
|
||||
static struct mutex lock;
|
||||
static int null_major;
|
||||
@ -254,6 +180,14 @@ static bool g_use_per_node_hctx;
|
||||
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
|
||||
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
|
||||
|
||||
static bool g_zoned;
|
||||
module_param_named(zoned, g_zoned, bool, S_IRUGO);
|
||||
MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
|
||||
|
||||
static unsigned long g_zone_size = 256;
|
||||
module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
|
||||
MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
|
||||
|
||||
static struct nullb_device *null_alloc_dev(void);
|
||||
static void null_free_dev(struct nullb_device *dev);
|
||||
static void null_del_dev(struct nullb *nullb);
|
||||
@ -357,6 +291,8 @@ NULLB_DEVICE_ATTR(memory_backed, bool);
|
||||
NULLB_DEVICE_ATTR(discard, bool);
|
||||
NULLB_DEVICE_ATTR(mbps, uint);
|
||||
NULLB_DEVICE_ATTR(cache_size, ulong);
|
||||
NULLB_DEVICE_ATTR(zoned, bool);
|
||||
NULLB_DEVICE_ATTR(zone_size, ulong);
|
||||
|
||||
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
|
||||
{
|
||||
@ -390,6 +326,7 @@ static ssize_t nullb_device_power_store(struct config_item *item,
|
||||
null_del_dev(dev->nullb);
|
||||
mutex_unlock(&lock);
|
||||
clear_bit(NULLB_DEV_FL_UP, &dev->flags);
|
||||
clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
|
||||
}
|
||||
|
||||
return count;
|
||||
@ -468,6 +405,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
|
||||
&nullb_device_attr_mbps,
|
||||
&nullb_device_attr_cache_size,
|
||||
&nullb_device_attr_badblocks,
|
||||
&nullb_device_attr_zoned,
|
||||
&nullb_device_attr_zone_size,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -520,7 +459,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
|
||||
|
||||
static ssize_t memb_group_features_show(struct config_item *item, char *page)
|
||||
{
|
||||
return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n");
|
||||
return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n");
|
||||
}
|
||||
|
||||
CONFIGFS_ATTR_RO(memb_group_, features);
|
||||
@ -579,6 +518,8 @@ static struct nullb_device *null_alloc_dev(void)
|
||||
dev->hw_queue_depth = g_hw_queue_depth;
|
||||
dev->blocking = g_blocking;
|
||||
dev->use_per_node_hctx = g_use_per_node_hctx;
|
||||
dev->zoned = g_zoned;
|
||||
dev->zone_size = g_zone_size;
|
||||
return dev;
|
||||
}
|
||||
|
||||
@ -587,6 +528,7 @@ static void null_free_dev(struct nullb_device *dev)
|
||||
if (!dev)
|
||||
return;
|
||||
|
||||
null_zone_exit(dev);
|
||||
badblocks_exit(&dev->badblocks);
|
||||
kfree(dev);
|
||||
}
|
||||
@ -863,6 +805,8 @@ static struct nullb_page *null_lookup_page(struct nullb *nullb,
|
||||
|
||||
static struct nullb_page *null_insert_page(struct nullb *nullb,
|
||||
sector_t sector, bool ignore_cache)
|
||||
__releases(&nullb->lock)
|
||||
__acquires(&nullb->lock)
|
||||
{
|
||||
u64 idx;
|
||||
struct nullb_page *t_page;
|
||||
@ -1219,6 +1163,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
|
||||
struct nullb *nullb = dev->nullb;
|
||||
int err = 0;
|
||||
|
||||
if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) {
|
||||
cmd->error = null_zone_report(nullb, cmd);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
|
||||
struct request *rq = cmd->rq;
|
||||
|
||||
@ -1283,6 +1232,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
|
||||
}
|
||||
}
|
||||
cmd->error = errno_to_blk_status(err);
|
||||
|
||||
if (!cmd->error && dev->zoned) {
|
||||
if (req_op(cmd->rq) == REQ_OP_WRITE)
|
||||
null_zone_write(cmd);
|
||||
else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET)
|
||||
null_zone_reset(cmd);
|
||||
}
|
||||
out:
|
||||
/* Complete IO by inline, softirq or timer */
|
||||
switch (dev->irqmode) {
|
||||
@ -1810,6 +1766,15 @@ static int null_add_dev(struct nullb_device *dev)
|
||||
blk_queue_flush_queueable(nullb->q, true);
|
||||
}
|
||||
|
||||
if (dev->zoned) {
|
||||
rv = null_zone_init(dev);
|
||||
if (rv)
|
||||
goto out_cleanup_blk_queue;
|
||||
|
||||
blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
|
||||
nullb->q->limits.zoned = BLK_ZONED_HM;
|
||||
}
|
||||
|
||||
nullb->q->queuedata = nullb;
|
||||
blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
|
||||
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
|
||||
@ -1828,13 +1793,16 @@ static int null_add_dev(struct nullb_device *dev)
|
||||
|
||||
rv = null_gendisk_register(nullb);
|
||||
if (rv)
|
||||
goto out_cleanup_blk_queue;
|
||||
goto out_cleanup_zone;
|
||||
|
||||
mutex_lock(&lock);
|
||||
list_add_tail(&nullb->list, &nullb_list);
|
||||
mutex_unlock(&lock);
|
||||
|
||||
return 0;
|
||||
out_cleanup_zone:
|
||||
if (dev->zoned)
|
||||
null_zone_exit(dev);
|
||||
out_cleanup_blk_queue:
|
||||
blk_cleanup_queue(nullb->q);
|
||||
out_cleanup_tags:
|
||||
@ -1861,6 +1829,11 @@ static int __init null_init(void)
|
||||
g_bs = PAGE_SIZE;
|
||||
}
|
||||
|
||||
if (!is_power_of_2(g_zone_size)) {
|
||||
pr_err("null_blk: zone_size must be power-of-two\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
|
||||
if (g_submit_queues != nr_online_nodes) {
|
||||
pr_warn("null_blk: submit_queues param is set to %u.\n",
|
149
drivers/block/null_blk_zoned.c
Normal file
149
drivers/block/null_blk_zoned.c
Normal file
@ -0,0 +1,149 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/vmalloc.h>
|
||||
#include "null_blk.h"
|
||||
|
||||
/* zone_size in MBs to sectors. */
|
||||
#define ZONE_SIZE_SHIFT 11
|
||||
|
||||
/*
 * Map a sector to the index of the zone that contains it.
 *
 * The zone size in sectors is a power of two (null_zone_init() rejects any
 * other zone_size), so the division is done as a right shift by its log2.
 */
static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
{
	const int zone_shift = ilog2(dev->zone_size_sects);

	return sect >> zone_shift;
}
|
||||
|
||||
int null_zone_init(struct nullb_device *dev)
|
||||
{
|
||||
sector_t dev_size = (sector_t)dev->size * 1024 * 1024;
|
||||
sector_t sector = 0;
|
||||
unsigned int i;
|
||||
|
||||
if (!is_power_of_2(dev->zone_size)) {
|
||||
pr_err("null_blk: zone_size must be power-of-two\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
|
||||
dev->nr_zones = dev_size >>
|
||||
(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
|
||||
dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
|
||||
GFP_KERNEL | __GFP_ZERO);
|
||||
if (!dev->zones)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < dev->nr_zones; i++) {
|
||||
struct blk_zone *zone = &dev->zones[i];
|
||||
|
||||
zone->start = zone->wp = sector;
|
||||
zone->len = dev->zone_size_sects;
|
||||
zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
|
||||
zone->cond = BLK_ZONE_COND_EMPTY;
|
||||
|
||||
sector += dev->zone_size_sects;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void null_zone_exit(struct nullb_device *dev)
|
||||
{
|
||||
kvfree(dev->zones);
|
||||
}
|
||||
|
||||
static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq,
|
||||
unsigned int zno, unsigned int nr_zones)
|
||||
{
|
||||
struct blk_zone_report_hdr *hdr = NULL;
|
||||
struct bio_vec bvec;
|
||||
struct bvec_iter iter;
|
||||
void *addr;
|
||||
unsigned int zones_to_cpy;
|
||||
|
||||
bio_for_each_segment(bvec, rq->bio, iter) {
|
||||
addr = kmap_atomic(bvec.bv_page);
|
||||
|
||||
zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone);
|
||||
|
||||
if (!hdr) {
|
||||
hdr = (struct blk_zone_report_hdr *)addr;
|
||||
hdr->nr_zones = nr_zones;
|
||||
zones_to_cpy--;
|
||||
addr += sizeof(struct blk_zone_report_hdr);
|
||||
}
|
||||
|
||||
zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones);
|
||||
|
||||
memcpy(addr, &dev->zones[zno],
|
||||
zones_to_cpy * sizeof(struct blk_zone));
|
||||
|
||||
kunmap_atomic(addr);
|
||||
|
||||
nr_zones -= zones_to_cpy;
|
||||
zno += zones_to_cpy;
|
||||
|
||||
if (!nr_zones)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
blk_status_t null_zone_report(struct nullb *nullb,
|
||||
struct nullb_cmd *cmd)
|
||||
{
|
||||
struct nullb_device *dev = nullb->dev;
|
||||
struct request *rq = cmd->rq;
|
||||
unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
|
||||
unsigned int nr_zones = dev->nr_zones - zno;
|
||||
unsigned int max_zones = (blk_rq_bytes(rq) /
|
||||
sizeof(struct blk_zone)) - 1;
|
||||
|
||||
nr_zones = min_t(unsigned int, nr_zones, max_zones);
|
||||
|
||||
null_zone_fill_rq(nullb->dev, rq, zno, nr_zones);
|
||||
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
void null_zone_write(struct nullb_cmd *cmd)
|
||||
{
|
||||
struct nullb_device *dev = cmd->nq->dev;
|
||||
struct request *rq = cmd->rq;
|
||||
sector_t sector = blk_rq_pos(rq);
|
||||
unsigned int rq_sectors = blk_rq_sectors(rq);
|
||||
unsigned int zno = null_zone_no(dev, sector);
|
||||
struct blk_zone *zone = &dev->zones[zno];
|
||||
|
||||
switch (zone->cond) {
|
||||
case BLK_ZONE_COND_FULL:
|
||||
/* Cannot write to a full zone */
|
||||
cmd->error = BLK_STS_IOERR;
|
||||
break;
|
||||
case BLK_ZONE_COND_EMPTY:
|
||||
case BLK_ZONE_COND_IMP_OPEN:
|
||||
/* Writes must be at the write pointer position */
|
||||
if (blk_rq_pos(rq) != zone->wp) {
|
||||
cmd->error = BLK_STS_IOERR;
|
||||
break;
|
||||
}
|
||||
|
||||
if (zone->cond == BLK_ZONE_COND_EMPTY)
|
||||
zone->cond = BLK_ZONE_COND_IMP_OPEN;
|
||||
|
||||
zone->wp += rq_sectors;
|
||||
if (zone->wp == zone->start + zone->len)
|
||||
zone->cond = BLK_ZONE_COND_FULL;
|
||||
break;
|
||||
default:
|
||||
/* Invalid zone condition */
|
||||
cmd->error = BLK_STS_IOERR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void null_zone_reset(struct nullb_cmd *cmd)
|
||||
{
|
||||
struct nullb_device *dev = cmd->nq->dev;
|
||||
struct request *rq = cmd->rq;
|
||||
unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
|
||||
struct blk_zone *zone = &dev->zones[zno];
|
||||
|
||||
zone->cond = BLK_ZONE_COND_EMPTY;
|
||||
zone->wp = zone->start;
|
||||
}
|
@ -347,7 +347,7 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
|
||||
|
||||
static void bpck_read_eeprom ( PIA *pi, char * buf )
|
||||
|
||||
{ int i,j,k,n,p,v,f, om, od;
|
||||
{ int i, j, k, p, v, f, om, od;
|
||||
|
||||
bpck_force_spp(pi);
|
||||
|
||||
@ -356,7 +356,6 @@ static void bpck_read_eeprom ( PIA *pi, char * buf )
|
||||
|
||||
bpck_connect(pi);
|
||||
|
||||
n = 0;
|
||||
WR(4,0);
|
||||
for (i=0;i<64;i++) {
|
||||
WR(6,8);
|
||||
|
@ -426,6 +426,7 @@ static void run_fsm(void)
|
||||
pd_claimed = 1;
|
||||
if (!pi_schedule_claimed(pi_current, run_fsm))
|
||||
return;
|
||||
/* fall through */
|
||||
case 1:
|
||||
pd_claimed = 2;
|
||||
pi_current->proto->connect(pi_current);
|
||||
@ -445,6 +446,7 @@ static void run_fsm(void)
|
||||
spin_unlock_irqrestore(&pd_lock, saved_flags);
|
||||
if (stop)
|
||||
return;
|
||||
/* fall through */
|
||||
case Hold:
|
||||
schedule_fsm();
|
||||
return;
|
||||
|
@ -67,7 +67,7 @@
|
||||
#include <scsi/scsi.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/device.h>
|
||||
|
||||
#include <linux/nospec.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
#define DRIVER_NAME "pktcdvd"
|
||||
@ -748,13 +748,13 @@ static const char *sense_key_string(__u8 index)
|
||||
static void pkt_dump_sense(struct pktcdvd_device *pd,
|
||||
struct packet_command *cgc)
|
||||
{
|
||||
struct request_sense *sense = cgc->sense;
|
||||
struct scsi_sense_hdr *sshdr = cgc->sshdr;
|
||||
|
||||
if (sense)
|
||||
if (sshdr)
|
||||
pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
|
||||
CDROM_PACKET_SIZE, cgc->cmd,
|
||||
sense->sense_key, sense->asc, sense->ascq,
|
||||
sense_key_string(sense->sense_key));
|
||||
sshdr->sense_key, sshdr->asc, sshdr->ascq,
|
||||
sense_key_string(sshdr->sense_key));
|
||||
else
|
||||
pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
|
||||
}
|
||||
@ -787,18 +787,19 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
|
||||
unsigned write_speed, unsigned read_speed)
|
||||
{
|
||||
struct packet_command cgc;
|
||||
struct request_sense sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
int ret;
|
||||
|
||||
init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
|
||||
cgc.sense = &sense;
|
||||
cgc.sshdr = &sshdr;
|
||||
cgc.cmd[0] = GPCMD_SET_SPEED;
|
||||
cgc.cmd[2] = (read_speed >> 8) & 0xff;
|
||||
cgc.cmd[3] = read_speed & 0xff;
|
||||
cgc.cmd[4] = (write_speed >> 8) & 0xff;
|
||||
cgc.cmd[5] = write_speed & 0xff;
|
||||
|
||||
if ((ret = pkt_generic_packet(pd, &cgc)))
|
||||
ret = pkt_generic_packet(pd, &cgc);
|
||||
if (ret)
|
||||
pkt_dump_sense(pd, &cgc);
|
||||
|
||||
return ret;
|
||||
@ -1562,7 +1563,8 @@ static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
|
||||
cgc.cmd[8] = cgc.buflen = 2;
|
||||
cgc.quiet = 1;
|
||||
|
||||
if ((ret = pkt_generic_packet(pd, &cgc)))
|
||||
ret = pkt_generic_packet(pd, &cgc);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* not all drives have the same disc_info length, so requeue
|
||||
@ -1591,7 +1593,8 @@ static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type,
|
||||
cgc.cmd[8] = 8;
|
||||
cgc.quiet = 1;
|
||||
|
||||
if ((ret = pkt_generic_packet(pd, &cgc)))
|
||||
ret = pkt_generic_packet(pd, &cgc);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
cgc.buflen = be16_to_cpu(ti->track_information_length) +
|
||||
@ -1612,17 +1615,20 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
|
||||
__u32 last_track;
|
||||
int ret = -1;
|
||||
|
||||
if ((ret = pkt_get_disc_info(pd, &di)))
|
||||
ret = pkt_get_disc_info(pd, &di);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
last_track = (di.last_track_msb << 8) | di.last_track_lsb;
|
||||
if ((ret = pkt_get_track_info(pd, last_track, 1, &ti)))
|
||||
ret = pkt_get_track_info(pd, last_track, 1, &ti);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* if this track is blank, try the previous. */
|
||||
if (ti.blank) {
|
||||
last_track--;
|
||||
if ((ret = pkt_get_track_info(pd, last_track, 1, &ti)))
|
||||
ret = pkt_get_track_info(pd, last_track, 1, &ti);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1645,7 +1651,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
|
||||
static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
|
||||
{
|
||||
struct packet_command cgc;
|
||||
struct request_sense sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
write_param_page *wp;
|
||||
char buffer[128];
|
||||
int ret, size;
|
||||
@ -1656,8 +1662,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
|
||||
|
||||
memset(buffer, 0, sizeof(buffer));
|
||||
init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
|
||||
cgc.sense = &sense;
|
||||
if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
|
||||
cgc.sshdr = &sshdr;
|
||||
ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
|
||||
if (ret) {
|
||||
pkt_dump_sense(pd, &cgc);
|
||||
return ret;
|
||||
}
|
||||
@ -1671,8 +1678,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
|
||||
* now get it all
|
||||
*/
|
||||
init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
|
||||
cgc.sense = &sense;
|
||||
if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
|
||||
cgc.sshdr = &sshdr;
|
||||
ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
|
||||
if (ret) {
|
||||
pkt_dump_sense(pd, &cgc);
|
||||
return ret;
|
||||
}
|
||||
@ -1714,7 +1722,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
|
||||
wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
|
||||
|
||||
cgc.buflen = cgc.cmd[8] = size;
|
||||
if ((ret = pkt_mode_select(pd, &cgc))) {
|
||||
ret = pkt_mode_select(pd, &cgc);
|
||||
if (ret) {
|
||||
pkt_dump_sense(pd, &cgc);
|
||||
return ret;
|
||||
}
|
||||
@ -1819,7 +1828,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
|
||||
memset(&di, 0, sizeof(disc_information));
|
||||
memset(&ti, 0, sizeof(track_information));
|
||||
|
||||
if ((ret = pkt_get_disc_info(pd, &di))) {
|
||||
ret = pkt_get_disc_info(pd, &di);
|
||||
if (ret) {
|
||||
pkt_err(pd, "failed get_disc\n");
|
||||
return ret;
|
||||
}
|
||||
@ -1830,7 +1840,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
|
||||
pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
|
||||
|
||||
track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
|
||||
if ((ret = pkt_get_track_info(pd, track, 1, &ti))) {
|
||||
ret = pkt_get_track_info(pd, track, 1, &ti);
|
||||
if (ret) {
|
||||
pkt_err(pd, "failed get_track\n");
|
||||
return ret;
|
||||
}
|
||||
@ -1905,12 +1916,12 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
|
||||
int set)
|
||||
{
|
||||
struct packet_command cgc;
|
||||
struct request_sense sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
unsigned char buf[64];
|
||||
int ret;
|
||||
|
||||
init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
|
||||
cgc.sense = &sense;
|
||||
cgc.sshdr = &sshdr;
|
||||
cgc.buflen = pd->mode_offset + 12;
|
||||
|
||||
/*
|
||||
@ -1918,7 +1929,8 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
|
||||
*/
|
||||
cgc.quiet = 1;
|
||||
|
||||
if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0)))
|
||||
ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
buf[pd->mode_offset + 10] |= (!!set << 2);
|
||||
@ -1950,14 +1962,14 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
|
||||
unsigned *write_speed)
|
||||
{
|
||||
struct packet_command cgc;
|
||||
struct request_sense sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
unsigned char buf[256+18];
|
||||
unsigned char *cap_buf;
|
||||
int ret, offset;
|
||||
|
||||
cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
|
||||
init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
|
||||
cgc.sense = &sense;
|
||||
cgc.sshdr = &sshdr;
|
||||
|
||||
ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
|
||||
if (ret) {
|
||||
@ -2011,13 +2023,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
|
||||
unsigned *speed)
|
||||
{
|
||||
struct packet_command cgc;
|
||||
struct request_sense sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
unsigned char buf[64];
|
||||
unsigned int size, st, sp;
|
||||
int ret;
|
||||
|
||||
init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
|
||||
cgc.sense = &sense;
|
||||
cgc.sshdr = &sshdr;
|
||||
cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
|
||||
cgc.cmd[1] = 2;
|
||||
cgc.cmd[2] = 4; /* READ ATIP */
|
||||
@ -2032,7 +2044,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
|
||||
size = sizeof(buf);
|
||||
|
||||
init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
|
||||
cgc.sense = &sense;
|
||||
cgc.sshdr = &sshdr;
|
||||
cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
|
||||
cgc.cmd[1] = 2;
|
||||
cgc.cmd[2] = 4;
|
||||
@ -2083,17 +2095,18 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
|
||||
static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
|
||||
{
|
||||
struct packet_command cgc;
|
||||
struct request_sense sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
int ret;
|
||||
|
||||
pkt_dbg(2, pd, "Performing OPC\n");
|
||||
|
||||
init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
|
||||
cgc.sense = &sense;
|
||||
cgc.sshdr = &sshdr;
|
||||
cgc.timeout = 60*HZ;
|
||||
cgc.cmd[0] = GPCMD_SEND_OPC;
|
||||
cgc.cmd[1] = 1;
|
||||
if ((ret = pkt_generic_packet(pd, &cgc)))
|
||||
ret = pkt_generic_packet(pd, &cgc);
|
||||
if (ret)
|
||||
pkt_dump_sense(pd, &cgc);
|
||||
return ret;
|
||||
}
|
||||
@ -2103,19 +2116,22 @@ static int pkt_open_write(struct pktcdvd_device *pd)
|
||||
int ret;
|
||||
unsigned int write_speed, media_write_speed, read_speed;
|
||||
|
||||
if ((ret = pkt_probe_settings(pd))) {
|
||||
ret = pkt_probe_settings(pd);
|
||||
if (ret) {
|
||||
pkt_dbg(2, pd, "failed probe\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
if ((ret = pkt_set_write_settings(pd))) {
|
||||
ret = pkt_set_write_settings(pd);
|
||||
if (ret) {
|
||||
pkt_dbg(1, pd, "failed saving write settings\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
pkt_write_caching(pd, USE_WCACHING);
|
||||
|
||||
if ((ret = pkt_get_max_speed(pd, &write_speed)))
|
||||
ret = pkt_get_max_speed(pd, &write_speed);
|
||||
if (ret)
|
||||
write_speed = 16 * 177;
|
||||
switch (pd->mmc3_profile) {
|
||||
case 0x13: /* DVD-RW */
|
||||
@ -2124,7 +2140,8 @@ static int pkt_open_write(struct pktcdvd_device *pd)
|
||||
pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
|
||||
break;
|
||||
default:
|
||||
if ((ret = pkt_media_speed(pd, &media_write_speed)))
|
||||
ret = pkt_media_speed(pd, &media_write_speed);
|
||||
if (ret)
|
||||
media_write_speed = 16;
|
||||
write_speed = min(write_speed, media_write_speed * 177);
|
||||
pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
|
||||
@ -2132,14 +2149,16 @@ static int pkt_open_write(struct pktcdvd_device *pd)
|
||||
}
|
||||
read_speed = write_speed;
|
||||
|
||||
if ((ret = pkt_set_speed(pd, write_speed, read_speed))) {
|
||||
ret = pkt_set_speed(pd, write_speed, read_speed);
|
||||
if (ret) {
|
||||
pkt_dbg(1, pd, "couldn't set write speed\n");
|
||||
return -EIO;
|
||||
}
|
||||
pd->write_speed = write_speed;
|
||||
pd->read_speed = read_speed;
|
||||
|
||||
if ((ret = pkt_perform_opc(pd))) {
|
||||
ret = pkt_perform_opc(pd);
|
||||
if (ret) {
|
||||
pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
|
||||
}
|
||||
|
||||
@ -2161,10 +2180,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
|
||||
* so bdget() can't fail.
|
||||
*/
|
||||
bdget(pd->bdev->bd_dev);
|
||||
if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
|
||||
ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if ((ret = pkt_get_last_written(pd, &lba))) {
|
||||
ret = pkt_get_last_written(pd, &lba);
|
||||
if (ret) {
|
||||
pkt_err(pd, "pkt_get_last_written failed\n");
|
||||
goto out_putdev;
|
||||
}
|
||||
@ -2175,7 +2196,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
|
||||
|
||||
q = bdev_get_queue(pd->bdev);
|
||||
if (write) {
|
||||
if ((ret = pkt_open_write(pd)))
|
||||
ret = pkt_open_write(pd);
|
||||
if (ret)
|
||||
goto out_putdev;
|
||||
/*
|
||||
* Some CDRW drives can not handle writes larger than one packet,
|
||||
@ -2190,7 +2212,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
|
||||
clear_bit(PACKET_WRITABLE, &pd->flags);
|
||||
}
|
||||
|
||||
if ((ret = pkt_set_segment_merging(pd, q)))
|
||||
ret = pkt_set_segment_merging(pd, q);
|
||||
if (ret)
|
||||
goto out_putdev;
|
||||
|
||||
if (write) {
|
||||
@ -2231,6 +2254,8 @@ static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
|
||||
{
|
||||
if (dev_minor >= MAX_WRITERS)
|
||||
return NULL;
|
||||
|
||||
dev_minor = array_index_nospec(dev_minor, MAX_WRITERS);
|
||||
return pkt_devs[dev_minor];
|
||||
}
|
||||
|
||||
|
@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = {
|
||||
|
||||
static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
|
||||
{
|
||||
generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio),
|
||||
generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio),
|
||||
&card->gendisk->part0);
|
||||
}
|
||||
|
||||
@ -120,7 +120,7 @@ static void disk_stats_complete(struct rsxx_cardinfo *card,
|
||||
struct bio *bio,
|
||||
unsigned long start_time)
|
||||
{
|
||||
generic_end_io_acct(card->queue, bio_data_dir(bio),
|
||||
generic_end_io_acct(card->queue, bio_op(bio),
|
||||
&card->gendisk->part0, start_time);
|
||||
}
|
||||
|
||||
|
@ -657,8 +657,8 @@ static bool skd_preop_sg_list(struct skd_device *skdev,
|
||||
|
||||
if (unlikely(skdev->dbg_level > 1)) {
|
||||
dev_dbg(&skdev->pdev->dev,
|
||||
"skreq=%x sksg_list=%p sksg_dma=%llx\n",
|
||||
skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
|
||||
"skreq=%x sksg_list=%p sksg_dma=%pad\n",
|
||||
skreq->id, skreq->sksg_list, &skreq->sksg_dma_address);
|
||||
for (i = 0; i < n_sg; i++) {
|
||||
struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
|
||||
|
||||
@ -1190,8 +1190,8 @@ static void skd_send_fitmsg(struct skd_device *skdev,
|
||||
{
|
||||
u64 qcmd;
|
||||
|
||||
dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n",
|
||||
skmsg->mb_dma_address, skd_in_flight(skdev));
|
||||
dev_dbg(&skdev->pdev->dev, "dma address %pad, busy=%d\n",
|
||||
&skmsg->mb_dma_address, skd_in_flight(skdev));
|
||||
dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf);
|
||||
|
||||
qcmd = skmsg->mb_dma_address;
|
||||
@ -1250,9 +1250,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev,
|
||||
}
|
||||
|
||||
dev_dbg(&skdev->pdev->dev,
|
||||
"skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
|
||||
"skspcl=%p id=%04x sksg_list=%p sksg_dma=%pad\n",
|
||||
skspcl, skspcl->req.id, skspcl->req.sksg_list,
|
||||
skspcl->req.sksg_dma_address);
|
||||
&skspcl->req.sksg_dma_address);
|
||||
for (i = 0; i < skspcl->req.n_sg; i++) {
|
||||
struct fit_sg_descriptor *sgd =
|
||||
&skspcl->req.sksg_list[i];
|
||||
@ -2685,8 +2685,8 @@ static int skd_cons_skmsg(struct skd_device *skdev)
|
||||
|
||||
WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) &
|
||||
(FIT_QCMD_ALIGN - 1),
|
||||
"not aligned: msg_buf %p mb_dma_address %#llx\n",
|
||||
skmsg->msg_buf, skmsg->mb_dma_address);
|
||||
"not aligned: msg_buf %p mb_dma_address %pad\n",
|
||||
skmsg->msg_buf, &skmsg->mb_dma_address);
|
||||
memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES);
|
||||
}
|
||||
|
||||
|
@ -251,14 +251,9 @@ static DEFINE_SPINLOCK(minor_lock);
|
||||
#define GRANTS_PER_INDIRECT_FRAME \
|
||||
(XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))
|
||||
|
||||
#define PSEGS_PER_INDIRECT_FRAME \
|
||||
(GRANTS_INDIRECT_FRAME / GRANTS_PSEGS)
|
||||
|
||||
#define INDIRECT_GREFS(_grants) \
|
||||
DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
|
||||
|
||||
#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG)
|
||||
|
||||
static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
|
||||
static void blkfront_gather_backend_features(struct blkfront_info *info);
|
||||
static int negotiate_mq(struct blkfront_info *info);
|
||||
@ -1441,7 +1436,7 @@ static bool blkif_completion(unsigned long *id,
|
||||
|
||||
/* Wait the second response if not yet here. */
|
||||
if (s2->status == REQ_WAITING)
|
||||
return 0;
|
||||
return false;
|
||||
|
||||
bret->status = blkif_get_final_status(s->status,
|
||||
s2->status);
|
||||
@ -1542,7 +1537,7 @@ static bool blkif_completion(unsigned long *id,
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
|
||||
|
@ -1287,17 +1287,16 @@ static void zram_bio_discard(struct zram *zram, u32 index,
|
||||
* Returns 1 if IO request was successfully submitted.
|
||||
*/
|
||||
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
|
||||
int offset, bool is_write, struct bio *bio)
|
||||
int offset, unsigned int op, struct bio *bio)
|
||||
{
|
||||
unsigned long start_time = jiffies;
|
||||
int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
|
||||
struct request_queue *q = zram->disk->queue;
|
||||
int ret;
|
||||
|
||||
generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
|
||||
generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
|
||||
&zram->disk->part0);
|
||||
|
||||
if (!is_write) {
|
||||
if (!op_is_write(op)) {
|
||||
atomic64_inc(&zram->stats.num_reads);
|
||||
ret = zram_bvec_read(zram, bvec, index, offset, bio);
|
||||
flush_dcache_page(bvec->bv_page);
|
||||
@ -1306,14 +1305,14 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
|
||||
ret = zram_bvec_write(zram, bvec, index, offset, bio);
|
||||
}
|
||||
|
||||
generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
|
||||
generic_end_io_acct(q, op, &zram->disk->part0, start_time);
|
||||
|
||||
zram_slot_lock(zram, index);
|
||||
zram_accessed(zram, index);
|
||||
zram_slot_unlock(zram, index);
|
||||
|
||||
if (unlikely(ret < 0)) {
|
||||
if (!is_write)
|
||||
if (!op_is_write(op))
|
||||
atomic64_inc(&zram->stats.failed_reads);
|
||||
else
|
||||
atomic64_inc(&zram->stats.failed_writes);
|
||||
@ -1351,7 +1350,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
|
||||
bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
|
||||
unwritten);
|
||||
if (zram_bvec_rw(zram, &bv, index, offset,
|
||||
op_is_write(bio_op(bio)), bio) < 0)
|
||||
bio_op(bio), bio) < 0)
|
||||
goto out;
|
||||
|
||||
bv.bv_offset += bv.bv_len;
|
||||
@ -1403,7 +1402,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
|
||||
}
|
||||
|
||||
static int zram_rw_page(struct block_device *bdev, sector_t sector,
|
||||
struct page *page, bool is_write)
|
||||
struct page *page, unsigned int op)
|
||||
{
|
||||
int offset, ret;
|
||||
u32 index;
|
||||
@ -1427,7 +1426,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
|
||||
bv.bv_len = PAGE_SIZE;
|
||||
bv.bv_offset = 0;
|
||||
|
||||
ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
|
||||
ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
|
||||
out:
|
||||
/*
|
||||
* If I/O fails, just return error(ie, non-zero) without
|
||||
@ -1442,7 +1441,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
|
||||
|
||||
switch (ret) {
|
||||
case 0:
|
||||
page_endio(page, is_write, 0);
|
||||
page_endio(page, op_is_write(op), 0);
|
||||
break;
|
||||
case 1:
|
||||
ret = 0;
|
||||
|
@ -282,6 +282,7 @@
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/times.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <scsi/scsi_common.h>
|
||||
#include <scsi/scsi_request.h>
|
||||
|
||||
/* used to tell the module to turn on full debugging messages */
|
||||
@ -345,10 +346,10 @@ static LIST_HEAD(cdrom_list);
|
||||
int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
|
||||
struct packet_command *cgc)
|
||||
{
|
||||
if (cgc->sense) {
|
||||
cgc->sense->sense_key = 0x05;
|
||||
cgc->sense->asc = 0x20;
|
||||
cgc->sense->ascq = 0x00;
|
||||
if (cgc->sshdr) {
|
||||
cgc->sshdr->sense_key = 0x05;
|
||||
cgc->sshdr->asc = 0x20;
|
||||
cgc->sshdr->ascq = 0x00;
|
||||
}
|
||||
|
||||
cgc->stat = -EIO;
|
||||
@ -2222,9 +2223,12 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
|
||||
|
||||
blk_execute_rq(q, cdi->disk, rq, 0);
|
||||
if (scsi_req(rq)->result) {
|
||||
struct request_sense *s = req->sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
|
||||
ret = -EIO;
|
||||
cdi->last_sense = s->sense_key;
|
||||
scsi_normalize_sense(req->sense, req->sense_len,
|
||||
&sshdr);
|
||||
cdi->last_sense = sshdr.sense_key;
|
||||
}
|
||||
|
||||
if (blk_rq_unmap_user(bio))
|
||||
@ -2943,7 +2947,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
|
||||
struct packet_command *cgc,
|
||||
int cmd)
|
||||
{
|
||||
struct request_sense sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
struct cdrom_msf msf;
|
||||
int blocksize = 0, format = 0, lba;
|
||||
int ret;
|
||||
@ -2971,13 +2975,13 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
|
||||
if (cgc->buffer == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
memset(&sense, 0, sizeof(sense));
|
||||
cgc->sense = &sense;
|
||||
memset(&sshdr, 0, sizeof(sshdr));
|
||||
cgc->sshdr = &sshdr;
|
||||
cgc->data_direction = CGC_DATA_READ;
|
||||
ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize);
|
||||
if (ret && sense.sense_key == 0x05 &&
|
||||
sense.asc == 0x20 &&
|
||||
sense.ascq == 0x00) {
|
||||
if (ret && sshdr.sense_key == 0x05 &&
|
||||
sshdr.asc == 0x20 &&
|
||||
sshdr.ascq == 0x00) {
|
||||
/*
|
||||
* SCSI-II devices are not required to support
|
||||
* READ_CD, so let's try switching block size
|
||||
@ -2986,7 +2990,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
|
||||
ret = cdrom_switch_blocksize(cdi, blocksize);
|
||||
if (ret)
|
||||
goto out;
|
||||
cgc->sense = NULL;
|
||||
cgc->sshdr = NULL;
|
||||
ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1);
|
||||
ret |= cdrom_switch_blocksize(cdi, blocksize);
|
||||
}
|
||||
|
@ -419,10 +419,11 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
|
||||
|
||||
int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
|
||||
int write, void *buffer, unsigned *bufflen,
|
||||
struct request_sense *sense, int timeout,
|
||||
struct scsi_sense_hdr *sshdr, int timeout,
|
||||
req_flags_t rq_flags)
|
||||
{
|
||||
struct cdrom_info *info = drive->driver_data;
|
||||
struct scsi_sense_hdr local_sshdr;
|
||||
int retries = 10;
|
||||
bool failed;
|
||||
|
||||
@ -430,6 +431,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
|
||||
"rq_flags: 0x%x",
|
||||
cmd[0], write, timeout, rq_flags);
|
||||
|
||||
if (!sshdr)
|
||||
sshdr = &local_sshdr;
|
||||
|
||||
/* start of retry loop */
|
||||
do {
|
||||
struct request *rq;
|
||||
@ -456,8 +460,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
|
||||
|
||||
if (buffer)
|
||||
*bufflen = scsi_req(rq)->resid_len;
|
||||
if (sense)
|
||||
memcpy(sense, scsi_req(rq)->sense, sizeof(*sense));
|
||||
scsi_normalize_sense(scsi_req(rq)->sense,
|
||||
scsi_req(rq)->sense_len, sshdr);
|
||||
|
||||
/*
|
||||
* FIXME: we should probably abort/retry or something in case of
|
||||
@ -469,12 +473,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
|
||||
* The request failed. Retry if it was due to a unit
|
||||
* attention status (usually means media was changed).
|
||||
*/
|
||||
struct request_sense *reqbuf = scsi_req(rq)->sense;
|
||||
|
||||
if (reqbuf->sense_key == UNIT_ATTENTION)
|
||||
if (sshdr->sense_key == UNIT_ATTENTION)
|
||||
cdrom_saw_media_change(drive);
|
||||
else if (reqbuf->sense_key == NOT_READY &&
|
||||
reqbuf->asc == 4 && reqbuf->ascq != 4) {
|
||||
else if (sshdr->sense_key == NOT_READY &&
|
||||
sshdr->asc == 4 && sshdr->ascq != 4) {
|
||||
/*
|
||||
* The drive is in the process of loading
|
||||
* a disk. Retry, but wait a little to give
|
||||
@ -864,7 +866,7 @@ static void msf_from_bcd(struct atapi_msf *msf)
|
||||
msf->frame = bcd2bin(msf->frame);
|
||||
}
|
||||
|
||||
int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
|
||||
int cdrom_check_status(ide_drive_t *drive, struct scsi_sense_hdr *sshdr)
|
||||
{
|
||||
struct cdrom_info *info = drive->driver_data;
|
||||
struct cdrom_device_info *cdi;
|
||||
@ -886,12 +888,11 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
|
||||
*/
|
||||
cmd[7] = cdi->sanyo_slot % 3;
|
||||
|
||||
return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET);
|
||||
return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sshdr, 0, RQF_QUIET);
|
||||
}
|
||||
|
||||
static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
|
||||
unsigned long *sectors_per_frame,
|
||||
struct request_sense *sense)
|
||||
unsigned long *sectors_per_frame)
|
||||
{
|
||||
struct {
|
||||
__be32 lba;
|
||||
@ -908,7 +909,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
|
||||
memset(cmd, 0, BLK_MAX_CDB);
|
||||
cmd[0] = GPCMD_READ_CDVD_CAPACITY;
|
||||
|
||||
stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, sense, 0,
|
||||
stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, NULL, 0,
|
||||
RQF_QUIET);
|
||||
if (stat)
|
||||
return stat;
|
||||
@ -944,8 +945,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
|
||||
}
|
||||
|
||||
static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
|
||||
int format, char *buf, int buflen,
|
||||
struct request_sense *sense)
|
||||
int format, char *buf, int buflen)
|
||||
{
|
||||
unsigned char cmd[BLK_MAX_CDB];
|
||||
|
||||
@ -962,11 +962,11 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
|
||||
if (msf_flag)
|
||||
cmd[1] = 2;
|
||||
|
||||
return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, RQF_QUIET);
|
||||
return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, NULL, 0, RQF_QUIET);
|
||||
}
|
||||
|
||||
/* Try to read the entire TOC for the disk into our internal buffer. */
|
||||
int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
|
||||
int ide_cd_read_toc(ide_drive_t *drive)
|
||||
{
|
||||
int stat, ntracks, i;
|
||||
struct cdrom_info *info = drive->driver_data;
|
||||
@ -996,14 +996,13 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
|
||||
* Check to see if the existing data is still valid. If it is,
|
||||
* just return.
|
||||
*/
|
||||
(void) cdrom_check_status(drive, sense);
|
||||
(void) cdrom_check_status(drive, NULL);
|
||||
|
||||
if (drive->atapi_flags & IDE_AFLAG_TOC_VALID)
|
||||
return 0;
|
||||
|
||||
/* try to get the total cdrom capacity and sector size */
|
||||
stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame,
|
||||
sense);
|
||||
stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame);
|
||||
if (stat)
|
||||
toc->capacity = 0x1fffff;
|
||||
|
||||
@ -1016,7 +1015,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
|
||||
|
||||
/* first read just the header, so we know how long the TOC is */
|
||||
stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
|
||||
sizeof(struct atapi_toc_header), sense);
|
||||
sizeof(struct atapi_toc_header));
|
||||
if (stat)
|
||||
return stat;
|
||||
|
||||
@ -1036,7 +1035,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
|
||||
(char *)&toc->hdr,
|
||||
sizeof(struct atapi_toc_header) +
|
||||
(ntracks + 1) *
|
||||
sizeof(struct atapi_toc_entry), sense);
|
||||
sizeof(struct atapi_toc_entry));
|
||||
|
||||
if (stat && toc->hdr.first_track > 1) {
|
||||
/*
|
||||
@ -1056,8 +1055,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
|
||||
(char *)&toc->hdr,
|
||||
sizeof(struct atapi_toc_header) +
|
||||
(ntracks + 1) *
|
||||
sizeof(struct atapi_toc_entry),
|
||||
sense);
|
||||
sizeof(struct atapi_toc_entry));
|
||||
if (stat)
|
||||
return stat;
|
||||
|
||||
@ -1094,7 +1092,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
|
||||
if (toc->hdr.first_track != CDROM_LEADOUT) {
|
||||
/* read the multisession information */
|
||||
stat = cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp,
|
||||
sizeof(ms_tmp), sense);
|
||||
sizeof(ms_tmp));
|
||||
if (stat)
|
||||
return stat;
|
||||
|
||||
@ -1108,7 +1106,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
|
||||
if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) {
|
||||
/* re-read multisession information using MSF format */
|
||||
stat = cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp,
|
||||
sizeof(ms_tmp), sense);
|
||||
sizeof(ms_tmp));
|
||||
if (stat)
|
||||
return stat;
|
||||
|
||||
@ -1412,7 +1410,7 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive)
|
||||
{
|
||||
unsigned long capacity, sectors_per_frame;
|
||||
|
||||
if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame, NULL))
|
||||
if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame))
|
||||
return 0;
|
||||
|
||||
return capacity * sectors_per_frame;
|
||||
@ -1710,9 +1708,8 @@ static unsigned int idecd_check_events(struct gendisk *disk,
|
||||
static int idecd_revalidate_disk(struct gendisk *disk)
|
||||
{
|
||||
struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
|
||||
struct request_sense sense;
|
||||
|
||||
ide_cd_read_toc(info->drive, &sense);
|
||||
ide_cd_read_toc(info->drive);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -1736,7 +1733,6 @@ static int ide_cd_probe(ide_drive_t *drive)
|
||||
{
|
||||
struct cdrom_info *info;
|
||||
struct gendisk *g;
|
||||
struct request_sense sense;
|
||||
|
||||
ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x",
|
||||
drive->driver_req, drive->media);
|
||||
@ -1785,7 +1781,7 @@ static int ide_cd_probe(ide_drive_t *drive)
|
||||
goto failed;
|
||||
}
|
||||
|
||||
ide_cd_read_toc(drive, &sense);
|
||||
ide_cd_read_toc(drive);
|
||||
g->fops = &idecd_ops;
|
||||
g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
|
||||
device_add_disk(&drive->gendev, g);
|
||||
|
@ -98,11 +98,11 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *);
|
||||
|
||||
/* ide-cd.c functions used by ide-cd_ioctl.c */
|
||||
int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *,
|
||||
unsigned *, struct request_sense *, int, req_flags_t);
|
||||
int ide_cd_read_toc(ide_drive_t *, struct request_sense *);
|
||||
unsigned *, struct scsi_sense_hdr *, int, req_flags_t);
|
||||
int ide_cd_read_toc(ide_drive_t *);
|
||||
int ide_cdrom_get_capabilities(ide_drive_t *, u8 *);
|
||||
void ide_cdrom_update_speed(ide_drive_t *, u8 *);
|
||||
int cdrom_check_status(ide_drive_t *, struct request_sense *);
|
||||
int cdrom_check_status(ide_drive_t *, struct scsi_sense_hdr *);
|
||||
|
||||
/* ide-cd_ioctl.c */
|
||||
int ide_cdrom_open_real(struct cdrom_device_info *, int);
|
||||
|
@ -43,14 +43,14 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
|
||||
{
|
||||
ide_drive_t *drive = cdi->handle;
|
||||
struct media_event_desc med;
|
||||
struct request_sense sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
int stat;
|
||||
|
||||
if (slot_nr != CDSL_CURRENT)
|
||||
return -EINVAL;
|
||||
|
||||
stat = cdrom_check_status(drive, &sense);
|
||||
if (!stat || sense.sense_key == UNIT_ATTENTION)
|
||||
stat = cdrom_check_status(drive, &sshdr);
|
||||
if (!stat || sshdr.sense_key == UNIT_ATTENTION)
|
||||
return CDS_DISC_OK;
|
||||
|
||||
if (!cdrom_get_media_event(cdi, &med)) {
|
||||
@ -62,8 +62,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
|
||||
return CDS_NO_DISC;
|
||||
}
|
||||
|
||||
if (sense.sense_key == NOT_READY && sense.asc == 0x04
|
||||
&& sense.ascq == 0x04)
|
||||
if (sshdr.sense_key == NOT_READY && sshdr.asc == 0x04
|
||||
&& sshdr.ascq == 0x04)
|
||||
return CDS_DISC_OK;
|
||||
|
||||
/*
|
||||
@ -71,8 +71,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
|
||||
* just return TRAY_OPEN since ATAPI doesn't provide
|
||||
* any other way to detect this...
|
||||
*/
|
||||
if (sense.sense_key == NOT_READY) {
|
||||
if (sense.asc == 0x3a && sense.ascq == 1)
|
||||
if (sshdr.sense_key == NOT_READY) {
|
||||
if (sshdr.asc == 0x3a && sshdr.ascq == 1)
|
||||
return CDS_NO_DISC;
|
||||
else
|
||||
return CDS_TRAY_OPEN;
|
||||
@ -105,8 +105,7 @@ unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi,
|
||||
/* Eject the disk if EJECTFLAG is 0.
|
||||
If EJECTFLAG is 1, try to reload the disk. */
|
||||
static
|
||||
int cdrom_eject(ide_drive_t *drive, int ejectflag,
|
||||
struct request_sense *sense)
|
||||
int cdrom_eject(ide_drive_t *drive, int ejectflag)
|
||||
{
|
||||
struct cdrom_info *cd = drive->driver_data;
|
||||
struct cdrom_device_info *cdi = &cd->devinfo;
|
||||
@ -129,20 +128,16 @@ int cdrom_eject(ide_drive_t *drive, int ejectflag,
|
||||
cmd[0] = GPCMD_START_STOP_UNIT;
|
||||
cmd[4] = loej | (ejectflag != 0);
|
||||
|
||||
return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, 0);
|
||||
return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
|
||||
}
|
||||
|
||||
/* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */
|
||||
static
|
||||
int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
|
||||
struct request_sense *sense)
|
||||
int ide_cd_lockdoor(ide_drive_t *drive, int lockflag)
|
||||
{
|
||||
struct request_sense my_sense;
|
||||
struct scsi_sense_hdr sshdr;
|
||||
int stat;
|
||||
|
||||
if (sense == NULL)
|
||||
sense = &my_sense;
|
||||
|
||||
/* If the drive cannot lock the door, just pretend. */
|
||||
if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) {
|
||||
stat = 0;
|
||||
@ -155,14 +150,14 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
|
||||
cmd[4] = lockflag ? 1 : 0;
|
||||
|
||||
stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL,
|
||||
sense, 0, 0);
|
||||
&sshdr, 0, 0);
|
||||
}
|
||||
|
||||
/* If we got an illegal field error, the drive
|
||||
probably cannot lock the door. */
|
||||
if (stat != 0 &&
|
||||
sense->sense_key == ILLEGAL_REQUEST &&
|
||||
(sense->asc == 0x24 || sense->asc == 0x20)) {
|
||||
sshdr.sense_key == ILLEGAL_REQUEST &&
|
||||
(sshdr.asc == 0x24 || sshdr.asc == 0x20)) {
|
||||
printk(KERN_ERR "%s: door locking not supported\n",
|
||||
drive->name);
|
||||
drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
|
||||
@ -170,7 +165,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
|
||||
}
|
||||
|
||||
/* no medium, that's alright. */
|
||||
if (stat != 0 && sense->sense_key == NOT_READY && sense->asc == 0x3a)
|
||||
if (stat != 0 && sshdr.sense_key == NOT_READY && sshdr.asc == 0x3a)
|
||||
stat = 0;
|
||||
|
||||
if (stat == 0) {
|
||||
@ -186,23 +181,22 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
|
||||
int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position)
|
||||
{
|
||||
ide_drive_t *drive = cdi->handle;
|
||||
struct request_sense sense;
|
||||
|
||||
if (position) {
|
||||
int stat = ide_cd_lockdoor(drive, 0, &sense);
|
||||
int stat = ide_cd_lockdoor(drive, 0);
|
||||
|
||||
if (stat)
|
||||
return stat;
|
||||
}
|
||||
|
||||
return cdrom_eject(drive, !position, &sense);
|
||||
return cdrom_eject(drive, !position);
|
||||
}
|
||||
|
||||
int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock)
|
||||
{
|
||||
ide_drive_t *drive = cdi->handle;
|
||||
|
||||
return ide_cd_lockdoor(drive, lock, NULL);
|
||||
return ide_cd_lockdoor(drive, lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -213,7 +207,6 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
|
||||
{
|
||||
ide_drive_t *drive = cdi->handle;
|
||||
struct cdrom_info *cd = drive->driver_data;
|
||||
struct request_sense sense;
|
||||
u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE];
|
||||
int stat;
|
||||
unsigned char cmd[BLK_MAX_CDB];
|
||||
@ -236,7 +229,7 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
|
||||
cmd[5] = speed & 0xff;
|
||||
}
|
||||
|
||||
stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0);
|
||||
stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
|
||||
|
||||
if (!ide_cdrom_get_capabilities(drive, buf)) {
|
||||
ide_cdrom_update_speed(drive, buf);
|
||||
@ -252,11 +245,10 @@ int ide_cdrom_get_last_session(struct cdrom_device_info *cdi,
|
||||
struct atapi_toc *toc;
|
||||
ide_drive_t *drive = cdi->handle;
|
||||
struct cdrom_info *info = drive->driver_data;
|
||||
struct request_sense sense;
|
||||
int ret;
|
||||
|
||||
if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) {
|
||||
ret = ide_cd_read_toc(drive, &sense);
|
||||
ret = ide_cd_read_toc(drive);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@ -300,7 +292,6 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
|
||||
{
|
||||
ide_drive_t *drive = cdi->handle;
|
||||
struct cdrom_info *cd = drive->driver_data;
|
||||
struct request_sense sense;
|
||||
struct request *rq;
|
||||
int ret;
|
||||
|
||||
@ -315,7 +306,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
|
||||
* lock it again.
|
||||
*/
|
||||
if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED)
|
||||
(void)ide_cd_lockdoor(drive, 1, &sense);
|
||||
(void)ide_cd_lockdoor(drive, 1);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -355,7 +346,6 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
|
||||
struct atapi_toc_entry *first_toc, *last_toc;
|
||||
unsigned long lba_start, lba_end;
|
||||
int stat;
|
||||
struct request_sense sense;
|
||||
unsigned char cmd[BLK_MAX_CDB];
|
||||
|
||||
stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc);
|
||||
@ -380,7 +370,7 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
|
||||
lba_to_msf(lba_start, &cmd[3], &cmd[4], &cmd[5]);
|
||||
lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]);
|
||||
|
||||
return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0);
|
||||
return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
|
||||
}
|
||||
|
||||
static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
|
||||
@ -391,7 +381,7 @@ static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
|
||||
int stat;
|
||||
|
||||
/* Make sure our saved TOC is valid. */
|
||||
stat = ide_cd_read_toc(drive, NULL);
|
||||
stat = ide_cd_read_toc(drive);
|
||||
if (stat)
|
||||
return stat;
|
||||
|
||||
@ -461,8 +451,8 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
|
||||
layer. the packet must be complete, as we do not
|
||||
touch it at all. */
|
||||
|
||||
if (cgc->sense)
|
||||
memset(cgc->sense, 0, sizeof(struct request_sense));
|
||||
if (cgc->sshdr)
|
||||
memset(cgc->sshdr, 0, sizeof(*cgc->sshdr));
|
||||
|
||||
if (cgc->quiet)
|
||||
flags |= RQF_QUIET;
|
||||
@ -470,7 +460,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
|
||||
cgc->stat = ide_cd_queue_pc(drive, cgc->cmd,
|
||||
cgc->data_direction == CGC_DATA_WRITE,
|
||||
cgc->buffer, &len,
|
||||
cgc->sense, cgc->timeout, flags);
|
||||
cgc->sshdr, cgc->timeout, flags);
|
||||
if (!cgc->stat)
|
||||
cgc->buflen -= len;
|
||||
return cgc->stat;
|
||||
|
@ -311,7 +311,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
|
||||
{
|
||||
domain->sig_type = IB_SIG_TYPE_T10_DIF;
|
||||
domain->sig.dif.pi_interval = scsi_prot_interval(sc);
|
||||
domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
|
||||
domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request);
|
||||
/*
|
||||
* At the moment we hard code those, but in the future
|
||||
* we will take them from sc.
|
||||
|
@ -17,23 +17,25 @@ menuconfig NVM
|
||||
|
||||
if NVM
|
||||
|
||||
config NVM_DEBUG
|
||||
bool "Open-Channel SSD debugging support"
|
||||
default n
|
||||
---help---
|
||||
Exposes a debug management interface to create/remove targets at:
|
||||
|
||||
/sys/module/lnvm/parameters/configure_debug
|
||||
|
||||
It is required to create/remove targets without IOCTLs.
|
||||
|
||||
config NVM_PBLK
|
||||
tristate "Physical Block Device Open-Channel SSD target"
|
||||
---help---
|
||||
help
|
||||
Allows an open-channel SSD to be exposed as a block device to the
|
||||
host. The target assumes the device exposes raw flash and must be
|
||||
explicitly managed by the host.
|
||||
|
||||
Please note the disk format is considered EXPERIMENTAL for now.
|
||||
|
||||
if NVM_PBLK
|
||||
|
||||
config NVM_PBLK_DEBUG
|
||||
bool "PBlk Debug Support"
|
||||
default n
|
||||
help
|
||||
Enables debug support for pblk. This includes extra checks, more
|
||||
vocal error messages, and extra tracking fields in the pblk sysfs
|
||||
entries.
|
||||
|
||||
endif # NVM_PBLK_DEBUG
|
||||
|
||||
endif # NVM
|
||||
|
@ -27,7 +27,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
|
||||
int nr_entries = pblk_get_secs(bio);
|
||||
int i, ret;
|
||||
|
||||
generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
|
||||
generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio),
|
||||
&pblk->disk->part0);
|
||||
|
||||
/* Update the write buffer head (mem) with the entries that we can
|
||||
* write. The write in itself cannot fail, so there is no need to
|
||||
@ -67,7 +68,7 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
|
||||
|
||||
atomic64_add(nr_entries, &pblk->user_wa);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(nr_entries, &pblk->inflight_writes);
|
||||
atomic_long_add(nr_entries, &pblk->req_writes);
|
||||
#endif
|
||||
@ -75,7 +76,7 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
|
||||
pblk_rl_inserted(&pblk->rl, nr_entries);
|
||||
|
||||
out:
|
||||
generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
|
||||
generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time);
|
||||
pblk_write_should_kick(pblk);
|
||||
return ret;
|
||||
}
|
||||
@ -123,7 +124,7 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
|
||||
|
||||
atomic64_add(valid_entries, &pblk->gc_wa);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(valid_entries, &pblk->inflight_writes);
|
||||
atomic_long_add(valid_entries, &pblk->recov_gc_writes);
|
||||
#endif
|
||||
|
@ -35,7 +35,7 @@ static void pblk_line_mark_bb(struct work_struct *work)
|
||||
line = &pblk->lines[pblk_ppa_to_line(*ppa)];
|
||||
pos = pblk_ppa_to_pos(&dev->geo, *ppa);
|
||||
|
||||
pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
|
||||
pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n",
|
||||
line->id, pos);
|
||||
}
|
||||
|
||||
@ -51,12 +51,12 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
|
||||
struct ppa_addr *ppa;
|
||||
int pos = pblk_ppa_to_pos(geo, ppa_addr);
|
||||
|
||||
pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
|
||||
pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos);
|
||||
atomic_long_inc(&pblk->erase_failed);
|
||||
|
||||
atomic_dec(&line->blk_in_line);
|
||||
if (test_and_set_bit(pos, line->blk_bitmap))
|
||||
pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
|
||||
pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n",
|
||||
line->id, pos);
|
||||
|
||||
/* Not necessary to mark bad blocks on 2.0 spec. */
|
||||
@ -194,7 +194,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
|
||||
u64 paddr;
|
||||
int line_id;
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Callers must ensure that the ppa points to a device address */
|
||||
BUG_ON(pblk_addr_in_cache(ppa));
|
||||
BUG_ON(pblk_ppa_empty(ppa));
|
||||
@ -264,6 +264,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
|
||||
switch (type) {
|
||||
case PBLK_WRITE:
|
||||
kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
|
||||
/* fall through */
|
||||
case PBLK_WRITE_INT:
|
||||
pool = &pblk->w_rq_pool;
|
||||
break;
|
||||
@ -274,7 +275,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
|
||||
pool = &pblk->e_rq_pool;
|
||||
break;
|
||||
default:
|
||||
pr_err("pblk: trying to free unknown rqd type\n");
|
||||
pblk_err(pblk, "trying to free unknown rqd type\n");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -310,7 +311,7 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
|
||||
|
||||
ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
|
||||
if (ret != PBLK_EXPOSED_PAGE_SIZE) {
|
||||
pr_err("pblk: could not add page to bio\n");
|
||||
pblk_err(pblk, "could not add page to bio\n");
|
||||
mempool_free(page, &pblk->page_bio_pool);
|
||||
goto err;
|
||||
}
|
||||
@ -410,7 +411,7 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
|
||||
line->state = PBLK_LINESTATE_CORRUPT;
|
||||
line->gc_group = PBLK_LINEGC_NONE;
|
||||
move_list = &l_mg->corrupt_list;
|
||||
pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
|
||||
pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
|
||||
line->id, vsc,
|
||||
line->sec_in_line,
|
||||
lm->high_thrs, lm->mid_thrs);
|
||||
@ -430,7 +431,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio)
|
||||
void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
{
|
||||
atomic_long_inc(&pblk->write_failed);
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
pblk_print_failed_rqd(pblk, rqd, rqd->error);
|
||||
#endif
|
||||
}
|
||||
@ -452,9 +453,9 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
atomic_long_inc(&pblk->read_failed);
|
||||
break;
|
||||
default:
|
||||
pr_err("pblk: unknown read error:%d\n", rqd->error);
|
||||
pblk_err(pblk, "unknown read error:%d\n", rqd->error);
|
||||
}
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
pblk_print_failed_rqd(pblk, rqd, rqd->error);
|
||||
#endif
|
||||
}
|
||||
@ -470,7 +471,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
|
||||
atomic_inc(&pblk->inflight_io);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
if (pblk_check_io(pblk, rqd))
|
||||
return NVM_IO_ERR;
|
||||
#endif
|
||||
@ -484,7 +485,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
|
||||
atomic_inc(&pblk->inflight_io);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
if (pblk_check_io(pblk, rqd))
|
||||
return NVM_IO_ERR;
|
||||
#endif
|
||||
@ -517,7 +518,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
|
||||
for (i = 0; i < nr_secs; i++) {
|
||||
page = vmalloc_to_page(kaddr);
|
||||
if (!page) {
|
||||
pr_err("pblk: could not map vmalloc bio\n");
|
||||
pblk_err(pblk, "could not map vmalloc bio\n");
|
||||
bio_put(bio);
|
||||
bio = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
@ -525,7 +526,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
|
||||
|
||||
ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
|
||||
if (ret != PAGE_SIZE) {
|
||||
pr_err("pblk: could not add page to bio\n");
|
||||
pblk_err(pblk, "could not add page to bio\n");
|
||||
bio_put(bio);
|
||||
bio = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
@ -711,7 +712,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
|
||||
while (test_bit(pos, line->blk_bitmap)) {
|
||||
paddr += min;
|
||||
if (pblk_boundary_paddr_checks(pblk, paddr)) {
|
||||
pr_err("pblk: corrupt emeta line:%d\n",
|
||||
pblk_err(pblk, "corrupt emeta line:%d\n",
|
||||
line->id);
|
||||
bio_put(bio);
|
||||
ret = -EINTR;
|
||||
@ -723,7 +724,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
|
||||
}
|
||||
|
||||
if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
|
||||
pr_err("pblk: corrupt emeta line:%d\n",
|
||||
pblk_err(pblk, "corrupt emeta line:%d\n",
|
||||
line->id);
|
||||
bio_put(bio);
|
||||
ret = -EINTR;
|
||||
@ -738,7 +739,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
|
||||
|
||||
ret = pblk_submit_io_sync(pblk, &rqd);
|
||||
if (ret) {
|
||||
pr_err("pblk: emeta I/O submission failed: %d\n", ret);
|
||||
pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
|
||||
bio_put(bio);
|
||||
goto free_rqd_dma;
|
||||
}
|
||||
@ -843,7 +844,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
|
||||
*/
|
||||
ret = pblk_submit_io_sync(pblk, &rqd);
|
||||
if (ret) {
|
||||
pr_err("pblk: smeta I/O submission failed: %d\n", ret);
|
||||
pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
|
||||
bio_put(bio);
|
||||
goto free_ppa_list;
|
||||
}
|
||||
@ -905,7 +906,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
|
||||
pr_err("pblk: could not sync erase line:%d,blk:%d\n",
|
||||
pblk_err(pblk, "could not sync erase line:%d,blk:%d\n",
|
||||
pblk_ppa_to_line(ppa),
|
||||
pblk_ppa_to_pos(geo, ppa));
|
||||
|
||||
@ -945,7 +946,7 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
|
||||
|
||||
ret = pblk_blk_erase_sync(pblk, ppa);
|
||||
if (ret) {
|
||||
pr_err("pblk: failed to erase line %d\n", line->id);
|
||||
pblk_err(pblk, "failed to erase line %d\n", line->id);
|
||||
return ret;
|
||||
}
|
||||
} while (1);
|
||||
@ -1012,7 +1013,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
|
||||
list_add_tail(&line->list, &l_mg->bad_list);
|
||||
spin_unlock(&l_mg->free_lock);
|
||||
|
||||
pr_debug("pblk: line %d is bad\n", line->id);
|
||||
pblk_debug(pblk, "line %d is bad\n", line->id);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -1122,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
|
||||
line->cur_sec = off + lm->smeta_sec;
|
||||
|
||||
if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
|
||||
pr_debug("pblk: line smeta I/O failed. Retry\n");
|
||||
pblk_debug(pblk, "line smeta I/O failed. Retry\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1154,7 +1155,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
|
||||
spin_unlock(&line->lock);
|
||||
|
||||
list_add_tail(&line->list, &l_mg->bad_list);
|
||||
pr_err("pblk: unexpected line %d is bad\n", line->id);
|
||||
pblk_err(pblk, "unexpected line %d is bad\n", line->id);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -1299,7 +1300,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk)
|
||||
|
||||
retry:
|
||||
if (list_empty(&l_mg->free_list)) {
|
||||
pr_err("pblk: no free lines\n");
|
||||
pblk_err(pblk, "no free lines\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -1315,7 +1316,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk)
|
||||
|
||||
list_add_tail(&line->list, &l_mg->bad_list);
|
||||
|
||||
pr_debug("pblk: line %d is bad\n", line->id);
|
||||
pblk_debug(pblk, "line %d is bad\n", line->id);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
@ -1329,7 +1330,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk)
|
||||
list_add(&line->list, &l_mg->corrupt_list);
|
||||
goto retry;
|
||||
default:
|
||||
pr_err("pblk: failed to prepare line %d\n", line->id);
|
||||
pblk_err(pblk, "failed to prepare line %d\n", line->id);
|
||||
list_add(&line->list, &l_mg->free_list);
|
||||
l_mg->nr_free_lines++;
|
||||
return NULL;
|
||||
@ -1477,7 +1478,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk)
|
||||
|
||||
ret = pblk_submit_meta_io(pblk, line);
|
||||
if (ret) {
|
||||
pr_err("pblk: sync meta line %d failed (%d)\n",
|
||||
pblk_err(pblk, "sync meta line %d failed (%d)\n",
|
||||
line->id, ret);
|
||||
return;
|
||||
}
|
||||
@ -1507,7 +1508,7 @@ void __pblk_pipeline_flush(struct pblk *pblk)
|
||||
|
||||
ret = pblk_recov_pad(pblk);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not close data on teardown(%d)\n", ret);
|
||||
pblk_err(pblk, "could not close data on teardown(%d)\n", ret);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1687,7 +1688,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
|
||||
struct nvm_tgt_dev *dev = pblk->dev;
|
||||
struct nvm_geo *geo = &dev->geo;
|
||||
|
||||
pr_err("pblk: could not async erase line:%d,blk:%d\n",
|
||||
pblk_err(pblk, "could not async erase line:%d,blk:%d\n",
|
||||
pblk_ppa_to_line(ppa),
|
||||
pblk_ppa_to_pos(geo, ppa));
|
||||
}
|
||||
@ -1726,7 +1727,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
|
||||
struct list_head *move_list;
|
||||
int i;
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line),
|
||||
"pblk: corrupt closed line %d\n", line->id);
|
||||
#endif
|
||||
@ -1856,7 +1857,7 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
|
||||
* Only send one inflight I/O per LUN. Since we map at a page
|
||||
* granurality, all ppas in the I/O will map to the same LUN
|
||||
*/
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
int i;
|
||||
|
||||
for (i = 1; i < nr_ppas; i++)
|
||||
@ -1866,7 +1867,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
|
||||
|
||||
ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
|
||||
if (ret == -ETIME || ret == -EINTR)
|
||||
pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret);
|
||||
pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
|
||||
-ret);
|
||||
}
|
||||
|
||||
void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
|
||||
@ -1901,7 +1903,7 @@ void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
|
||||
struct pblk_lun *rlun;
|
||||
int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
int i;
|
||||
|
||||
for (i = 1; i < nr_ppas; i++)
|
||||
@ -1951,7 +1953,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
|
||||
void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
|
||||
{
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Callers must ensure that the ppa points to a cache address */
|
||||
BUG_ON(!pblk_addr_in_cache(ppa));
|
||||
BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
|
||||
@ -1966,7 +1968,7 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new,
|
||||
struct ppa_addr ppa_l2p, ppa_gc;
|
||||
int ret = 1;
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Callers must ensure that the ppa points to a cache address */
|
||||
BUG_ON(!pblk_addr_in_cache(ppa_new));
|
||||
BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new)));
|
||||
@ -2003,14 +2005,14 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
|
||||
{
|
||||
struct ppa_addr ppa_l2p;
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Callers must ensure that the ppa points to a device address */
|
||||
BUG_ON(pblk_addr_in_cache(ppa_mapped));
|
||||
#endif
|
||||
/* Invalidate and discard padded entries */
|
||||
if (lba == ADDR_EMPTY) {
|
||||
atomic64_inc(&pblk->pad_wa);
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_inc(&pblk->padded_wb);
|
||||
#endif
|
||||
if (!pblk_ppa_empty(ppa_mapped))
|
||||
@ -2036,7 +2038,7 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
|
||||
goto out;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p));
|
||||
#endif
|
||||
|
||||
|
@ -90,7 +90,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
|
||||
|
||||
gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs));
|
||||
if (!gc_rq->data) {
|
||||
pr_err("pblk: could not GC line:%d (%d/%d)\n",
|
||||
pblk_err(pblk, "could not GC line:%d (%d/%d)\n",
|
||||
line->id, *line->vsc, gc_rq->nr_secs);
|
||||
goto out;
|
||||
}
|
||||
@ -98,7 +98,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
|
||||
/* Read from GC victim block */
|
||||
ret = pblk_submit_read_gc(pblk, gc_rq);
|
||||
if (ret) {
|
||||
pr_err("pblk: failed GC read in line:%d (err:%d)\n",
|
||||
pblk_err(pblk, "failed GC read in line:%d (err:%d)\n",
|
||||
line->id, ret);
|
||||
goto out;
|
||||
}
|
||||
@ -146,7 +146,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
|
||||
|
||||
ret = pblk_line_read_emeta(pblk, line, emeta_buf);
|
||||
if (ret) {
|
||||
pr_err("pblk: line %d read emeta failed (%d)\n",
|
||||
pblk_err(pblk, "line %d read emeta failed (%d)\n",
|
||||
line->id, ret);
|
||||
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
|
||||
return NULL;
|
||||
@ -160,7 +160,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
|
||||
|
||||
ret = pblk_recov_check_emeta(pblk, emeta_buf);
|
||||
if (ret) {
|
||||
pr_err("pblk: inconsistent emeta (line %d)\n",
|
||||
pblk_err(pblk, "inconsistent emeta (line %d)\n",
|
||||
line->id);
|
||||
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
|
||||
return NULL;
|
||||
@ -201,7 +201,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
|
||||
} else {
|
||||
lba_list = get_lba_list_from_emeta(pblk, line);
|
||||
if (!lba_list) {
|
||||
pr_err("pblk: could not interpret emeta (line %d)\n",
|
||||
pblk_err(pblk, "could not interpret emeta (line %d)\n",
|
||||
line->id);
|
||||
goto fail_free_invalid_bitmap;
|
||||
}
|
||||
@ -213,7 +213,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
|
||||
spin_unlock(&line->lock);
|
||||
|
||||
if (sec_left < 0) {
|
||||
pr_err("pblk: corrupted GC line (%d)\n", line->id);
|
||||
pblk_err(pblk, "corrupted GC line (%d)\n", line->id);
|
||||
goto fail_free_lba_list;
|
||||
}
|
||||
|
||||
@ -289,7 +289,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
|
||||
kref_put(&line->ref, pblk_line_put);
|
||||
atomic_dec(&gc->read_inflight_gc);
|
||||
|
||||
pr_err("pblk: Failed to GC line %d\n", line->id);
|
||||
pblk_err(pblk, "failed to GC line %d\n", line->id);
|
||||
}
|
||||
|
||||
static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
|
||||
@ -297,7 +297,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
|
||||
struct pblk_gc *gc = &pblk->gc;
|
||||
struct pblk_line_ws *line_ws;
|
||||
|
||||
pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
|
||||
pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id);
|
||||
|
||||
line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
|
||||
if (!line_ws)
|
||||
@ -351,7 +351,7 @@ static int pblk_gc_read(struct pblk *pblk)
|
||||
pblk_gc_kick(pblk);
|
||||
|
||||
if (pblk_gc_line(pblk, line))
|
||||
pr_err("pblk: failed to GC line %d\n", line->id);
|
||||
pblk_err(pblk, "failed to GC line %d\n", line->id);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -522,8 +522,8 @@ static int pblk_gc_reader_ts(void *data)
|
||||
io_schedule();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
pr_info("pblk: flushing gc pipeline, %d lines left\n",
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
pblk_info(pblk, "flushing gc pipeline, %d lines left\n",
|
||||
atomic_read(&gc->pipeline_gc));
|
||||
#endif
|
||||
|
||||
@ -540,7 +540,7 @@ static int pblk_gc_reader_ts(void *data)
|
||||
static void pblk_gc_start(struct pblk *pblk)
|
||||
{
|
||||
pblk->gc.gc_active = 1;
|
||||
pr_debug("pblk: gc start\n");
|
||||
pblk_debug(pblk, "gc start\n");
|
||||
}
|
||||
|
||||
void pblk_gc_should_start(struct pblk *pblk)
|
||||
@ -605,14 +605,14 @@ int pblk_gc_init(struct pblk *pblk)
|
||||
|
||||
gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
|
||||
if (IS_ERR(gc->gc_ts)) {
|
||||
pr_err("pblk: could not allocate GC main kthread\n");
|
||||
pblk_err(pblk, "could not allocate GC main kthread\n");
|
||||
return PTR_ERR(gc->gc_ts);
|
||||
}
|
||||
|
||||
gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
|
||||
"pblk-gc-writer-ts");
|
||||
if (IS_ERR(gc->gc_writer_ts)) {
|
||||
pr_err("pblk: could not allocate GC writer kthread\n");
|
||||
pblk_err(pblk, "could not allocate GC writer kthread\n");
|
||||
ret = PTR_ERR(gc->gc_writer_ts);
|
||||
goto fail_free_main_kthread;
|
||||
}
|
||||
@ -620,7 +620,7 @@ int pblk_gc_init(struct pblk *pblk)
|
||||
gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk,
|
||||
"pblk-gc-reader-ts");
|
||||
if (IS_ERR(gc->gc_reader_ts)) {
|
||||
pr_err("pblk: could not allocate GC reader kthread\n");
|
||||
pblk_err(pblk, "could not allocate GC reader kthread\n");
|
||||
ret = PTR_ERR(gc->gc_reader_ts);
|
||||
goto fail_free_writer_kthread;
|
||||
}
|
||||
@ -641,7 +641,7 @@ int pblk_gc_init(struct pblk *pblk)
|
||||
gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq",
|
||||
WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS);
|
||||
if (!gc->gc_line_reader_wq) {
|
||||
pr_err("pblk: could not allocate GC line reader workqueue\n");
|
||||
pblk_err(pblk, "could not allocate GC line reader workqueue\n");
|
||||
ret = -ENOMEM;
|
||||
goto fail_free_reader_kthread;
|
||||
}
|
||||
@ -650,7 +650,7 @@ int pblk_gc_init(struct pblk *pblk)
|
||||
gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq",
|
||||
WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
|
||||
if (!gc->gc_reader_wq) {
|
||||
pr_err("pblk: could not allocate GC reader workqueue\n");
|
||||
pblk_err(pblk, "could not allocate GC reader workqueue\n");
|
||||
ret = -ENOMEM;
|
||||
goto fail_free_reader_line_wq;
|
||||
}
|
||||
|
@ -91,7 +91,7 @@ static size_t pblk_trans_map_size(struct pblk *pblk)
|
||||
return entry_size * pblk->rl.nr_secs;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
static u32 pblk_l2p_crc(struct pblk *pblk)
|
||||
{
|
||||
size_t map_size;
|
||||
@ -117,13 +117,13 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
|
||||
} else {
|
||||
line = pblk_recov_l2p(pblk);
|
||||
if (IS_ERR(line)) {
|
||||
pr_err("pblk: could not recover l2p table\n");
|
||||
pblk_err(pblk, "could not recover l2p table\n");
|
||||
return -EFAULT;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
|
||||
#endif
|
||||
|
||||
/* Free full lines directly as GC has not been started yet */
|
||||
@ -166,7 +166,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
|
||||
static void pblk_rwb_free(struct pblk *pblk)
|
||||
{
|
||||
if (pblk_rb_tear_down_check(&pblk->rwb))
|
||||
pr_err("pblk: write buffer error on tear down\n");
|
||||
pblk_err(pblk, "write buffer error on tear down\n");
|
||||
|
||||
pblk_rb_data_free(&pblk->rwb);
|
||||
vfree(pblk_rb_entries_ref(&pblk->rwb));
|
||||
@ -179,11 +179,14 @@ static int pblk_rwb_init(struct pblk *pblk)
|
||||
struct pblk_rb_entry *entries;
|
||||
unsigned long nr_entries, buffer_size;
|
||||
unsigned int power_size, power_seg_sz;
|
||||
int pgs_in_buffer;
|
||||
|
||||
if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer))
|
||||
pgs_in_buffer = max(geo->mw_cunits, geo->ws_opt) * geo->all_luns;
|
||||
|
||||
if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
|
||||
buffer_size = write_buffer_size;
|
||||
else
|
||||
buffer_size = pblk->pgs_in_buffer;
|
||||
buffer_size = pgs_in_buffer;
|
||||
|
||||
nr_entries = pblk_rb_calculate_size(buffer_size);
|
||||
|
||||
@ -200,7 +203,8 @@ static int pblk_rwb_init(struct pblk *pblk)
|
||||
/* Minimum pages needed within a lun */
|
||||
#define ADDR_POOL_SIZE 64
|
||||
|
||||
static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst)
|
||||
static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
|
||||
struct nvm_addrf_12 *dst)
|
||||
{
|
||||
struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
|
||||
int power_len;
|
||||
@ -208,14 +212,14 @@ static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst)
|
||||
/* Re-calculate channel and lun format to adapt to configuration */
|
||||
power_len = get_count_order(geo->num_ch);
|
||||
if (1 << power_len != geo->num_ch) {
|
||||
pr_err("pblk: supports only power-of-two channel config.\n");
|
||||
pblk_err(pblk, "supports only power-of-two channel config.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
dst->ch_len = power_len;
|
||||
|
||||
power_len = get_count_order(geo->num_lun);
|
||||
if (1 << power_len != geo->num_lun) {
|
||||
pr_err("pblk: supports only power-of-two LUN config.\n");
|
||||
pblk_err(pblk, "supports only power-of-two LUN config.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
dst->lun_len = power_len;
|
||||
@ -282,18 +286,19 @@ static int pblk_set_addrf(struct pblk *pblk)
|
||||
case NVM_OCSSD_SPEC_12:
|
||||
div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
|
||||
if (mod) {
|
||||
pr_err("pblk: bad configuration of sectors/pages\n");
|
||||
pblk_err(pblk, "bad configuration of sectors/pages\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
pblk->addrf_len = pblk_set_addrf_12(geo, (void *)&pblk->addrf);
|
||||
pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
|
||||
(void *)&pblk->addrf);
|
||||
break;
|
||||
case NVM_OCSSD_SPEC_20:
|
||||
pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
|
||||
&pblk->uaddrf);
|
||||
break;
|
||||
default:
|
||||
pr_err("pblk: OCSSD revision not supported (%d)\n",
|
||||
pblk_err(pblk, "OCSSD revision not supported (%d)\n",
|
||||
geo->version);
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -366,15 +371,13 @@ static int pblk_core_init(struct pblk *pblk)
|
||||
atomic64_set(&pblk->nr_flush, 0);
|
||||
pblk->nr_flush_rst = 0;
|
||||
|
||||
pblk->pgs_in_buffer = geo->mw_cunits * geo->all_luns;
|
||||
|
||||
pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
|
||||
max_write_ppas = pblk->min_write_pgs * geo->all_luns;
|
||||
pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
|
||||
pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
|
||||
|
||||
if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
|
||||
pr_err("pblk: vector list too big(%u > %u)\n",
|
||||
pblk_err(pblk, "vector list too big(%u > %u)\n",
|
||||
pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -607,7 +610,7 @@ static int pblk_luns_init(struct pblk *pblk)
|
||||
|
||||
/* TODO: Implement unbalanced LUN support */
|
||||
if (geo->num_lun < 0) {
|
||||
pr_err("pblk: unbalanced LUN config.\n");
|
||||
pblk_err(pblk, "unbalanced LUN config.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -716,10 +719,11 @@ static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line,
|
||||
|
||||
/*
|
||||
* In 1.2 spec. chunk state is not persisted by the device. Thus
|
||||
* some of the values are reset each time pblk is instantiated.
|
||||
* some of the values are reset each time pblk is instantiated,
|
||||
* so we have to assume that the block is closed.
|
||||
*/
|
||||
if (lun_bb_meta[line->id] == NVM_BLK_T_FREE)
|
||||
chunk->state = NVM_CHK_ST_FREE;
|
||||
chunk->state = NVM_CHK_ST_CLOSED;
|
||||
else
|
||||
chunk->state = NVM_CHK_ST_OFFLINE;
|
||||
|
||||
@ -1026,7 +1030,7 @@ static int pblk_line_meta_init(struct pblk *pblk)
|
||||
lm->emeta_sec[0], geo->clba);
|
||||
|
||||
if (lm->min_blk_line > lm->blk_per_line) {
|
||||
pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
|
||||
pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
|
||||
lm->blk_per_line);
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -1078,7 +1082,7 @@ static int pblk_lines_init(struct pblk *pblk)
|
||||
}
|
||||
|
||||
if (!nr_free_chks) {
|
||||
pr_err("pblk: too many bad blocks prevent for sane instance\n");
|
||||
pblk_err(pblk, "too many bad blocks prevent for sane instance\n");
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
@ -1108,7 +1112,7 @@ static int pblk_writer_init(struct pblk *pblk)
|
||||
int err = PTR_ERR(pblk->writer_ts);
|
||||
|
||||
if (err != -EINTR)
|
||||
pr_err("pblk: could not allocate writer kthread (%d)\n",
|
||||
pblk_err(pblk, "could not allocate writer kthread (%d)\n",
|
||||
err);
|
||||
return err;
|
||||
}
|
||||
@ -1154,7 +1158,7 @@ static void pblk_tear_down(struct pblk *pblk, bool graceful)
|
||||
pblk_rb_sync_l2p(&pblk->rwb);
|
||||
pblk_rl_free(&pblk->rl);
|
||||
|
||||
pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful);
|
||||
pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
|
||||
}
|
||||
|
||||
static void pblk_exit(void *private, bool graceful)
|
||||
@ -1165,8 +1169,8 @@ static void pblk_exit(void *private, bool graceful)
|
||||
pblk_gc_exit(pblk, graceful);
|
||||
pblk_tear_down(pblk, graceful);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
|
||||
#endif
|
||||
|
||||
pblk_free(pblk);
|
||||
@ -1189,20 +1193,6 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
|
||||
struct pblk *pblk;
|
||||
int ret;
|
||||
|
||||
/* pblk supports 1.2 and 2.0 versions */
|
||||
if (!(geo->version == NVM_OCSSD_SPEC_12 ||
|
||||
geo->version == NVM_OCSSD_SPEC_20)) {
|
||||
pr_err("pblk: OCSSD version not supported (%u)\n",
|
||||
geo->version);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
|
||||
pr_err("pblk: host-side L2P table not supported. (%x)\n",
|
||||
geo->dom);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
|
||||
if (!pblk)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
@ -1212,11 +1202,26 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
|
||||
pblk->state = PBLK_STATE_RUNNING;
|
||||
pblk->gc.gc_enabled = 0;
|
||||
|
||||
if (!(geo->version == NVM_OCSSD_SPEC_12 ||
|
||||
geo->version == NVM_OCSSD_SPEC_20)) {
|
||||
pblk_err(pblk, "OCSSD version not supported (%u)\n",
|
||||
geo->version);
|
||||
kfree(pblk);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
|
||||
pblk_err(pblk, "host-side L2P table not supported. (%x)\n",
|
||||
geo->dom);
|
||||
kfree(pblk);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
spin_lock_init(&pblk->resubmit_lock);
|
||||
spin_lock_init(&pblk->trans_lock);
|
||||
spin_lock_init(&pblk->lock);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_set(&pblk->inflight_writes, 0);
|
||||
atomic_long_set(&pblk->padded_writes, 0);
|
||||
atomic_long_set(&pblk->padded_wb, 0);
|
||||
@ -1241,38 +1246,38 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
|
||||
|
||||
ret = pblk_core_init(pblk);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not initialize core\n");
|
||||
pblk_err(pblk, "could not initialize core\n");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
ret = pblk_lines_init(pblk);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not initialize lines\n");
|
||||
pblk_err(pblk, "could not initialize lines\n");
|
||||
goto fail_free_core;
|
||||
}
|
||||
|
||||
ret = pblk_rwb_init(pblk);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not initialize write buffer\n");
|
||||
pblk_err(pblk, "could not initialize write buffer\n");
|
||||
goto fail_free_lines;
|
||||
}
|
||||
|
||||
ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not initialize maps\n");
|
||||
pblk_err(pblk, "could not initialize maps\n");
|
||||
goto fail_free_rwb;
|
||||
}
|
||||
|
||||
ret = pblk_writer_init(pblk);
|
||||
if (ret) {
|
||||
if (ret != -EINTR)
|
||||
pr_err("pblk: could not initialize write thread\n");
|
||||
pblk_err(pblk, "could not initialize write thread\n");
|
||||
goto fail_free_l2p;
|
||||
}
|
||||
|
||||
ret = pblk_gc_init(pblk);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not initialize gc\n");
|
||||
pblk_err(pblk, "could not initialize gc\n");
|
||||
goto fail_stop_writer;
|
||||
}
|
||||
|
||||
@ -1287,8 +1292,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
|
||||
blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
|
||||
blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);
|
||||
|
||||
pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
|
||||
tdisk->disk_name,
|
||||
pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
|
||||
geo->all_luns, pblk->l_mg.nr_lines,
|
||||
(unsigned long long)pblk->rl.nr_secs,
|
||||
pblk->rwb.nr_entries);
|
||||
|
@ -111,7 +111,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
|
||||
} while (iter > 0);
|
||||
up_write(&pblk_rb_lock);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_set(&rb->inflight_flush_point, 0);
|
||||
#endif
|
||||
|
||||
@ -308,7 +308,7 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
|
||||
|
||||
entry = &rb->entries[ring_pos];
|
||||
flags = READ_ONCE(entry->w_ctx.flags);
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Caller must guarantee that the entry is free */
|
||||
BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
|
||||
#endif
|
||||
@ -332,7 +332,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
|
||||
|
||||
entry = &rb->entries[ring_pos];
|
||||
flags = READ_ONCE(entry->w_ctx.flags);
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Caller must guarantee that the entry is free */
|
||||
BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
|
||||
#endif
|
||||
@ -362,7 +362,7 @@ static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_inc(&rb->inflight_flush_point);
|
||||
#endif
|
||||
|
||||
@ -547,7 +547,7 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
|
||||
|
||||
page = virt_to_page(entry->data);
|
||||
if (!page) {
|
||||
pr_err("pblk: could not allocate write bio page\n");
|
||||
pblk_err(pblk, "could not allocate write bio page\n");
|
||||
flags &= ~PBLK_WRITTEN_DATA;
|
||||
flags |= PBLK_SUBMITTED_ENTRY;
|
||||
/* Release flags on context. Protect from writes */
|
||||
@ -557,7 +557,7 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
|
||||
|
||||
if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) !=
|
||||
rb->seg_size) {
|
||||
pr_err("pblk: could not add page to write bio\n");
|
||||
pblk_err(pblk, "could not add page to write bio\n");
|
||||
flags &= ~PBLK_WRITTEN_DATA;
|
||||
flags |= PBLK_SUBMITTED_ENTRY;
|
||||
/* Release flags on context. Protect from writes */
|
||||
@ -576,19 +576,19 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
|
||||
|
||||
if (pad) {
|
||||
if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) {
|
||||
pr_err("pblk: could not pad page in write bio\n");
|
||||
pblk_err(pblk, "could not pad page in write bio\n");
|
||||
return NVM_IO_ERR;
|
||||
}
|
||||
|
||||
if (pad < pblk->min_write_pgs)
|
||||
atomic64_inc(&pblk->pad_dist[pad - 1]);
|
||||
else
|
||||
pr_warn("pblk: padding more than min. sectors\n");
|
||||
pblk_warn(pblk, "padding more than min. sectors\n");
|
||||
|
||||
atomic64_add(pad, &pblk->pad_wa);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(pad, &pblk->padded_writes);
|
||||
#endif
|
||||
|
||||
@ -613,7 +613,7 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
|
||||
int ret = 1;
|
||||
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Caller must ensure that the access will not cause an overflow */
|
||||
BUG_ON(pos >= rb->nr_entries);
|
||||
#endif
|
||||
@ -820,7 +820,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
|
||||
rb->subm,
|
||||
rb->sync,
|
||||
rb->l2p_update,
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_read(&rb->inflight_flush_point),
|
||||
#else
|
||||
0,
|
||||
@ -838,7 +838,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
|
||||
rb->subm,
|
||||
rb->sync,
|
||||
rb->l2p_update,
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_read(&rb->inflight_flush_point),
|
||||
#else
|
||||
0,
|
||||
|
@ -28,7 +28,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
|
||||
sector_t lba, struct ppa_addr ppa,
|
||||
int bio_iter, bool advanced_bio)
|
||||
{
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Callers must ensure that the ppa points to a cache address */
|
||||
BUG_ON(pblk_ppa_empty(ppa));
|
||||
BUG_ON(!pblk_addr_in_cache(ppa));
|
||||
@ -79,7 +79,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
WARN_ON(test_and_set_bit(i, read_bitmap));
|
||||
meta_list[i].lba = cpu_to_le64(lba);
|
||||
advanced_bio = true;
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_inc(&pblk->cache_reads);
|
||||
#endif
|
||||
} else {
|
||||
@ -97,7 +97,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
else
|
||||
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(nr_secs, &pblk->inflight_reads);
|
||||
#endif
|
||||
}
|
||||
@ -117,13 +117,13 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
continue;
|
||||
|
||||
if (lba != blba + i) {
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
struct ppa_addr *p;
|
||||
|
||||
p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr;
|
||||
print_ppa(&pblk->dev->geo, p, "seq", i);
|
||||
print_ppa(pblk, p, "seq", i);
|
||||
#endif
|
||||
pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
|
||||
pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
|
||||
lba, (u64)blba + i);
|
||||
WARN_ON(1);
|
||||
}
|
||||
@ -149,14 +149,14 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
meta_lba = le64_to_cpu(meta_lba_list[j].lba);
|
||||
|
||||
if (lba != meta_lba) {
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
struct ppa_addr *p;
|
||||
int nr_ppas = rqd->nr_ppas;
|
||||
|
||||
p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr;
|
||||
print_ppa(&pblk->dev->geo, p, "seq", j);
|
||||
print_ppa(pblk, p, "seq", j);
|
||||
#endif
|
||||
pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
|
||||
pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
|
||||
lba, meta_lba);
|
||||
WARN_ON(1);
|
||||
}
|
||||
@ -185,7 +185,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
|
||||
static void pblk_end_user_read(struct bio *bio)
|
||||
{
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
|
||||
#endif
|
||||
bio_endio(bio);
|
||||
@ -199,7 +199,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
struct bio *int_bio = rqd->bio;
|
||||
unsigned long start_time = r_ctx->start_time;
|
||||
|
||||
generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
|
||||
generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time);
|
||||
|
||||
if (rqd->error)
|
||||
pblk_log_read_err(pblk, rqd);
|
||||
@ -212,7 +212,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
if (put_line)
|
||||
pblk_read_put_rqd_kref(pblk, rqd);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
|
||||
atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
|
||||
#endif
|
||||
@ -231,74 +231,36 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
|
||||
__pblk_end_io_read(pblk, rqd, true);
|
||||
}
|
||||
|
||||
static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
struct bio *orig_bio, unsigned int bio_init_idx,
|
||||
unsigned long *read_bitmap)
|
||||
static void pblk_end_partial_read(struct nvm_rq *rqd)
|
||||
{
|
||||
struct pblk_sec_meta *meta_list = rqd->meta_list;
|
||||
struct bio *new_bio;
|
||||
struct pblk *pblk = rqd->private;
|
||||
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
|
||||
struct pblk_pr_ctx *pr_ctx = r_ctx->private;
|
||||
struct bio *new_bio = rqd->bio;
|
||||
struct bio *bio = pr_ctx->orig_bio;
|
||||
struct bio_vec src_bv, dst_bv;
|
||||
void *ppa_ptr = NULL;
|
||||
void *src_p, *dst_p;
|
||||
dma_addr_t dma_ppa_list = 0;
|
||||
__le64 *lba_list_mem, *lba_list_media;
|
||||
int nr_secs = rqd->nr_ppas;
|
||||
struct pblk_sec_meta *meta_list = rqd->meta_list;
|
||||
int bio_init_idx = pr_ctx->bio_init_idx;
|
||||
unsigned long *read_bitmap = pr_ctx->bitmap;
|
||||
int nr_secs = pr_ctx->orig_nr_secs;
|
||||
int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
|
||||
int i, ret, hole;
|
||||
|
||||
/* Re-use allocated memory for intermediate lbas */
|
||||
lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
|
||||
lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
|
||||
|
||||
new_bio = bio_alloc(GFP_KERNEL, nr_holes);
|
||||
|
||||
if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
|
||||
goto fail_add_pages;
|
||||
|
||||
if (nr_holes != new_bio->bi_vcnt) {
|
||||
pr_err("pblk: malformed bio\n");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
for (i = 0; i < nr_secs; i++)
|
||||
lba_list_mem[i] = meta_list[i].lba;
|
||||
|
||||
new_bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
|
||||
|
||||
rqd->bio = new_bio;
|
||||
rqd->nr_ppas = nr_holes;
|
||||
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
|
||||
|
||||
if (unlikely(nr_holes == 1)) {
|
||||
ppa_ptr = rqd->ppa_list;
|
||||
dma_ppa_list = rqd->dma_ppa_list;
|
||||
rqd->ppa_addr = rqd->ppa_list[0];
|
||||
}
|
||||
|
||||
ret = pblk_submit_io_sync(pblk, rqd);
|
||||
if (ret) {
|
||||
bio_put(rqd->bio);
|
||||
pr_err("pblk: sync read IO submission failed\n");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (rqd->error) {
|
||||
atomic_long_inc(&pblk->read_failed);
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
pblk_print_failed_rqd(pblk, rqd, rqd->error);
|
||||
#endif
|
||||
}
|
||||
__le64 *lba_list_mem, *lba_list_media;
|
||||
void *src_p, *dst_p;
|
||||
int hole, i;
|
||||
|
||||
if (unlikely(nr_holes == 1)) {
|
||||
struct ppa_addr ppa;
|
||||
|
||||
ppa = rqd->ppa_addr;
|
||||
rqd->ppa_list = ppa_ptr;
|
||||
rqd->dma_ppa_list = dma_ppa_list;
|
||||
rqd->ppa_list = pr_ctx->ppa_ptr;
|
||||
rqd->dma_ppa_list = pr_ctx->dma_ppa_list;
|
||||
rqd->ppa_list[0] = ppa;
|
||||
}
|
||||
|
||||
/* Re-use allocated memory for intermediate lbas */
|
||||
lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
|
||||
lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
|
||||
|
||||
for (i = 0; i < nr_secs; i++) {
|
||||
lba_list_media[i] = meta_list[i].lba;
|
||||
meta_list[i].lba = lba_list_mem[i];
|
||||
@ -316,7 +278,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
meta_list[hole].lba = lba_list_media[i];
|
||||
|
||||
src_bv = new_bio->bi_io_vec[i++];
|
||||
dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole];
|
||||
dst_bv = bio->bi_io_vec[bio_init_idx + hole];
|
||||
|
||||
src_p = kmap_atomic(src_bv.bv_page);
|
||||
dst_p = kmap_atomic(dst_bv.bv_page);
|
||||
@ -334,19 +296,107 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
} while (hole < nr_secs);
|
||||
|
||||
bio_put(new_bio);
|
||||
kfree(pr_ctx);
|
||||
|
||||
/* restore original request */
|
||||
rqd->bio = NULL;
|
||||
rqd->nr_ppas = nr_secs;
|
||||
|
||||
bio_endio(bio);
|
||||
__pblk_end_io_read(pblk, rqd, false);
|
||||
return NVM_IO_DONE;
|
||||
}
|
||||
|
||||
fail:
|
||||
/* Free allocated pages in new bio */
|
||||
static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
unsigned int bio_init_idx,
|
||||
unsigned long *read_bitmap,
|
||||
int nr_holes)
|
||||
{
|
||||
struct pblk_sec_meta *meta_list = rqd->meta_list;
|
||||
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
|
||||
struct pblk_pr_ctx *pr_ctx;
|
||||
struct bio *new_bio, *bio = r_ctx->private;
|
||||
__le64 *lba_list_mem;
|
||||
int nr_secs = rqd->nr_ppas;
|
||||
int i;
|
||||
|
||||
/* Re-use allocated memory for intermediate lbas */
|
||||
lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
|
||||
|
||||
new_bio = bio_alloc(GFP_KERNEL, nr_holes);
|
||||
|
||||
if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
|
||||
goto fail_bio_put;
|
||||
|
||||
if (nr_holes != new_bio->bi_vcnt) {
|
||||
WARN_ONCE(1, "pblk: malformed bio\n");
|
||||
goto fail_free_pages;
|
||||
}
|
||||
|
||||
pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
|
||||
if (!pr_ctx)
|
||||
goto fail_free_pages;
|
||||
|
||||
for (i = 0; i < nr_secs; i++)
|
||||
lba_list_mem[i] = meta_list[i].lba;
|
||||
|
||||
new_bio->bi_iter.bi_sector = 0; /* internal bio */
|
||||
bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
|
||||
|
||||
rqd->bio = new_bio;
|
||||
rqd->nr_ppas = nr_holes;
|
||||
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
|
||||
|
||||
pr_ctx->ppa_ptr = NULL;
|
||||
pr_ctx->orig_bio = bio;
|
||||
bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
|
||||
pr_ctx->bio_init_idx = bio_init_idx;
|
||||
pr_ctx->orig_nr_secs = nr_secs;
|
||||
r_ctx->private = pr_ctx;
|
||||
|
||||
if (unlikely(nr_holes == 1)) {
|
||||
pr_ctx->ppa_ptr = rqd->ppa_list;
|
||||
pr_ctx->dma_ppa_list = rqd->dma_ppa_list;
|
||||
rqd->ppa_addr = rqd->ppa_list[0];
|
||||
}
|
||||
return 0;
|
||||
|
||||
fail_free_pages:
|
||||
pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
|
||||
fail_add_pages:
|
||||
pr_err("pblk: failed to perform partial read\n");
|
||||
fail_bio_put:
|
||||
bio_put(new_bio);
|
||||
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
unsigned int bio_init_idx,
|
||||
unsigned long *read_bitmap, int nr_secs)
|
||||
{
|
||||
int nr_holes;
|
||||
int ret;
|
||||
|
||||
nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
|
||||
|
||||
if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap,
|
||||
nr_holes))
|
||||
return NVM_IO_ERR;
|
||||
|
||||
rqd->end_io = pblk_end_partial_read;
|
||||
|
||||
ret = pblk_submit_io(pblk, rqd);
|
||||
if (ret) {
|
||||
bio_put(rqd->bio);
|
||||
pblk_err(pblk, "partial read IO submission failed\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
return NVM_IO_OK;
|
||||
|
||||
err:
|
||||
pblk_err(pblk, "failed to perform partial read\n");
|
||||
|
||||
/* Free allocated pages in new bio */
|
||||
pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt);
|
||||
__pblk_end_io_read(pblk, rqd, false);
|
||||
return NVM_IO_ERR;
|
||||
}
|
||||
@ -359,7 +409,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
|
||||
|
||||
pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_inc(&pblk->inflight_reads);
|
||||
#endif
|
||||
|
||||
@ -382,7 +432,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
|
||||
WARN_ON(test_and_set_bit(0, read_bitmap));
|
||||
meta_list[0].lba = cpu_to_le64(lba);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_inc(&pblk->cache_reads);
|
||||
#endif
|
||||
} else {
|
||||
@ -401,7 +451,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
||||
struct pblk_g_ctx *r_ctx;
|
||||
struct nvm_rq *rqd;
|
||||
unsigned int bio_init_idx;
|
||||
unsigned long read_bitmap; /* Max 64 ppas per request */
|
||||
DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA);
|
||||
int ret = NVM_IO_ERR;
|
||||
|
||||
/* logic error: lba out-of-bounds. Ignore read request */
|
||||
@ -411,9 +461,10 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
||||
return NVM_IO_ERR;
|
||||
}
|
||||
|
||||
generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
|
||||
generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
|
||||
&pblk->disk->part0);
|
||||
|
||||
bitmap_zero(&read_bitmap, nr_secs);
|
||||
bitmap_zero(read_bitmap, nr_secs);
|
||||
|
||||
rqd = pblk_alloc_rqd(pblk, PBLK_READ);
|
||||
|
||||
@ -436,7 +487,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
||||
rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
|
||||
&rqd->dma_meta_list);
|
||||
if (!rqd->meta_list) {
|
||||
pr_err("pblk: not able to allocate ppa list\n");
|
||||
pblk_err(pblk, "not able to allocate ppa list\n");
|
||||
goto fail_rqd_free;
|
||||
}
|
||||
|
||||
@ -444,32 +495,32 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
||||
rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
|
||||
rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
|
||||
|
||||
pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap);
|
||||
pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap);
|
||||
} else {
|
||||
pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap);
|
||||
pblk_read_rq(pblk, rqd, bio, blba, read_bitmap);
|
||||
}
|
||||
|
||||
if (bitmap_full(&read_bitmap, nr_secs)) {
|
||||
if (bitmap_full(read_bitmap, nr_secs)) {
|
||||
atomic_inc(&pblk->inflight_io);
|
||||
__pblk_end_io_read(pblk, rqd, false);
|
||||
return NVM_IO_DONE;
|
||||
}
|
||||
|
||||
/* All sectors are to be read from the device */
|
||||
if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
|
||||
if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
|
||||
struct bio *int_bio = NULL;
|
||||
|
||||
/* Clone read bio to deal with read errors internally */
|
||||
int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
|
||||
if (!int_bio) {
|
||||
pr_err("pblk: could not clone read bio\n");
|
||||
pblk_err(pblk, "could not clone read bio\n");
|
||||
goto fail_end_io;
|
||||
}
|
||||
|
||||
rqd->bio = int_bio;
|
||||
|
||||
if (pblk_submit_io(pblk, rqd)) {
|
||||
pr_err("pblk: read IO submission failed\n");
|
||||
pblk_err(pblk, "read IO submission failed\n");
|
||||
ret = NVM_IO_ERR;
|
||||
goto fail_end_io;
|
||||
}
|
||||
@ -480,8 +531,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
|
||||
/* The read bio request could be partially filled by the write buffer,
|
||||
* but there are some holes that need to be read from the drive.
|
||||
*/
|
||||
return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap);
|
||||
ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap,
|
||||
nr_secs);
|
||||
if (ret)
|
||||
goto fail_meta_free;
|
||||
|
||||
return NVM_IO_OK;
|
||||
|
||||
fail_meta_free:
|
||||
nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
|
||||
fail_rqd_free:
|
||||
pblk_free_rqd(pblk, rqd, PBLK_READ);
|
||||
return ret;
|
||||
@ -514,7 +572,7 @@ static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
rqd->ppa_list[valid_secs++] = ppa_list_l2p[i];
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(valid_secs, &pblk->inflight_reads);
|
||||
#endif
|
||||
|
||||
@ -548,7 +606,7 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
rqd->ppa_addr = ppa_l2p;
|
||||
valid_secs = 1;
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_inc(&pblk->inflight_reads);
|
||||
#endif
|
||||
|
||||
@ -595,7 +653,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
|
||||
bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
|
||||
PBLK_VMALLOC_META, GFP_KERNEL);
|
||||
if (IS_ERR(bio)) {
|
||||
pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio));
|
||||
pblk_err(pblk, "could not allocate GC bio (%lu)\n",
|
||||
PTR_ERR(bio));
|
||||
goto err_free_dma;
|
||||
}
|
||||
|
||||
@ -609,7 +668,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
|
||||
|
||||
if (pblk_submit_io_sync(pblk, &rqd)) {
|
||||
ret = -EIO;
|
||||
pr_err("pblk: GC read request failed\n");
|
||||
pblk_err(pblk, "GC read request failed\n");
|
||||
goto err_free_bio;
|
||||
}
|
||||
|
||||
@ -619,12 +678,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
|
||||
|
||||
if (rqd.error) {
|
||||
atomic_long_inc(&pblk->read_failed_gc);
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
pblk_print_failed_rqd(pblk, &rqd, rqd.error);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads);
|
||||
atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads);
|
||||
atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads);
|
||||
|
@ -77,7 +77,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
|
||||
}
|
||||
|
||||
if (nr_valid_lbas != nr_lbas)
|
||||
pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n",
|
||||
pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n",
|
||||
line->id, nr_valid_lbas, nr_lbas);
|
||||
|
||||
line->left_msecs = 0;
|
||||
@ -184,7 +184,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
/* If read fails, more padding is needed */
|
||||
ret = pblk_submit_io_sync(pblk, rqd);
|
||||
if (ret) {
|
||||
pr_err("pblk: I/O submission failed: %d\n", ret);
|
||||
pblk_err(pblk, "I/O submission failed: %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -194,7 +194,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
* we cannot recover from here. Need FTL log.
|
||||
*/
|
||||
if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
|
||||
pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
|
||||
pblk_err(pblk, "L2P recovery failed (%d)\n", rqd->error);
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
@ -273,7 +273,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
next_pad_rq:
|
||||
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
|
||||
if (rq_ppas < pblk->min_write_pgs) {
|
||||
pr_err("pblk: corrupted pad line %d\n", line->id);
|
||||
pblk_err(pblk, "corrupted pad line %d\n", line->id);
|
||||
goto fail_free_pad;
|
||||
}
|
||||
|
||||
@ -342,7 +342,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
|
||||
ret = pblk_submit_io(pblk, rqd);
|
||||
if (ret) {
|
||||
pr_err("pblk: I/O submission failed: %d\n", ret);
|
||||
pblk_err(pblk, "I/O submission failed: %d\n", ret);
|
||||
pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
|
||||
goto fail_free_bio;
|
||||
}
|
||||
@ -356,12 +356,12 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
|
||||
if (!wait_for_completion_io_timeout(&pad_rq->wait,
|
||||
msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
|
||||
pr_err("pblk: pad write timed out\n");
|
||||
pblk_err(pblk, "pad write timed out\n");
|
||||
ret = -ETIME;
|
||||
}
|
||||
|
||||
if (!pblk_line_is_full(line))
|
||||
pr_err("pblk: corrupted padded line: %d\n", line->id);
|
||||
pblk_err(pblk, "corrupted padded line: %d\n", line->id);
|
||||
|
||||
vfree(data);
|
||||
free_rq:
|
||||
@ -461,7 +461,7 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
|
||||
ret = pblk_submit_io_sync(pblk, rqd);
|
||||
if (ret) {
|
||||
pr_err("pblk: I/O submission failed: %d\n", ret);
|
||||
pblk_err(pblk, "I/O submission failed: %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -501,11 +501,11 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
|
||||
ret = pblk_recov_pad_oob(pblk, line, pad_secs);
|
||||
if (ret)
|
||||
pr_err("pblk: OOB padding failed (err:%d)\n", ret);
|
||||
pblk_err(pblk, "OOB padding failed (err:%d)\n", ret);
|
||||
|
||||
ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
|
||||
if (ret)
|
||||
pr_err("pblk: OOB read failed (err:%d)\n", ret);
|
||||
pblk_err(pblk, "OOB read failed (err:%d)\n", ret);
|
||||
|
||||
left_ppas = 0;
|
||||
}
|
||||
@ -592,7 +592,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
|
||||
|
||||
ret = pblk_submit_io_sync(pblk, rqd);
|
||||
if (ret) {
|
||||
pr_err("pblk: I/O submission failed: %d\n", ret);
|
||||
pblk_err(pblk, "I/O submission failed: %d\n", ret);
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
@ -671,14 +671,14 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
|
||||
|
||||
ret = pblk_recov_scan_oob(pblk, line, p, &done);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not recover L2P from OOB\n");
|
||||
pblk_err(pblk, "could not recover L2P from OOB\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!done) {
|
||||
ret = pblk_recov_scan_all_oob(pblk, line, p);
|
||||
if (ret) {
|
||||
pr_err("pblk: could not recover L2P from OOB\n");
|
||||
pblk_err(pblk, "could not recover L2P from OOB\n");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
@ -737,14 +737,15 @@ static int pblk_recov_check_line_version(struct pblk *pblk,
|
||||
struct line_header *header = &emeta->header;
|
||||
|
||||
if (header->version_major != EMETA_VERSION_MAJOR) {
|
||||
pr_err("pblk: line major version mismatch: %d, expected: %d\n",
|
||||
pblk_err(pblk, "line major version mismatch: %d, expected: %d\n",
|
||||
header->version_major, EMETA_VERSION_MAJOR);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifdef NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
if (header->version_minor > EMETA_VERSION_MINOR)
|
||||
pr_info("pblk: newer line minor version found: %d\n", line_v);
|
||||
pblk_info(pblk, "newer line minor version found: %d\n",
|
||||
header->version_minor);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
@ -851,7 +852,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
|
||||
continue;
|
||||
|
||||
if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) {
|
||||
pr_err("pblk: found incompatible line version %u\n",
|
||||
pblk_err(pblk, "found incompatible line version %u\n",
|
||||
smeta_buf->header.version_major);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
@ -863,7 +864,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
|
||||
}
|
||||
|
||||
if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) {
|
||||
pr_debug("pblk: ignore line %u due to uuid mismatch\n",
|
||||
pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
|
||||
i);
|
||||
continue;
|
||||
}
|
||||
@ -887,7 +888,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
|
||||
|
||||
pblk_recov_line_add_ordered(&recov_list, line);
|
||||
found_lines++;
|
||||
pr_debug("pblk: recovering data line %d, seq:%llu\n",
|
||||
pblk_debug(pblk, "recovering data line %d, seq:%llu\n",
|
||||
line->id, smeta_buf->seq_nr);
|
||||
}
|
||||
|
||||
@ -947,7 +948,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
|
||||
line->emeta = NULL;
|
||||
} else {
|
||||
if (open_lines > 1)
|
||||
pr_err("pblk: failed to recover L2P\n");
|
||||
pblk_err(pblk, "failed to recover L2P\n");
|
||||
|
||||
open_lines++;
|
||||
line->meta_line = meta_line;
|
||||
@ -976,7 +977,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
|
||||
|
||||
out:
|
||||
if (found_lines != recovered_lines)
|
||||
pr_err("pblk: failed to recover all found lines %d/%d\n",
|
||||
pblk_err(pblk, "failed to recover all found lines %d/%d\n",
|
||||
found_lines, recovered_lines);
|
||||
|
||||
return data_line;
|
||||
@ -999,7 +1000,7 @@ int pblk_recov_pad(struct pblk *pblk)
|
||||
|
||||
ret = pblk_recov_pad_oob(pblk, line, left_msecs);
|
||||
if (ret) {
|
||||
pr_err("pblk: Tear down padding failed (%d)\n", ret);
|
||||
pblk_err(pblk, "tear down padding failed (%d)\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -268,7 +268,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
|
||||
spin_unlock(&l_mg->free_lock);
|
||||
|
||||
if (nr_free_lines != free_line_cnt)
|
||||
pr_err("pblk: corrupted free line list:%d/%d\n",
|
||||
pblk_err(pblk, "corrupted free line list:%d/%d\n",
|
||||
nr_free_lines, free_line_cnt);
|
||||
|
||||
sz = snprintf(page, PAGE_SIZE - sz,
|
||||
@ -421,7 +421,7 @@ static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page)
|
||||
return sz;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
|
||||
{
|
||||
return snprintf(page, PAGE_SIZE,
|
||||
@ -598,7 +598,7 @@ static struct attribute sys_padding_dist = {
|
||||
.mode = 0644,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
static struct attribute sys_stats_debug_attr = {
|
||||
.name = "stats",
|
||||
.mode = 0444,
|
||||
@ -619,7 +619,7 @@ static struct attribute *pblk_attrs[] = {
|
||||
&sys_write_amp_mileage,
|
||||
&sys_write_amp_trip,
|
||||
&sys_padding_dist,
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
&sys_stats_debug_attr,
|
||||
#endif
|
||||
NULL,
|
||||
@ -654,7 +654,7 @@ static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
|
||||
return pblk_sysfs_get_write_amp_trip(pblk, buf);
|
||||
else if (strcmp(attr->name, "padding_dist") == 0)
|
||||
return pblk_sysfs_get_padding_dist(pblk, buf);
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
else if (strcmp(attr->name, "stats") == 0)
|
||||
return pblk_sysfs_stats_debug(pblk, buf);
|
||||
#endif
|
||||
@ -697,8 +697,7 @@ int pblk_sysfs_init(struct gendisk *tdisk)
|
||||
kobject_get(&parent_dev->kobj),
|
||||
"%s", "pblk");
|
||||
if (ret) {
|
||||
pr_err("pblk: could not register %s/pblk\n",
|
||||
tdisk->disk_name);
|
||||
pblk_err(pblk, "could not register\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
/* Release flags on context. Protect from writes */
|
||||
smp_store_release(&w_ctx->flags, flags);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_dec(&rwb->inflight_flush_point);
|
||||
#endif
|
||||
}
|
||||
@ -51,7 +51,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
|
||||
c_ctx->nr_padded);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
|
||||
#endif
|
||||
|
||||
@ -78,7 +78,7 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
unsigned long flags;
|
||||
unsigned long pos;
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
|
||||
#endif
|
||||
|
||||
@ -196,7 +196,7 @@ static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
|
||||
list_add_tail(&r_ctx->list, &pblk->resubmit_list);
|
||||
spin_unlock(&pblk->resubmit_lock);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
|
||||
#endif
|
||||
}
|
||||
@ -238,7 +238,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
|
||||
recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
|
||||
if (!recovery) {
|
||||
pr_err("pblk: could not allocate recovery work\n");
|
||||
pblk_err(pblk, "could not allocate recovery work\n");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -258,7 +258,7 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
|
||||
pblk_end_w_fail(pblk, rqd);
|
||||
return;
|
||||
}
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
else
|
||||
WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
|
||||
#endif
|
||||
@ -279,7 +279,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
|
||||
|
||||
if (rqd->error) {
|
||||
pblk_log_write_err(pblk, rqd);
|
||||
pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
|
||||
pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id);
|
||||
line->w_err_gc->has_write_err = 1;
|
||||
}
|
||||
|
||||
@ -356,11 +356,11 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
|
||||
|
||||
secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
if ((!secs_to_sync && secs_to_flush)
|
||||
|| (secs_to_sync < 0)
|
||||
|| (secs_to_sync > secs_avail && !secs_to_flush)) {
|
||||
pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
|
||||
pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n",
|
||||
secs_avail, secs_to_sync, secs_to_flush);
|
||||
}
|
||||
#endif
|
||||
@ -397,7 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
|
||||
bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
|
||||
l_mg->emeta_alloc_type, GFP_KERNEL);
|
||||
if (IS_ERR(bio)) {
|
||||
pr_err("pblk: failed to map emeta io");
|
||||
pblk_err(pblk, "failed to map emeta io");
|
||||
ret = PTR_ERR(bio);
|
||||
goto fail_free_rqd;
|
||||
}
|
||||
@ -428,7 +428,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
|
||||
|
||||
ret = pblk_submit_io(pblk, rqd);
|
||||
if (ret) {
|
||||
pr_err("pblk: emeta I/O submission failed: %d\n", ret);
|
||||
pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
|
||||
goto fail_rollback;
|
||||
}
|
||||
|
||||
@ -518,7 +518,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
/* Assign lbas to ppas and populate request structure */
|
||||
err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
|
||||
if (err) {
|
||||
pr_err("pblk: could not setup write request: %d\n", err);
|
||||
pblk_err(pblk, "could not setup write request: %d\n", err);
|
||||
return NVM_IO_ERR;
|
||||
}
|
||||
|
||||
@ -527,7 +527,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
/* Submit data write for current data line */
|
||||
err = pblk_submit_io(pblk, rqd);
|
||||
if (err) {
|
||||
pr_err("pblk: data I/O submission failed: %d\n", err);
|
||||
pblk_err(pblk, "data I/O submission failed: %d\n", err);
|
||||
return NVM_IO_ERR;
|
||||
}
|
||||
|
||||
@ -549,7 +549,8 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
/* Submit metadata write for previous data line */
|
||||
err = pblk_submit_meta_io(pblk, meta_line);
|
||||
if (err) {
|
||||
pr_err("pblk: metadata I/O submission failed: %d", err);
|
||||
pblk_err(pblk, "metadata I/O submission failed: %d",
|
||||
err);
|
||||
return NVM_IO_ERR;
|
||||
}
|
||||
}
|
||||
@ -614,7 +615,7 @@ static int pblk_submit_write(struct pblk *pblk)
|
||||
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
|
||||
secs_to_flush);
|
||||
if (secs_to_sync > pblk->max_write_pgs) {
|
||||
pr_err("pblk: bad buffer sync calculation\n");
|
||||
pblk_err(pblk, "bad buffer sync calculation\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -633,14 +634,14 @@ static int pblk_submit_write(struct pblk *pblk)
|
||||
|
||||
if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
|
||||
secs_avail)) {
|
||||
pr_err("pblk: corrupted write bio\n");
|
||||
pblk_err(pblk, "corrupted write bio\n");
|
||||
goto fail_put_bio;
|
||||
}
|
||||
|
||||
if (pblk_submit_io_set(pblk, rqd))
|
||||
goto fail_free_bio;
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_long_add(secs_to_sync, &pblk->sub_writes);
|
||||
#endif
|
||||
|
||||
|
@ -119,6 +119,16 @@ struct pblk_g_ctx {
|
||||
u64 lba;
|
||||
};
|
||||
|
||||
/* partial read context */
|
||||
struct pblk_pr_ctx {
|
||||
struct bio *orig_bio;
|
||||
DECLARE_BITMAP(bitmap, NVM_MAX_VLBA);
|
||||
unsigned int orig_nr_secs;
|
||||
unsigned int bio_init_idx;
|
||||
void *ppa_ptr;
|
||||
dma_addr_t dma_ppa_list;
|
||||
};
|
||||
|
||||
/* Pad context */
|
||||
struct pblk_pad_rq {
|
||||
struct pblk *pblk;
|
||||
@ -193,7 +203,7 @@ struct pblk_rb {
|
||||
spinlock_t w_lock; /* Write lock */
|
||||
spinlock_t s_lock; /* Sync lock */
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
|
||||
#endif
|
||||
};
|
||||
@ -608,9 +618,6 @@ struct pblk {
|
||||
|
||||
int min_write_pgs; /* Minimum amount of pages required by controller */
|
||||
int max_write_pgs; /* Maximum amount of pages supported by controller */
|
||||
int pgs_in_buffer; /* Number of pages that need to be held in buffer to
|
||||
* guarantee successful reads.
|
||||
*/
|
||||
|
||||
sector_t capacity; /* Device capacity when bad blocks are subtracted */
|
||||
|
||||
@ -639,7 +646,7 @@ struct pblk {
|
||||
u64 nr_flush_rst; /* Flushes reset value for pad dist.*/
|
||||
atomic64_t nr_flush; /* Number of flush/fua I/O */
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
/* Non-persistent debug counters, 4kb sector I/Os */
|
||||
atomic_long_t inflight_writes; /* Inflight writes (user and gc) */
|
||||
atomic_long_t padded_writes; /* Sectors padded due to flush/fua */
|
||||
@ -706,6 +713,15 @@ struct pblk_line_ws {
|
||||
#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx))
|
||||
#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
|
||||
|
||||
#define pblk_err(pblk, fmt, ...) \
|
||||
pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
|
||||
#define pblk_info(pblk, fmt, ...) \
|
||||
pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
|
||||
#define pblk_warn(pblk, fmt, ...) \
|
||||
pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
|
||||
#define pblk_debug(pblk, fmt, ...) \
|
||||
pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
|
||||
|
||||
/*
|
||||
* pblk ring buffer operations
|
||||
*/
|
||||
@ -1282,20 +1298,22 @@ static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs)
|
||||
return !(nr_secs % pblk->min_write_pgs);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVM_DEBUG
|
||||
static inline void print_ppa(struct nvm_geo *geo, struct ppa_addr *p,
|
||||
#ifdef CONFIG_NVM_PBLK_DEBUG
|
||||
static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p,
|
||||
char *msg, int error)
|
||||
{
|
||||
struct nvm_geo *geo = &pblk->dev->geo;
|
||||
|
||||
if (p->c.is_cached) {
|
||||
pr_err("ppa: (%s: %x) cache line: %llu\n",
|
||||
pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n",
|
||||
msg, error, (u64)p->c.line);
|
||||
} else if (geo->version == NVM_OCSSD_SPEC_12) {
|
||||
pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
|
||||
pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
|
||||
msg, error,
|
||||
p->g.ch, p->g.lun, p->g.blk,
|
||||
p->g.pg, p->g.pl, p->g.sec);
|
||||
} else {
|
||||
pr_err("ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
|
||||
pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
|
||||
msg, error,
|
||||
p->m.grp, p->m.pu, p->m.chk, p->m.sec);
|
||||
}
|
||||
@ -1307,16 +1325,16 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
|
||||
int bit = -1;
|
||||
|
||||
if (rqd->nr_ppas == 1) {
|
||||
print_ppa(&pblk->dev->geo, &rqd->ppa_addr, "rqd", error);
|
||||
print_ppa(pblk, &rqd->ppa_addr, "rqd", error);
|
||||
return;
|
||||
}
|
||||
|
||||
while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
|
||||
bit + 1)) < rqd->nr_ppas) {
|
||||
print_ppa(&pblk->dev->geo, &rqd->ppa_list[bit], "rqd", error);
|
||||
print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error);
|
||||
}
|
||||
|
||||
pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
|
||||
pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
|
||||
}
|
||||
|
||||
static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
|
||||
@ -1347,7 +1365,7 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
|
||||
continue;
|
||||
}
|
||||
|
||||
print_ppa(geo, ppa, "boundary", i);
|
||||
print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i);
|
||||
|
||||
return 1;
|
||||
}
|
||||
@ -1377,7 +1395,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
|
||||
|
||||
spin_lock(&line->lock);
|
||||
if (line->state != PBLK_LINESTATE_OPEN) {
|
||||
pr_err("pblk: bad ppa: line:%d,state:%d\n",
|
||||
pblk_err(pblk, "bad ppa: line:%d,state:%d\n",
|
||||
line->id, line->state);
|
||||
WARN_ON(1);
|
||||
spin_unlock(&line->lock);
|
||||
|
@ -328,13 +328,6 @@ struct cached_dev {
|
||||
*/
|
||||
atomic_t has_dirty;
|
||||
|
||||
/*
|
||||
* Set to zero by things that touch the backing volume-- except
|
||||
* writeback. Incremented by writeback. Used to determine when to
|
||||
* accelerate idle writeback.
|
||||
*/
|
||||
atomic_t backing_idle;
|
||||
|
||||
struct bch_ratelimit writeback_rate;
|
||||
struct delayed_work writeback_rate_update;
|
||||
|
||||
@ -423,9 +416,9 @@ struct cache {
|
||||
/*
|
||||
* When allocating new buckets, prio_write() gets first dibs - since we
|
||||
* may not be able to allocate at all without writing priorities and gens.
|
||||
* prio_buckets[] contains the last buckets we wrote priorities to (so
|
||||
* gc can mark them as metadata), prio_next[] contains the buckets
|
||||
* allocated for the next prio write.
|
||||
* prio_last_buckets[] contains the last buckets we wrote priorities to
|
||||
* (so gc can mark them as metadata), prio_buckets[] contains the
|
||||
* buckets allocated for the next prio write.
|
||||
*/
|
||||
uint64_t *prio_buckets;
|
||||
uint64_t *prio_last_buckets;
|
||||
@ -474,6 +467,7 @@ struct cache {
|
||||
|
||||
struct gc_stat {
|
||||
size_t nodes;
|
||||
size_t nodes_pre;
|
||||
size_t key_bytes;
|
||||
|
||||
size_t nkeys;
|
||||
@ -514,6 +508,8 @@ struct cache_set {
|
||||
struct cache_accounting accounting;
|
||||
|
||||
unsigned long flags;
|
||||
atomic_t idle_counter;
|
||||
atomic_t at_max_writeback_rate;
|
||||
|
||||
struct cache_sb sb;
|
||||
|
||||
@ -523,8 +519,10 @@ struct cache_set {
|
||||
|
||||
struct bcache_device **devices;
|
||||
unsigned devices_max_used;
|
||||
atomic_t attached_dev_nr;
|
||||
struct list_head cached_devs;
|
||||
uint64_t cached_dev_sectors;
|
||||
atomic_long_t flash_dev_dirty_sectors;
|
||||
struct closure caching;
|
||||
|
||||
struct closure sb_write;
|
||||
@ -602,6 +600,10 @@ struct cache_set {
|
||||
* rescale; when it hits 0 we rescale all the bucket priorities.
|
||||
*/
|
||||
atomic_t rescale;
|
||||
/*
|
||||
* used for GC, identify if any front side I/Os is inflight
|
||||
*/
|
||||
atomic_t search_inflight;
|
||||
/*
|
||||
* When we invalidate buckets, we use both the priority and the amount
|
||||
* of good data to determine which buckets to reuse first - to weight
|
||||
@ -995,7 +997,7 @@ void bch_open_buckets_free(struct cache_set *);
|
||||
int bch_cache_allocator_start(struct cache *ca);
|
||||
|
||||
void bch_debug_exit(void);
|
||||
int bch_debug_init(struct kobject *);
|
||||
void bch_debug_init(struct kobject *kobj);
|
||||
void bch_request_exit(void);
|
||||
int bch_request_init(void);
|
||||
|
||||
|
@ -366,6 +366,10 @@ EXPORT_SYMBOL(bch_btree_keys_init);
|
||||
|
||||
/* Binary tree stuff for auxiliary search trees */
|
||||
|
||||
/*
|
||||
* return the array index next to j when doing an in-order traversal
|
||||
* of a binary tree which is stored in a linear array
|
||||
*/
|
||||
static unsigned inorder_next(unsigned j, unsigned size)
|
||||
{
|
||||
if (j * 2 + 1 < size) {
|
||||
@ -379,6 +383,10 @@ static unsigned inorder_next(unsigned j, unsigned size)
|
||||
return j;
|
||||
}
|
||||
|
||||
/*
|
||||
* return the array index previous to j when doing an in-order traversal
|
||||
* of a binary tree which is stored in a linear array
|
||||
*/
|
||||
static unsigned inorder_prev(unsigned j, unsigned size)
|
||||
{
|
||||
if (j * 2 < size) {
|
||||
@ -421,6 +429,10 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
|
||||
return j;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the cacheline index in bset_tree->data, where j is index
|
||||
* from a linear array which stores the auxiliary binary tree
|
||||
*/
|
||||
static unsigned to_inorder(unsigned j, struct bset_tree *t)
|
||||
{
|
||||
return __to_inorder(j, t->size, t->extra);
|
||||
@ -441,6 +453,10 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
|
||||
return j;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return an index from a linear array which stores the auxiliary binary
|
||||
* tree, j is the cacheline index of t->data.
|
||||
*/
|
||||
static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
|
||||
{
|
||||
return __inorder_to_tree(j, t->size, t->extra);
|
||||
@ -546,6 +562,20 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
|
||||
return low;
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate mantissa value for struct bkey_float.
|
||||
* If most significant bit of f->exponent is not set, then
|
||||
* - f->exponent >> 6 is 0
|
||||
* - p[0] points to bkey->low
|
||||
* - p[-1] borrows bits from KEY_INODE() of bkey->high
|
||||
* If the most significant bit of f->exponent is set, then
|
||||
* - f->exponent >> 6 is 1
|
||||
* - p[0] points to bits from KEY_INODE() of bkey->high
|
||||
* - p[-1] points to other bits from KEY_INODE() of
|
||||
* bkey->high too.
|
||||
* See make_bfloat() to check when most significant bit of f->exponent
|
||||
* is set or not.
|
||||
*/
|
||||
static inline unsigned bfloat_mantissa(const struct bkey *k,
|
||||
struct bkey_float *f)
|
||||
{
|
||||
@ -570,6 +600,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
|
||||
BUG_ON(m < l || m > r);
|
||||
BUG_ON(bkey_next(p) != m);
|
||||
|
||||
/*
|
||||
* If l and r have different KEY_INODE values (different backing
|
||||
* device), f->exponent records how many least significant bits
|
||||
* are different in KEY_INODE values and sets most significant
|
||||
* bits to 1 (by +64).
|
||||
* If l and r have same KEY_INODE value, f->exponent records
|
||||
* how many different bits in least significant bits of bkey->low.
|
||||
* See bfloat_mantissa() for how the most significant bit of
|
||||
* f->exponent is used to calculate bfloat mantissa value.
|
||||
*/
|
||||
if (KEY_INODE(l) != KEY_INODE(r))
|
||||
f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
|
||||
else
|
||||
@ -633,6 +673,15 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
|
||||
}
|
||||
EXPORT_SYMBOL(bch_bset_init_next);
|
||||
|
||||
/*
|
||||
* Build auxiliary binary tree 'struct bset_tree *t', this tree is used to
|
||||
* accelerate bkey search in a btree node (pointed by bset_tree->data in
|
||||
* memory). After search in the auxiliary tree by calling bset_search_tree(),
|
||||
* a struct bset_search_iter is returned which indicates range [l, r] from
|
||||
* bset_tree->data where the searching bkey might be inside. Then a followed
|
||||
* linear comparison does the exact search, see __bch_bset_search() for how
|
||||
* the auxiliary tree is used.
|
||||
*/
|
||||
void bch_bset_build_written_tree(struct btree_keys *b)
|
||||
{
|
||||
struct bset_tree *t = bset_tree_last(b);
|
||||
@ -898,6 +947,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
|
||||
unsigned inorder, j, n = 1;
|
||||
|
||||
do {
|
||||
/*
|
||||
* A bit trick here.
|
||||
* If p < t->size, (int)(p - t->size) is a negative value and
|
||||
* the most significant bit is set, right shifting 31 bits
|
||||
* gets -1 (all bits set). If p >= t->size, the most significant bit is
|
||||
* not set, right shifting 31 bits gets 0.
|
||||
* So the following 2 lines are equivalent to
|
||||
* if (p >= t->size)
|
||||
* p = 0;
|
||||
* but a branch instruction is avoided.
|
||||
*/
|
||||
unsigned p = n << 4;
|
||||
p &= ((int) (p - t->size)) >> 31;
|
||||
|
||||
@ -907,6 +967,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
|
||||
f = &t->tree[j];
|
||||
|
||||
/*
|
||||
* Similar bit trick, use subtract operation to avoid a branch
|
||||
* instruction.
|
||||
*
|
||||
* n = (f->mantissa > bfloat_mantissa())
|
||||
* ? j * 2
|
||||
* : j * 2 + 1;
|
||||
|
@ -90,6 +90,9 @@
|
||||
|
||||
#define MAX_NEED_GC 64
|
||||
#define MAX_SAVE_PRIO 72
|
||||
#define MAX_GC_TIMES 100
|
||||
#define MIN_GC_NODES 100
|
||||
#define GC_SLEEP_MS 100
|
||||
|
||||
#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
|
||||
|
||||
@ -1008,6 +1011,13 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
|
||||
BUG_ON(b->level != level);
|
||||
}
|
||||
|
||||
if (btree_node_io_error(b)) {
|
||||
rw_unlock(write, b);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
BUG_ON(!b->written);
|
||||
|
||||
b->parent = parent;
|
||||
b->accessed = 1;
|
||||
|
||||
@ -1019,13 +1029,6 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
|
||||
for (; i <= b->keys.nsets; i++)
|
||||
prefetch(b->keys.set[i].data);
|
||||
|
||||
if (btree_node_io_error(b)) {
|
||||
rw_unlock(write, b);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
BUG_ON(!b->written);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static size_t btree_gc_min_nodes(struct cache_set *c)
|
||||
{
|
||||
size_t min_nodes;
|
||||
|
||||
/*
|
||||
* Since incremental GC would stop 100ms when front
|
||||
* side I/O comes, so when there are many btree nodes,
|
||||
* if GC only processes constant (100) nodes each time,
|
||||
* GC would last a long time, and the front side I/Os
|
||||
* would run out of the buckets (since no new bucket
|
||||
* can be allocated during GC), and be blocked again.
|
||||
* So GC should not process constant nodes, but varied
|
||||
* nodes according to the number of btree nodes, which
|
||||
* realized by dividing GC into constant(100) times,
|
||||
* so when there are many btree nodes, GC can process
|
||||
* more nodes each time, otherwise, GC will process less
|
||||
* nodes each time (but no less than MIN_GC_NODES)
|
||||
*/
|
||||
min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
|
||||
if (min_nodes < MIN_GC_NODES)
|
||||
min_nodes = MIN_GC_NODES;
|
||||
|
||||
return min_nodes;
|
||||
}
|
||||
|
||||
|
||||
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
|
||||
struct closure *writes, struct gc_stat *gc)
|
||||
{
|
||||
@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
|
||||
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
|
||||
r->b = NULL;
|
||||
|
||||
if (atomic_read(&b->c->search_inflight) &&
|
||||
gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
|
||||
gc->nodes_pre = gc->nodes;
|
||||
ret = -EAGAIN;
|
||||
break;
|
||||
}
|
||||
|
||||
if (need_resched()) {
|
||||
ret = -EAGAIN;
|
||||
break;
|
||||
@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c)
|
||||
closure_sync(&writes);
|
||||
cond_resched();
|
||||
|
||||
if (ret && ret != -EAGAIN)
|
||||
if (ret == -EAGAIN)
|
||||
schedule_timeout_interruptible(msecs_to_jiffies
|
||||
(GC_SLEEP_MS));
|
||||
else if (ret)
|
||||
pr_warn("gc failed!");
|
||||
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
|
||||
|
||||
@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
|
||||
do {
|
||||
k = bch_btree_iter_next_filter(&iter, &b->keys,
|
||||
bch_ptr_bad);
|
||||
if (k)
|
||||
if (k) {
|
||||
btree_node_prefetch(b, k);
|
||||
/*
|
||||
* initiallize c->gc_stats.nodes
|
||||
* for incremental GC
|
||||
*/
|
||||
b->c->gc_stats.nodes++;
|
||||
}
|
||||
|
||||
if (p)
|
||||
ret = btree(check_recurse, p, b, op);
|
||||
|
@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b) \
|
||||
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
||||
\
|
||||
static inline void set_btree_node_ ## flag(struct btree *b) \
|
||||
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
||||
{ set_bit(BTREE_NODE_ ## flag, &b->flags); }
|
||||
|
||||
enum btree_flags {
|
||||
BTREE_NODE_io_error,
|
||||
|
@ -199,11 +199,16 @@ static const struct file_operations debug_ops = {
|
||||
.release = single_release
|
||||
};
|
||||
|
||||
int __init closure_debug_init(void)
|
||||
void __init closure_debug_init(void)
|
||||
{
|
||||
closure_debug = debugfs_create_file("closures",
|
||||
0400, bcache_debug, NULL, &debug_ops);
|
||||
return IS_ERR_OR_NULL(closure_debug);
|
||||
if (!IS_ERR_OR_NULL(bcache_debug))
|
||||
/*
|
||||
* it is unnecessary to check return value of
|
||||
* debugfs_create_file(), we should not care
|
||||
* about this.
|
||||
*/
|
||||
closure_debug = debugfs_create_file(
|
||||
"closures", 0400, bcache_debug, NULL, &debug_ops);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl)
|
||||
|
||||
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
||||
|
||||
int closure_debug_init(void);
|
||||
void closure_debug_init(void);
|
||||
void closure_debug_create(struct closure *cl);
|
||||
void closure_debug_destroy(struct closure *cl);
|
||||
|
||||
#else
|
||||
|
||||
static inline int closure_debug_init(void) { return 0; }
|
||||
static inline void closure_debug_init(void) {}
|
||||
static inline void closure_debug_create(struct closure *cl) {}
|
||||
static inline void closure_debug_destroy(struct closure *cl) {}
|
||||
|
||||
|
@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
|
||||
struct bio_vec bv, cbv;
|
||||
struct bvec_iter iter, citer = { 0 };
|
||||
|
||||
check = bio_clone_kmalloc(bio, GFP_NOIO);
|
||||
check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
|
||||
if (!check)
|
||||
return;
|
||||
check->bi_disk = bio->bi_disk;
|
||||
check->bi_opf = REQ_OP_READ;
|
||||
check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
|
||||
check->bi_iter.bi_size = bio->bi_iter.bi_size;
|
||||
|
||||
bch_bio_map(check, NULL);
|
||||
if (bch_bio_alloc_pages(check, GFP_NOIO))
|
||||
goto out_put;
|
||||
|
||||
@ -248,11 +252,12 @@ void bch_debug_exit(void)
|
||||
debugfs_remove_recursive(bcache_debug);
|
||||
}
|
||||
|
||||
int __init bch_debug_init(struct kobject *kobj)
|
||||
void __init bch_debug_init(struct kobject *kobj)
|
||||
{
|
||||
if (!IS_ENABLED(CONFIG_DEBUG_FS))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* it is unnecessary to check return value of
|
||||
* debugfs_create_file(), we should not care
|
||||
* about this.
|
||||
*/
|
||||
bcache_debug = debugfs_create_dir("bcache", NULL);
|
||||
return IS_ERR_OR_NULL(bcache_debug);
|
||||
}
|
||||
|
@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c)
|
||||
free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
|
||||
free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
|
||||
free_fifo(&c->journal.pin);
|
||||
free_heap(&c->flush_btree);
|
||||
}
|
||||
|
||||
int bch_journal_alloc(struct cache_set *c)
|
||||
|
@ -107,7 +107,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
|
||||
/*
|
||||
* The journalling code doesn't handle the case where the keys to insert
|
||||
* is bigger than an empty write: If we just return -ENOMEM here,
|
||||
* bio_insert() and bio_invalidate() will insert the keys created so far
|
||||
* bch_data_insert_keys() will insert the keys created so far
|
||||
* and finish the rest when the keylist is empty.
|
||||
*/
|
||||
if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
|
||||
@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio)
|
||||
static void bio_complete(struct search *s)
|
||||
{
|
||||
if (s->orig_bio) {
|
||||
generic_end_io_acct(s->d->disk->queue,
|
||||
bio_data_dir(s->orig_bio),
|
||||
generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
|
||||
&s->d->disk->part0, s->start_time);
|
||||
|
||||
trace_bcache_request_end(s->d, s->orig_bio);
|
||||
@ -702,6 +701,8 @@ static void search_free(struct closure *cl)
|
||||
{
|
||||
struct search *s = container_of(cl, struct search, cl);
|
||||
|
||||
atomic_dec(&s->d->c->search_inflight);
|
||||
|
||||
if (s->iop.bio)
|
||||
bio_put(s->iop.bio);
|
||||
|
||||
@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio,
|
||||
|
||||
closure_init(&s->cl, NULL);
|
||||
do_bio_hook(s, bio, request_endio);
|
||||
atomic_inc(&d->c->search_inflight);
|
||||
|
||||
s->orig_bio = bio;
|
||||
s->cache_miss = NULL;
|
||||
@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio)
|
||||
bio->bi_end_io = ddip->bi_end_io;
|
||||
bio->bi_private = ddip->bi_private;
|
||||
|
||||
generic_end_io_acct(ddip->d->disk->queue,
|
||||
bio_data_dir(bio),
|
||||
generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
|
||||
&ddip->d->disk->part0, ddip->start_time);
|
||||
|
||||
if (bio->bi_status) {
|
||||
@ -1102,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
|
||||
generic_make_request(bio);
|
||||
}
|
||||
|
||||
static void quit_max_writeback_rate(struct cache_set *c,
|
||||
struct cached_dev *this_dc)
|
||||
{
|
||||
int i;
|
||||
struct bcache_device *d;
|
||||
struct cached_dev *dc;
|
||||
|
||||
/*
|
||||
* mutex bch_register_lock may compete with other parallel requesters,
|
||||
* or attach/detach operations on other backing device. Waiting to
|
||||
* the mutex lock may increase I/O request latency for seconds or more.
|
||||
* To avoid such situation, if mutext_trylock() failed, only writeback
|
||||
* rate of current cached device is set to 1, and __update_write_back()
|
||||
* will decide writeback rate of other cached devices (remember now
|
||||
* c->idle_counter is 0 already).
|
||||
*/
|
||||
if (mutex_trylock(&bch_register_lock)) {
|
||||
for (i = 0; i < c->devices_max_used; i++) {
|
||||
if (!c->devices[i])
|
||||
continue;
|
||||
|
||||
if (UUID_FLASH_ONLY(&c->uuids[i]))
|
||||
continue;
|
||||
|
||||
d = c->devices[i];
|
||||
dc = container_of(d, struct cached_dev, disk);
|
||||
/*
|
||||
* set writeback rate to default minimum value,
|
||||
* then let update_writeback_rate() to decide the
|
||||
* upcoming rate.
|
||||
*/
|
||||
atomic_long_set(&dc->writeback_rate.rate, 1);
|
||||
}
|
||||
mutex_unlock(&bch_register_lock);
|
||||
} else
|
||||
atomic_long_set(&this_dc->writeback_rate.rate, 1);
|
||||
}
|
||||
|
||||
/* Cached devices - read & write stuff */
|
||||
|
||||
static blk_qc_t cached_dev_make_request(struct request_queue *q,
|
||||
@ -1119,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
|
||||
atomic_set(&dc->backing_idle, 0);
|
||||
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
|
||||
if (likely(d->c)) {
|
||||
if (atomic_read(&d->c->idle_counter))
|
||||
atomic_set(&d->c->idle_counter, 0);
|
||||
/*
|
||||
* If at_max_writeback_rate of cache set is true and new I/O
|
||||
* comes, quit max writeback rate of all cached devices
|
||||
* attached to this cache set, and set at_max_writeback_rate
|
||||
* to false.
|
||||
*/
|
||||
if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
|
||||
atomic_set(&d->c->at_max_writeback_rate, 0);
|
||||
quit_max_writeback_rate(d->c, dc);
|
||||
}
|
||||
}
|
||||
|
||||
generic_start_io_acct(q,
|
||||
bio_op(bio),
|
||||
bio_sectors(bio),
|
||||
&d->disk->part0);
|
||||
|
||||
bio_set_dev(bio, dc->bdev);
|
||||
bio->bi_iter.bi_sector += dc->sb.data_offset;
|
||||
@ -1229,7 +1285,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
|
||||
struct search *s;
|
||||
struct closure *cl;
|
||||
struct bcache_device *d = bio->bi_disk->private_data;
|
||||
int rw = bio_data_dir(bio);
|
||||
|
||||
if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
|
||||
bio->bi_status = BLK_STS_IOERR;
|
||||
@ -1237,7 +1292,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
|
||||
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
|
||||
generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
|
||||
|
||||
s = search_alloc(bio, d);
|
||||
cl = &s->cl;
|
||||
@ -1254,7 +1309,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
|
||||
flash_dev_nodata,
|
||||
bcache_wq);
|
||||
return BLK_QC_T_NONE;
|
||||
} else if (rw) {
|
||||
} else if (bio_data_dir(bio)) {
|
||||
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
|
||||
&KEY(d->id, bio->bi_iter.bi_sector, 0),
|
||||
&KEY(d->id, bio_end_sector(bio), 0));
|
||||
|
@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
|
||||
goto err;
|
||||
}
|
||||
|
||||
sb->last_mount = get_seconds();
|
||||
sb->last_mount = (u32)ktime_get_real_seconds();
|
||||
err = NULL;
|
||||
|
||||
get_page(bh->b_page);
|
||||
@ -696,12 +696,14 @@ static void bcache_device_detach(struct bcache_device *d)
|
||||
{
|
||||
lockdep_assert_held(&bch_register_lock);
|
||||
|
||||
atomic_dec(&d->c->attached_dev_nr);
|
||||
|
||||
if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
|
||||
struct uuid_entry *u = d->c->uuids + d->id;
|
||||
|
||||
SET_UUID_FLASH_ONLY(u, 0);
|
||||
memcpy(u->uuid, invalid_uuid, 16);
|
||||
u->invalidated = cpu_to_le32(get_seconds());
|
||||
u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
|
||||
bch_uuid_write(d->c);
|
||||
}
|
||||
|
||||
@ -796,11 +798,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
|
||||
return idx;
|
||||
|
||||
if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
|
||||
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
|
||||
!(d->disk = alloc_disk(BCACHE_MINORS))) {
|
||||
ida_simple_remove(&bcache_device_idx, idx);
|
||||
return -ENOMEM;
|
||||
}
|
||||
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
|
||||
goto err;
|
||||
|
||||
d->disk = alloc_disk(BCACHE_MINORS);
|
||||
if (!d->disk)
|
||||
goto err;
|
||||
|
||||
set_capacity(d->disk, sectors);
|
||||
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
|
||||
@ -834,6 +837,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
|
||||
blk_queue_write_cache(q, true, true);
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
ida_simple_remove(&bcache_device_idx, idx);
|
||||
return -ENOMEM;
|
||||
|
||||
}
|
||||
|
||||
/* Cached device */
|
||||
@ -1027,7 +1035,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
|
||||
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
|
||||
uint8_t *set_uuid)
|
||||
{
|
||||
uint32_t rtime = cpu_to_le32(get_seconds());
|
||||
uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
|
||||
struct uuid_entry *u;
|
||||
struct cached_dev *exist_dc, *t;
|
||||
|
||||
@ -1070,7 +1078,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
|
||||
(BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
|
||||
BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
|
||||
memcpy(u->uuid, invalid_uuid, 16);
|
||||
u->invalidated = cpu_to_le32(get_seconds());
|
||||
u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
|
||||
u = NULL;
|
||||
}
|
||||
|
||||
@ -1138,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
|
||||
|
||||
bch_cached_dev_run(dc);
|
||||
bcache_device_link(&dc->disk, c, "bdev");
|
||||
atomic_inc(&c->attached_dev_nr);
|
||||
|
||||
/* Allow the writeback thread to proceed */
|
||||
up_write(&dc->writeback_lock);
|
||||
@ -1285,6 +1294,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
|
||||
pr_info("registered backing device %s", dc->backing_dev_name);
|
||||
|
||||
list_add(&dc->list, &uncached_devices);
|
||||
/* attach to a matched cache set if it exists */
|
||||
list_for_each_entry(c, &bch_cache_sets, list)
|
||||
bch_cached_dev_attach(dc, c, NULL);
|
||||
|
||||
@ -1311,6 +1321,8 @@ static void flash_dev_free(struct closure *cl)
|
||||
{
|
||||
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
|
||||
mutex_lock(&bch_register_lock);
|
||||
atomic_long_sub(bcache_dev_sectors_dirty(d),
|
||||
&d->c->flash_dev_dirty_sectors);
|
||||
bcache_device_free(d);
|
||||
mutex_unlock(&bch_register_lock);
|
||||
kobject_put(&d->kobj);
|
||||
@ -1390,7 +1402,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
|
||||
|
||||
get_random_bytes(u->uuid, 16);
|
||||
memset(u->label, 0, 32);
|
||||
u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
|
||||
u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
|
||||
|
||||
SET_UUID_FLASH_ONLY(u, 1);
|
||||
u->sectors = size >> 9;
|
||||
@ -1687,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
||||
c->block_bits = ilog2(sb->block_size);
|
||||
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
|
||||
c->devices_max_used = 0;
|
||||
atomic_set(&c->attached_dev_nr, 0);
|
||||
c->btree_pages = bucket_pages(c);
|
||||
if (c->btree_pages > BTREE_MAX_PAGES)
|
||||
c->btree_pages = max_t(int, c->btree_pages / 4,
|
||||
@ -1894,7 +1907,7 @@ static void run_cache_set(struct cache_set *c)
|
||||
goto err;
|
||||
|
||||
closure_sync(&cl);
|
||||
c->sb.last_mount = get_seconds();
|
||||
c->sb.last_mount = (u32)ktime_get_real_seconds();
|
||||
bcache_write_super(c);
|
||||
|
||||
list_for_each_entry_safe(dc, t, &uncached_devices, list)
|
||||
@ -2163,8 +2176,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
|
||||
if (!try_module_get(THIS_MODULE))
|
||||
return -EBUSY;
|
||||
|
||||
if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
|
||||
!(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
|
||||
path = kstrndup(buffer, size, GFP_KERNEL);
|
||||
if (!path)
|
||||
goto err;
|
||||
|
||||
sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
|
||||
if (!sb)
|
||||
goto err;
|
||||
|
||||
err = "failed to open device";
|
||||
@ -2324,13 +2341,21 @@ static int __init bcache_init(void)
|
||||
return bcache_major;
|
||||
}
|
||||
|
||||
if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
|
||||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
|
||||
bch_request_init() ||
|
||||
bch_debug_init(bcache_kobj) || closure_debug_init() ||
|
||||
bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
|
||||
if (!bcache_wq)
|
||||
goto err;
|
||||
|
||||
bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
|
||||
if (!bcache_kobj)
|
||||
goto err;
|
||||
|
||||
if (bch_request_init() ||
|
||||
sysfs_create_files(bcache_kobj, files))
|
||||
goto err;
|
||||
|
||||
bch_debug_init(bcache_kobj);
|
||||
closure_debug_init();
|
||||
|
||||
return 0;
|
||||
err:
|
||||
bcache_exit();
|
||||
|
@ -149,6 +149,7 @@ SHOW(__bch_cached_dev)
|
||||
struct cached_dev *dc = container_of(kobj, struct cached_dev,
|
||||
disk.kobj);
|
||||
const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
|
||||
int wb = dc->writeback_running;
|
||||
|
||||
#define var(stat) (dc->stat)
|
||||
|
||||
@ -170,7 +171,8 @@ SHOW(__bch_cached_dev)
|
||||
var_printf(writeback_running, "%i");
|
||||
var_print(writeback_delay);
|
||||
var_print(writeback_percent);
|
||||
sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
|
||||
sysfs_hprint(writeback_rate,
|
||||
wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
|
||||
sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
|
||||
sysfs_printf(io_error_limit, "%i", dc->error_limit);
|
||||
sysfs_printf(io_disable, "%i", dc->io_disable);
|
||||
@ -188,15 +190,22 @@ SHOW(__bch_cached_dev)
|
||||
char change[20];
|
||||
s64 next_io;
|
||||
|
||||
bch_hprint(rate, dc->writeback_rate.rate << 9);
|
||||
/*
|
||||
* Except for dirty and target, other values should
|
||||
* be 0 if writeback is not running.
|
||||
*/
|
||||
bch_hprint(rate,
|
||||
wb ? atomic_long_read(&dc->writeback_rate.rate) << 9
|
||||
: 0);
|
||||
bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
|
||||
bch_hprint(target, dc->writeback_rate_target << 9);
|
||||
bch_hprint(proportional,dc->writeback_rate_proportional << 9);
|
||||
bch_hprint(integral, dc->writeback_rate_integral_scaled << 9);
|
||||
bch_hprint(change, dc->writeback_rate_change << 9);
|
||||
|
||||
next_io = div64_s64(dc->writeback_rate.next - local_clock(),
|
||||
NSEC_PER_MSEC);
|
||||
bch_hprint(proportional,
|
||||
wb ? dc->writeback_rate_proportional << 9 : 0);
|
||||
bch_hprint(integral,
|
||||
wb ? dc->writeback_rate_integral_scaled << 9 : 0);
|
||||
bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0);
|
||||
next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(),
|
||||
NSEC_PER_MSEC) : 0;
|
||||
|
||||
return sprintf(buf,
|
||||
"rate:\t\t%s/sec\n"
|
||||
@ -255,8 +264,19 @@ STORE(__cached_dev)
|
||||
|
||||
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
|
||||
|
||||
sysfs_strtoul_clamp(writeback_rate,
|
||||
dc->writeback_rate.rate, 1, INT_MAX);
|
||||
if (attr == &sysfs_writeback_rate) {
|
||||
ssize_t ret;
|
||||
long int v = atomic_long_read(&dc->writeback_rate.rate);
|
||||
|
||||
ret = strtoul_safe_clamp(buf, v, 1, INT_MAX);
|
||||
|
||||
if (!ret) {
|
||||
atomic_long_set(&dc->writeback_rate.rate, v);
|
||||
ret = size;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
sysfs_strtoul_clamp(writeback_rate_update_seconds,
|
||||
dc->writeback_rate_update_seconds,
|
||||
@ -338,7 +358,7 @@ STORE(__cached_dev)
|
||||
if (!v)
|
||||
return size;
|
||||
}
|
||||
|
||||
if (v == -ENOENT)
|
||||
pr_err("Can't attach %s: cache set not found", buf);
|
||||
return v;
|
||||
}
|
||||
|
@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
|
||||
{
|
||||
uint64_t now = local_clock();
|
||||
|
||||
d->next += div_u64(done * NSEC_PER_SEC, d->rate);
|
||||
d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
|
||||
|
||||
/* Bound the time. Don't let us fall further than 2 seconds behind
|
||||
* (this prevents unnecessary backlog that would make it impossible
|
||||
|
@ -442,7 +442,7 @@ struct bch_ratelimit {
|
||||
* Rate at which we want to do work, in units per second
|
||||
* The units here correspond to the units passed to bch_next_delay()
|
||||
*/
|
||||
uint32_t rate;
|
||||
atomic_long_t rate;
|
||||
};
|
||||
|
||||
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user