for-6.2/block-2022-12-08

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmOScsgQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpi5ID/9pLXFYOq1+uDjU0KO/MdjMjK8Ukr34lCnk
 WkajRLheE8JBKOFDE54XJk56sQSZHX9bTWqziar0h1fioh7FlQR/tVvzsERCm2M9
 2y9THJNJygC68wgybStyiKlshFjl7TD7Kv5N9Y3xP3mkQygT+D6o8fXZk5xQbYyH
 YdFSoq4rJVHxRL03yzQiReGGIYdOUEQQh8l1FiLwLlKa3lXAey1KuxWIzksVN0KK
 aZB4QhiBpOiPgDHUVisq2XtyQjpZ2byoCImPzgrcqk9Jo4esvm/e6esrg4xlsvII
 LKFFkTmbVqjUZtFjqakFHmfuzVor4nU5f+xb90ZHExuuODYckkxWp5rWhf9QwqqI
 0ik6WYgI1/5vnHnX8f2DYzOFQf9qa/rLgg0CshyUODlD6RfHa9vntqYvlIFkmOBd
 Q7KblIoK8YTzUS1M+v7X8JQ7gDR2KwygH37Da2KJS+vgvfIb8kJGr1ZORuhJuJJ7
 Bl69gaNkHTHrqufp7UI64YXfueeuNu2J9z3zwzGoxeaFaofF/phDn0/2gCQE1fQI
 XBhsMw+ETqI6B2SPHMnzYDu2DM1S8ZTOYQlaD4G3uqgWnAM1tG707395uAy5yu4n
 D5azU1fVG4UocoNIyPujpaoSRs2zWZycEFEeUQkhyDDww/j4hlHi6H33eOnk0zsr
 wxzFGfvHfw==
 =k/vv
 -----END PGP SIGNATURE-----

Merge tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - NVMe pull requests via Christoph:
      - Support some passthrough commands without CAP_SYS_ADMIN (Kanchan
        Joshi)
      - Refactor PCIe probing and reset (Christoph Hellwig)
      - Various fabrics authentication fixes and improvements (Sagi
        Grimberg)
      - Avoid fallback to sequential scan due to transient issues (Uday
        Shankar)
      - Implement support for the DEAC bit in Write Zeroes (Christoph
        Hellwig)
      - Allow overriding the IEEE OUI and firmware revision in configfs
        for nvmet (Aleksandr Miloserdov)
      - Force reconnect when number of queue changes in nvmet (Daniel
        Wagner)
      - Minor fixes and improvements (Uros Bizjak, Joel Granados, Sagi
        Grimberg, Christoph Hellwig, Christophe JAILLET)
      - Fix and cleanup nvme-fc req allocation (Chaitanya Kulkarni)
      - Use the common tagset helpers in nvme-pci driver (Christoph
        Hellwig)
      - Cleanup the nvme-pci removal path (Christoph Hellwig)
      - Use kstrtobool() instead of strtobool (Christophe JAILLET)
      - Allow unprivileged passthrough of Identify Controller (Joel
        Granados)
      - Support io stats on the mpath device (Sagi Grimberg)
      - Minor nvmet cleanup (Sagi Grimberg)

 - MD pull requests via Song:
      - Code cleanups (Christoph)
      - Various fixes

 - Floppy pull request from Denis:
      - Fix a memory leak in the init error path (Yuan)

 - Series fixing some batch wakeup issues with sbitmap (Gabriel)

 - Removal of the pktcdvd driver that was deprecated more than 5 years
   ago, and subsequent removal of the devnode callback in struct
   block_device_operations as no users are now left (Greg)

 - Fix for partition read on an exclusively opened bdev (Jan)

 - Series of elevator API cleanups (Jinlong, Christoph)

 - Series of fixes and cleanups for blk-iocost (Kemeng)

 - Series of fixes and cleanups for blk-throttle (Kemeng)

 - Series adding concurrent support for sync queues in BFQ (Yu)

 - Series bringing drbd a bit closer to the out-of-tree maintained
   version (Christian, Joel, Lars, Philipp)

 - Misc drbd fixes (Wang)

 - blk-wbt fixes and tweaks for enable/disable (Yu)

 - Fixes for mq-deadline for zoned devices (Damien)

 - Add support for read-only and offline zones for null_blk
   (Shin'ichiro)

 - Series fixing the delayed holder tracking, as used by DM (Yu,
   Christoph)

 - Series enabling bio alloc caching for IRQ based IO (Pavel)

 - Series enabling userspace peer-to-peer DMA (Logan)

 - BFQ waker fixes (Khazhismel)

 - Series fixing elevator refcount issues (Christoph, Jinlong)

 - Series cleaning up references around queue destruction (Christoph)

 - Series doing quiesce by tagset, enabling cleanups in drivers
   (Christoph, Chao)

 - Series untangling the queue kobject and queue references (Christoph)

 - Misc fixes and cleanups (Bart, David, Dawei, Jinlong, Kemeng, Ye,
   Yang, Waiman, Shin'ichiro, Randy, Pankaj, Christoph)

* tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux: (247 commits)
  blktrace: Fix output non-blktrace event when blk_classic option enabled
  block: sed-opal: Don't include <linux/kernel.h>
  sed-opal: allow using IOC_OPAL_SAVE for locking too
  blk-cgroup: Fix typo in comment
  block: remove bio_set_op_attrs
  nvmet: don't open-code NVME_NS_ATTR_RO enumeration
  nvme-pci: use the tagset alloc/free helpers
  nvme: add the Apple shared tag workaround to nvme_alloc_io_tag_set
  nvme: only set reserved_tags in nvme_alloc_io_tag_set for fabrics controllers
  nvme: consolidate setting the tagset flags
  nvme: pass nr_maps explicitly to nvme_alloc_io_tag_set
  block: bio_copy_data_iter
  nvme-pci: split out a nvme_pci_ctrl_is_dead helper
  nvme-pci: return early on ctrl state mismatch in nvme_reset_work
  nvme-pci: rename nvme_disable_io_queues
  nvme-pci: cleanup nvme_suspend_queue
  nvme-pci: remove nvme_pci_disable
  nvme-pci: remove nvme_disable_admin_queue
  nvme: merge nvme_shutdown_ctrl into nvme_disable_ctrl
  nvme: use nvme_wait_ready in nvme_shutdown_ctrl
  ...
This commit is contained in:
Linus Torvalds 2022-12-13 10:43:59 -08:00
commit ce8a79d560
144 changed files with 3235 additions and 5913 deletions

View File

@ -1,18 +0,0 @@
What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7]
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:
The pktcdvd module (packet writing driver) creates
these files in debugfs:
/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/
==== ====== ====================================
info 0444 Lots of driver statistics and infos.
==== ====== ====================================
Example::
cat /sys/kernel/debug/pktcdvd/pktcdvd0/info

View File

@ -407,6 +407,16 @@ Description:
file contains a '1' if the memory has been published for
use outside the driver that owns the device.
What: /sys/bus/pci/devices/.../p2pmem/allocate
Date: August 2022
Contact: Logan Gunthorpe <logang@deltatee.com>
Description:
This file allows mapping p2pmem into userspace. For each
mmap() call on this file, the kernel will allocate a chunk
of Peer-to-Peer memory for use in Peer-to-Peer transactions.
This memory can be used in O_DIRECT calls to NVMe backed
files for Peer-to-Peer copies.
What: /sys/bus/pci/devices/.../link/clkpm
/sys/bus/pci/devices/.../link/l0s_aspm
/sys/bus/pci/devices/.../link/l1_aspm

View File

@ -1,97 +0,0 @@
sysfs interface
---------------
The pktcdvd module (packet writing driver) creates the following files in the
sysfs: (<devid> is in the format major:minor)
What: /sys/class/pktcdvd/add
What: /sys/class/pktcdvd/remove
What: /sys/class/pktcdvd/device_map
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:
========== ==============================================
add (WO) Write a block device id (major:minor) to
create a new pktcdvd device and map it to the
block device.
remove (WO) Write the pktcdvd device id (major:minor)
to remove the pktcdvd device.
device_map (RO) Shows the device mapping in format:
pktcdvd[0-7] <pktdevid> <blkdevid>
========== ==============================================
What: /sys/class/pktcdvd/pktcdvd[0-7]/dev
What: /sys/class/pktcdvd/pktcdvd[0-7]/uevent
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:
dev: (RO) Device id
uevent: (WO) To send a uevent
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_started
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_finished
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_written
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read_gather
What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/reset
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:
packets_started: (RO) Number of started packets.
packets_finished: (RO) Number of finished packets.
kb_written: (RO) kBytes written.
kb_read: (RO) kBytes read.
kb_read_gather: (RO) kBytes read to fill write packets.
reset: (WO) Write any value to it to reset
pktcdvd device statistic values, like
bytes read/written.
What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/size
What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_off
What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_on
Date: Oct. 2006
KernelVersion: 2.6.20
Contact: Thomas Maier <balagi@justmail.de>
Description:
============== ================================================
size (RO) Contains the size of the bio write queue.
congestion_off (RW) If bio write queue size is below this mark,
accept new bio requests from the block layer.
congestion_on (RW) If bio write queue size is higher as this
mark, do no longer accept bio write requests
from the block layer and wait till the pktcdvd
device has processed enough bio's so that bio
write queue size is below congestion off mark.
A value of <= 0 disables congestion control.
============== ================================================
Example:
--------
To use the pktcdvd sysfs interface directly, you can do::
# create a new pktcdvd device mapped to /dev/hdc
echo "22:0" >/sys/class/pktcdvd/add
cat /sys/class/pktcdvd/device_map
# assuming device pktcdvd0 was created, look at stat's
cat /sys/class/pktcdvd/pktcdvd0/stat/kb_written
# print the device id of the mapped block device
fgrep pktcdvd0 /sys/class/pktcdvd/device_map
# remove device, using pktcdvd0 device id 253:0
echo "253:0" >/sys/class/pktcdvd/remove

View File

@ -142,7 +142,7 @@ Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
of inline encryption using the kernel crypto API. blk-crypto-fallback is built
into the block layer, so it works on any block device without any special setup.
Essentially, when a bio with an encryption context is submitted to a
request_queue that doesn't support that encryption context, the block layer will
block_device that doesn't support that encryption context, the block layer will
handle en/decryption of the bio using blk-crypto-fallback.
For encryption, the data cannot be encrypted in-place, as callers usually rely
@ -187,7 +187,7 @@ API presented to users of the block layer
``blk_crypto_config_supported()`` allows users to check ahead of time whether
inline encryption with particular crypto settings will work on a particular
request_queue -- either via hardware or via blk-crypto-fallback. This function
block_device -- either via hardware or via blk-crypto-fallback. This function
takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
the actual bytes of the key and instead just contains the algorithm, data unit
size, etc. This function can be useful if blk-crypto-fallback is disabled.
@ -195,7 +195,7 @@ size, etc. This function can be useful if blk-crypto-fallback is disabled.
``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
Users must call ``blk_crypto_start_using_key()`` before actually starting to use
a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()``
a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()``
was called earlier). This is needed to initialize blk-crypto-fallback if it
will be needed. This must not be called from the data path, as this may have to
allocate resources, which may deadlock in that case.
@ -207,7 +207,7 @@ for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
later, as that happens automatically when the bio is freed or reset.
Finally, when done using inline encryption with a blk_crypto_key on a
request_queue, users must call ``blk_crypto_evict_key()``. This ensures that
block_device, users must call ``blk_crypto_evict_key()``. This ensures that
the key is evicted from all keyslots it may be programmed into and unlinked from
any kernel data structures it may be linked into.
@ -221,9 +221,9 @@ as follows:
5. ``blk_crypto_evict_key()`` (after all I/O has completed)
6. Zeroize the blk_crypto_key (this has no dedicated function)
If a blk_crypto_key is being used on multiple request_queues, then
If a blk_crypto_key is being used on multiple block_devices, then
``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
and ``blk_crypto_evict_key()`` must be called on each request_queue.
and ``blk_crypto_evict_key()`` must be called on each block_device.
API presented to device drivers
===============================

View File

@ -16430,13 +16430,6 @@ S: Supported
F: Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml
F: drivers/input/keyboard/pinephone-keyboard.c
PKTCDVD DRIVER
M: linux-block@vger.kernel.org
S: Orphan
F: drivers/block/pktcdvd.c
F: include/linux/pktcdvd.h
F: include/uapi/linux/pktcdvd.h
PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
M: Tomasz Duszynski <tduszyns@gmail.com>
S: Maintained

View File

@ -224,7 +224,7 @@ int fsync_bdev(struct block_device *bdev)
EXPORT_SYMBOL(fsync_bdev);
/**
* freeze_bdev -- lock a filesystem and force it into a consistent state
* freeze_bdev - lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
* If a superblock is found on this device, we take the s_umount semaphore
@ -268,7 +268,7 @@ int freeze_bdev(struct block_device *bdev)
EXPORT_SYMBOL(freeze_bdev);
/**
* thaw_bdev -- unlock filesystem
* thaw_bdev - unlock filesystem
* @bdev: blockdevice to unlock
*
* Unlocks the filesystem and marks it writeable again after freeze_bdev().

View File

@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
{
blkg_rwstat_add(&bfqg->stats.queued, opf, 1);
bfqg_stats_end_empty_time(&bfqg->stats);
if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
if (!(bfqq == bfqg->bfqd->in_service_queue))
bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}
@ -552,6 +552,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
*/
bfqg->bfqd = bfqd;
bfqg->active_entities = 0;
bfqg->num_queues_with_pending_reqs = 0;
bfqg->online = true;
bfqg->rq_pos_tree = RB_ROOT;
}
@ -645,6 +646,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_entity *entity = &bfqq->entity;
struct bfq_group *old_parent = bfqq_group(bfqq);
bool has_pending_reqs = false;
/*
* No point to move bfqq to the same group, which can happen when
@ -665,6 +667,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
bfqq->ref++;
if (entity->in_groups_with_pending_reqs) {
has_pending_reqs = true;
bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
}
/* If bfqq is empty, then bfq_bfqq_expire also invokes
* bfq_del_bfqq_busy, thereby removing bfqq and its entity
* from data structures related to current group. Otherwise we
@ -692,6 +699,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/* pin down bfqg and its associated blkg */
bfqg_and_blkg_get(bfqg);
if (has_pending_reqs)
bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
if (bfq_bfqq_busy(bfqq)) {
if (unlikely(!bfqd->nonrot_with_queueing))
bfq_pos_tree_add_move(bfqd, bfqq);

View File

@ -820,7 +820,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* much easier to maintain the needed state:
* 1) all active queues have the same weight,
* 2) all active queues belong to the same I/O-priority class,
* 3) there are no active groups.
* 3) there is at most one active group.
* In particular, the last condition is always true if hierarchical
* support or the cgroups interface are not enabled, thus no state
* needs to be maintained in this case.
@ -852,7 +852,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
return varied_queue_weights || multiple_classes_busy
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|| bfqd->num_groups_with_pending_reqs > 0
|| bfqd->num_groups_with_pending_reqs > 1
#endif
;
}
@ -870,9 +870,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
* In most scenarios, the rate at which nodes are created/destroyed
* should be low too.
*/
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct rb_root_cached *root)
void bfq_weights_tree_add(struct bfq_queue *bfqq)
{
struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree;
struct bfq_entity *entity = &bfqq->entity;
struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
bool leftmost = true;
@ -944,13 +944,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* See the comments to the function bfq_weights_tree_add() for considerations
* about overhead.
*/
void __bfq_weights_tree_remove(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
struct rb_root_cached *root)
void bfq_weights_tree_remove(struct bfq_queue *bfqq)
{
struct rb_root_cached *root;
if (!bfqq->weight_counter)
return;
root = &bfqq->bfqd->queue_weights_tree;
bfqq->weight_counter->num_active--;
if (bfqq->weight_counter->num_active > 0)
goto reset_entity_pointer;
@ -963,59 +964,6 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
bfq_put_queue(bfqq);
}
/*
* Invoke __bfq_weights_tree_remove on bfqq and decrement the number
* of active groups for each queue's inactive parent entity.
*/
void bfq_weights_tree_remove(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
struct bfq_entity *entity = bfqq->entity.parent;
for_each_entity(entity) {
struct bfq_sched_data *sd = entity->my_sched_data;
if (sd->next_in_service || sd->in_service_entity) {
/*
* entity is still active, because either
* next_in_service or in_service_entity is not
* NULL (see the comments on the definition of
* next_in_service for details on why
* in_service_entity must be checked too).
*
* As a consequence, its parent entities are
* active as well, and thus this loop must
* stop here.
*/
break;
}
/*
* The decrement of num_groups_with_pending_reqs is
* not performed immediately upon the deactivation of
* entity, but it is delayed to when it also happens
* that the first leaf descendant bfqq of entity gets
* all its pending requests completed. The following
* instructions perform this delayed decrement, if
* needed. See the comments on
* num_groups_with_pending_reqs for details.
*/
if (entity->in_groups_with_pending_reqs) {
entity->in_groups_with_pending_reqs = false;
bfqd->num_groups_with_pending_reqs--;
}
}
/*
* Next function is invoked last, because it causes bfqq to be
* freed if the following holds: bfqq is not in service and
* has no dispatched request. DO NOT use bfqq after the next
* function invocation.
*/
__bfq_weights_tree_remove(bfqd, bfqq,
&bfqd->queue_weights_tree);
}
/*
* Return expired entry, or NULL to just start from scratch in rbtree.
*/
@ -2135,7 +2083,9 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) ||
now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC)
now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq ||
bfqq == &bfqd->oom_bfqq)
return;
/*
@ -2373,22 +2323,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq)
return 0;
}
#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
bfqd->rq_in_driver++;
}
static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
bfqd->rq_in_driver--;
}
#endif
static void bfq_remove_request(struct request_queue *q,
struct request *rq)
{
@ -6261,7 +6195,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
*/
bfqq->budget_timeout = jiffies;
bfq_weights_tree_remove(bfqd, bfqq);
bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
bfq_weights_tree_remove(bfqq);
}
now_ns = ktime_get_ns();
@ -6784,6 +6719,12 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
true, is_sync,
NULL);
if (unlikely(bfqq == &bfqd->oom_bfqq))
bfqq_already_existing = true;
} else
bfqq_already_existing = true;
if (!bfqq_already_existing) {
bfqq->waker_bfqq = old_bfqq->waker_bfqq;
bfqq->tentative_waker_bfqq = NULL;
@ -6797,8 +6738,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
if (bfqq->waker_bfqq)
hlist_add_head(&bfqq->woken_list_node,
&bfqq->waker_bfqq->woken_list);
} else
bfqq_already_existing = true;
}
}
}
@ -7045,6 +6985,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
#endif
blk_stat_disable_accounting(bfqd->queue);
clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
wbt_enable_default(bfqd->queue);
kfree(bfqd);
@ -7190,6 +7131,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
wbt_disable_default(q);
blk_stat_enable_accounting(q);

View File

@ -492,27 +492,27 @@ struct bfq_data {
struct rb_root_cached queue_weights_tree;
/*
* Number of groups with at least one descendant process that
* Number of groups with at least one process that
* has at least one request waiting for completion. Note that
* this accounts for also requests already dispatched, but not
* yet completed. Therefore this number of groups may differ
* (be larger) than the number of active groups, as a group is
* considered active only if its corresponding entity has
* descendant queues with at least one request queued. This
* queues with at least one request queued. This
* number is used to decide whether a scenario is symmetric.
* For a detailed explanation see comments on the computation
* of the variable asymmetric_scenario in the function
* bfq_better_to_idle().
*
* However, it is hard to compute this number exactly, for
* groups with multiple descendant processes. Consider a group
* that is inactive, i.e., that has no descendant process with
* groups with multiple processes. Consider a group
* that is inactive, i.e., that has no process with
* pending I/O inside BFQ queues. Then suppose that
* num_groups_with_pending_reqs is still accounting for this
* group, because the group has descendant processes with some
* group, because the group has processes with some
* I/O request still in flight. num_groups_with_pending_reqs
* should be decremented when the in-flight request of the
* last descendant process is finally completed (assuming that
* last process is finally completed (assuming that
* nothing else has changed for the group in the meantime, in
* terms of composition of the group and active/inactive state of child
* groups and processes). To accomplish this, an additional
@ -521,7 +521,7 @@ struct bfq_data {
* we resort to the following tradeoff between simplicity and
* accuracy: for an inactive group that is still counted in
* num_groups_with_pending_reqs, we decrement
* num_groups_with_pending_reqs when the first descendant
* num_groups_with_pending_reqs when the first
* process of the group remains with no request waiting for
* completion.
*
@ -529,12 +529,12 @@ struct bfq_data {
* carefulness: to avoid multiple decrements, we flag a group,
* more precisely an entity representing a group, as still
* counted in num_groups_with_pending_reqs when it becomes
* inactive. Then, when the first descendant queue of the
* inactive. Then, when the first queue of the
* entity remains with no request waiting for completion,
* num_groups_with_pending_reqs is decremented, and this flag
* is reset. After this flag is reset for the entity,
* num_groups_with_pending_reqs won't be decremented any
* longer in case a new descendant queue of the entity remains
* longer in case a new queue of the entity remains
* with no request waiting for completion.
*/
unsigned int num_groups_with_pending_reqs;
@ -931,7 +931,7 @@ struct bfq_group {
struct bfq_entity entity;
struct bfq_sched_data sched_data;
void *bfqd;
struct bfq_data *bfqd;
struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
struct bfq_queue *async_idle_bfqq;
@ -939,6 +939,7 @@ struct bfq_group {
struct bfq_entity *my_entity;
int active_entities;
int num_queues_with_pending_reqs;
struct rb_root rq_pos_tree;
@ -968,13 +969,8 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct rb_root_cached *root);
void __bfq_weights_tree_remove(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
struct rb_root_cached *root);
void bfq_weights_tree_remove(struct bfq_data *bfqd,
struct bfq_queue *bfqq);
void bfq_weights_tree_add(struct bfq_queue *bfqq);
void bfq_weights_tree_remove(struct bfq_queue *bfqq);
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
@ -1078,6 +1074,8 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool expiration);
void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */

View File

@ -218,6 +218,24 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
return false;
}
static void bfq_inc_active_entities(struct bfq_entity *entity)
{
struct bfq_sched_data *sd = entity->sched_data;
struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
if (bfqg != bfqg->bfqd->root_group)
bfqg->active_entities++;
}
static void bfq_dec_active_entities(struct bfq_entity *entity)
{
struct bfq_sched_data *sd = entity->sched_data;
struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
if (bfqg != bfqg->bfqd->root_group)
bfqg->active_entities--;
}
#else /* CONFIG_BFQ_GROUP_IOSCHED */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
@ -230,6 +248,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
return true;
}
static void bfq_inc_active_entities(struct bfq_entity *entity)
{
}
static void bfq_dec_active_entities(struct bfq_entity *entity)
{
}
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
@ -456,11 +482,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_sched_data *sd = NULL;
struct bfq_group *bfqg = NULL;
struct bfq_data *bfqd = NULL;
#endif
bfq_insert(&st->active, entity);
@ -471,17 +492,10 @@ static void bfq_active_insert(struct bfq_service_tree *st,
bfq_update_active_tree(node);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
sd = entity->sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
if (bfqq)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
if (bfqg != bfqd->root_group)
bfqg->active_entities++;
#endif
bfq_inc_active_entities(entity);
}
/**
@ -558,29 +572,16 @@ static void bfq_active_extract(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_sched_data *sd = NULL;
struct bfq_group *bfqg = NULL;
struct bfq_data *bfqd = NULL;
#endif
node = bfq_find_deepest(&entity->rb_node);
bfq_extract(&st->active, entity);
if (node)
bfq_update_active_tree(node);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
sd = entity->sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
if (bfqq)
list_del(&bfqq->bfqq_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
if (bfqg != bfqd->root_group)
bfqg->active_entities--;
#endif
bfq_dec_active_entities(entity);
}
/**
@ -706,22 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
if (entity->prio_changed) {
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
unsigned int prev_weight, new_weight;
struct bfq_data *bfqd = NULL;
struct rb_root_cached *root;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_sched_data *sd;
struct bfq_group *bfqg;
#endif
if (bfqq)
bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
else {
sd = entity->my_sched_data;
bfqg = container_of(sd, struct bfq_group, sched_data);
bfqd = (struct bfq_data *)bfqg->bfqd;
}
#endif
/* Matches the smp_wmb() in bfq_group_set_weight. */
smp_rmb();
@ -770,19 +755,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
* queue, remove the entity from its old weight counter (if
* there is a counter associated with the entity).
*/
if (prev_weight != new_weight && bfqq) {
root = &bfqd->queue_weights_tree;
__bfq_weights_tree_remove(bfqd, bfqq, root);
}
if (prev_weight != new_weight && bfqq)
bfq_weights_tree_remove(bfqq);
entity->weight = new_weight;
/*
* Add the entity, if it is not a weight-raised queue,
* to the counter associated with its new weight.
*/
if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) {
/* If we get here, root has been initialized. */
bfq_weights_tree_add(bfqd, bfqq, root);
}
if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1)
bfq_weights_tree_add(bfqq);
new_st->wsum += entity->weight;
@ -984,19 +965,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
entity->on_st_or_in_serv = true;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
struct bfq_group *bfqg =
container_of(entity, struct bfq_group, entity);
struct bfq_data *bfqd = bfqg->bfqd;
if (!entity->in_groups_with_pending_reqs) {
entity->in_groups_with_pending_reqs = true;
bfqd->num_groups_with_pending_reqs++;
}
}
#endif
bfq_update_fin_time_enqueue(entity, st, backshifted);
}
@ -1082,12 +1050,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity)
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
struct bfq_sched_data *sd,
bool non_blocking_wait_rq)
{
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
if (sd->in_service_entity == entity || entity->tree == &st->active)
if (entity->sched_data->in_service_entity == entity ||
entity->tree == &st->active)
/*
* in service or already queued on the active tree,
* requeue or reposition
@ -1119,14 +1087,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
bool non_blocking_wait_rq,
bool requeue, bool expiration)
{
struct bfq_sched_data *sd;
for_each_entity(entity) {
sd = entity->sched_data;
__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
if (!bfq_update_next_in_service(sd, entity, expiration) &&
!requeue)
__bfq_activate_requeue_entity(entity, non_blocking_wait_rq);
if (!bfq_update_next_in_service(entity->sched_data, entity,
expiration) && !requeue)
break;
}
}
@ -1646,6 +1610,32 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq == bfqd->in_service_queue, expiration);
}
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
if (!entity->in_groups_with_pending_reqs) {
entity->in_groups_with_pending_reqs = true;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++))
bfqq->bfqd->num_groups_with_pending_reqs++;
#endif
}
}
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
if (entity->in_groups_with_pending_reqs) {
entity->in_groups_with_pending_reqs = false;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs))
bfqq->bfqd->num_groups_with_pending_reqs--;
#endif
}
}
/*
* Called when the bfqq no longer has requests pending, remove it from
* the service tree. As a special case, it can be invoked during an
@ -1668,8 +1658,14 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration)
bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
if (!bfqq->dispatched)
bfq_weights_tree_remove(bfqd, bfqq);
if (!bfqq->dispatched) {
bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
/*
* Next function is invoked last, because it causes bfqq to be
* freed. DO NOT use bfqq after the next function invocation.
*/
bfq_weights_tree_remove(bfqq);
}
}
/*
@ -1686,10 +1682,11 @@ void bfq_add_bfqq_busy(struct bfq_queue *bfqq)
bfq_mark_bfqq_busy(bfqq);
bfqd->busy_queues[bfqq->ioprio_class - 1]++;
if (!bfqq->dispatched)
if (!bfqq->dispatched) {
bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
if (bfqq->wr_coeff == 1)
bfq_weights_tree_add(bfqd, bfqq,
&bfqd->queue_weights_tree);
bfq_weights_tree_add(bfqq);
}
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues++;

View File

@ -25,9 +25,15 @@
#include "blk-rq-qos.h"
#include "blk-cgroup.h"
#define ALLOC_CACHE_THRESHOLD 16
#define ALLOC_CACHE_SLACK 64
#define ALLOC_CACHE_MAX 256
struct bio_alloc_cache {
struct bio *free_list;
struct bio *free_list_irq;
unsigned int nr;
unsigned int nr_irq;
};
static struct biovec_slab {
@ -408,6 +414,22 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
queue_work(bs->rescue_workqueue, &bs->rescue_work);
}
static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
{
unsigned long flags;
/* cache->free_list must be empty */
if (WARN_ON_ONCE(cache->free_list))
return;
local_irq_save(flags);
cache->free_list = cache->free_list_irq;
cache->free_list_irq = NULL;
cache->nr += cache->nr_irq;
cache->nr_irq = 0;
local_irq_restore(flags);
}
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
struct bio_set *bs)
@ -417,8 +439,12 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
cache = per_cpu_ptr(bs->cache, get_cpu());
if (!cache->free_list) {
put_cpu();
return NULL;
if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
bio_alloc_irq_cache_splice(cache);
if (!cache->free_list) {
put_cpu();
return NULL;
}
}
bio = cache->free_list;
cache->free_list = bio->bi_next;
@ -462,9 +488,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
* for per bio allocations.
*
* If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process
* context, not hard/soft IRQ.
*
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
@ -526,6 +549,8 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
}
if (unlikely(!p))
return NULL;
if (!mempool_is_saturated(&bs->bio_pool))
opf &= ~REQ_ALLOC_CACHE;
bio = p + bs->front_pad;
if (nr_vecs > BIO_INLINE_VECS) {
@ -676,11 +701,8 @@ void guard_bio_eod(struct bio *bio)
bio_truncate(bio, maxsector << 9);
}
#define ALLOC_CACHE_MAX 512
#define ALLOC_CACHE_SLACK 64
static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
unsigned int nr)
static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
unsigned int nr)
{
unsigned int i = 0;
struct bio *bio;
@ -692,6 +714,17 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
if (++i == nr)
break;
}
return i;
}
static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
unsigned int nr)
{
nr -= __bio_alloc_cache_prune(cache, nr);
if (!READ_ONCE(cache->free_list)) {
bio_alloc_irq_cache_splice(cache);
__bio_alloc_cache_prune(cache, nr);
}
}
static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
@ -725,6 +758,35 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
bs->cache = NULL;
}
static inline void bio_put_percpu_cache(struct bio *bio)
{
struct bio_alloc_cache *cache;
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) {
put_cpu();
bio_free(bio);
return;
}
bio_uninit(bio);
if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
bio->bi_next = cache->free_list;
cache->free_list = bio;
cache->nr++;
} else {
unsigned long flags;
local_irq_save(flags);
bio->bi_next = cache->free_list_irq;
cache->free_list_irq = bio;
cache->nr_irq++;
local_irq_restore(flags);
}
put_cpu();
}
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@ -740,20 +802,10 @@ void bio_put(struct bio *bio)
if (!atomic_dec_and_test(&bio->__bi_cnt))
return;
}
if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) {
struct bio_alloc_cache *cache;
bio_uninit(bio);
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
bio->bi_next = cache->free_list;
cache->free_list = bio;
if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
put_cpu();
} else {
if (bio->bi_opf & REQ_ALLOC_CACHE)
bio_put_percpu_cache(bio);
else
bio_free(bio);
}
}
EXPORT_SYMBOL(bio_put);
@ -863,6 +915,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
return false;
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
return false;
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
if (*same_page)
@ -1195,6 +1249,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
unsigned int gup_flags = 0;
ssize_t size, left;
unsigned len, i = 0;
size_t offset, trim;
@ -1208,6 +1263,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
gup_flags |= FOLL_PCI_P2PDMA;
/*
* Each segment in the iov is required to be a block size multiple.
* However, we may not be able to get the entire segment if it spans
@ -1215,8 +1273,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* result to ensure the bio's total size is correct. The remainder of
* the iov data will be picked up in the next bio iteration.
*/
size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
nr_pages, &offset);
size = iov_iter_get_pages(iter, pages,
UINT_MAX - bio->bi_iter.bi_size,
nr_pages, &offset, gup_flags);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
@ -1342,27 +1401,6 @@ void __bio_advance(struct bio *bio, unsigned bytes)
}
EXPORT_SYMBOL(__bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
while (src_iter->bi_size && dst_iter->bi_size) {
struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
void *src_buf = bvec_kmap_local(&src_bv);
void *dst_buf = bvec_kmap_local(&dst_bv);
memcpy(dst_buf, src_buf, bytes);
kunmap_local(dst_buf);
kunmap_local(src_buf);
bio_advance_iter_single(src, src_iter, bytes);
bio_advance_iter_single(dst, dst_iter, bytes);
}
}
EXPORT_SYMBOL(bio_copy_data_iter);
/**
* bio_copy_data - copy contents of data buffers from one bio to another
* @src: source bio
@ -1376,7 +1414,21 @@ void bio_copy_data(struct bio *dst, struct bio *src)
struct bvec_iter src_iter = src->bi_iter;
struct bvec_iter dst_iter = dst->bi_iter;
bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
while (src_iter.bi_size && dst_iter.bi_size) {
struct bio_vec src_bv = bio_iter_iovec(src, src_iter);
struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter);
unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
void *src_buf = bvec_kmap_local(&src_bv);
void *dst_buf = bvec_kmap_local(&dst_bv);
memcpy(dst_buf, src_buf, bytes);
kunmap_local(dst_buf);
kunmap_local(src_buf);
bio_advance_iter_single(src, &src_iter, bytes);
bio_advance_iter_single(dst, &dst_iter, bytes);
}
}
EXPORT_SYMBOL(bio_copy_data);

View File

@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
#define BLKG_DESTROY_BATCH_SIZE 64
/*
* Lockless lists for tracking IO stats update
*
* New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
* There are multiple blkg's (one for each block device) attached to each
* blkcg. The rstat code keeps track of which cpu has IO stats updated,
* but it doesn't know which blkg has the updated stats. If there are many
* block devices in a system, the cost of iterating all the blkg's to flush
* out the IO stats can be high. To reduce such overhead, a set of percpu
* lockless lists (lhead) per blkcg are used to track the set of recently
* updated iostat_cpu's since the last flush. An iostat_cpu will be put
* onto the lockless list on the update side [blk_cgroup_bio_start()] if
* not there yet and then removed when being flushed [blkcg_rstat_flush()].
* References to blkg are gotten and then put back in the process to
* protect against blkg removal.
*
* Return: 0 if successful or -ENOMEM if allocation fails.
*/
static int init_blkcg_llists(struct blkcg *blkcg)
{
int cpu;
blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
if (!blkcg->lhead)
return -ENOMEM;
for_each_possible_cpu(cpu)
init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
return 0;
}
/**
* blkcg_css - find the current css
*
@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
blkg->blkcg = blkcg;
u64_stats_init(&blkg->iostat.sync);
for_each_possible_cpu(cpu)
for_each_possible_cpu(cpu) {
u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
}
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@ -577,7 +610,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
* @pd: policy private data of interest
* @v: value to print
*
* Print @v to @sf for the device assocaited with @pd.
* Print @v to @sf for the device associated with @pd.
*/
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
@ -765,7 +798,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
* blkg_conf_finish - finish up per-blkg config update
* @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
* @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
*
* Finish up after per-blkg config update. This function must be paired
* with blkg_conf_prep().
@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
struct llist_node *lnode;
struct blkg_iostat_set *bisc, *next_bisc;
/* Root-level stats are sourced from system-wide IO stats */
if (!cgroup_parent(css->cgroup))
@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
rcu_read_lock();
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
lnode = llist_del_all(lhead);
if (!lnode)
goto out;
/*
* Iterate only the iostat_cpu's queued in the lockless list.
*/
llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
struct blkcg_gq *blkg = bisc->blkg;
struct blkcg_gq *parent = blkg->parent;
struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
struct blkg_iostat cur;
unsigned int seq;
WRITE_ONCE(bisc->lqueued, false);
/* fetch the current per-cpu values */
do {
seq = u64_stats_fetch_begin(&bisc->sync);
@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
percpu_ref_put(&blkg->refcnt);
}
out:
rcu_read_unlock();
}
@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
mutex_unlock(&blkcg_pol_mutex);
free_percpu(blkcg->lhead);
kfree(blkcg);
}
@ -1139,7 +1186,6 @@ static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct blkcg *blkcg;
struct cgroup_subsys_state *ret;
int i;
mutex_lock(&blkcg_pol_mutex);
@ -1148,12 +1194,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
blkcg = &blkcg_root;
} else {
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
if (!blkcg) {
ret = ERR_PTR(-ENOMEM);
if (!blkcg)
goto unlock;
}
}
if (init_blkcg_llists(blkcg))
goto free_blkcg;
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkcg_policy_data *cpd;
@ -1168,10 +1215,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
continue;
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
if (!cpd) {
ret = ERR_PTR(-ENOMEM);
if (!cpd)
goto free_pd_blkcg;
}
blkcg->cpd[i] = cpd;
cpd->blkcg = blkcg;
cpd->plid = i;
@ -1195,12 +1241,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
for (i--; i >= 0; i--)
if (blkcg->cpd[i])
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
free_percpu(blkcg->lhead);
free_blkcg:
if (blkcg != &blkcg_root)
kfree(blkcg);
unlock:
mutex_unlock(&blkcg_pol_mutex);
return ret;
return ERR_PTR(-ENOMEM);
}
static int blkcg_css_online(struct cgroup_subsys_state *css)
@ -1784,7 +1831,7 @@ void blkcg_maybe_throttle_current(void)
/**
* blkcg_schedule_throttle - this task needs to check for throttling
* @gendisk: disk to throttle
* @disk: disk to throttle
* @use_memdelay: do we charge this to memory delay for PSI
*
* This is called by the IO controller when we know there's delay accumulated
@ -1943,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio)
void blk_cgroup_bio_start(struct bio *bio)
{
struct blkcg *blkcg = bio->bi_blkg->blkcg;
int rwd = blk_cgroup_io_type(bio), cpu;
struct blkg_iostat_set *bis;
unsigned long flags;
@ -1961,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio)
}
bis->cur.ios[rwd]++;
/*
* If the iostat_cpu isn't in a lockless list, put it into the
* list to indicate that a stat update is pending.
*/
if (!READ_ONCE(bis->lqueued)) {
struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
llist_add(&bis->lnode, lhead);
WRITE_ONCE(bis->lqueued, true);
percpu_ref_get(&bis->blkg->refcnt);
}
u64_stats_update_end_irqrestore(&bis->sync, flags);
if (cgroup_subsys_on_dfl(io_cgrp_subsys))
cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
cgroup_rstat_updated(blkcg->css.cgroup, cpu);
put_cpu();
}

View File

@ -18,6 +18,7 @@
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
#include <linux/llist.h>
struct blkcg_gq;
struct blkg_policy_data;
@ -43,6 +44,9 @@ struct blkg_iostat {
struct blkg_iostat_set {
struct u64_stats_sync sync;
struct blkcg_gq *blkg;
struct llist_node lnode;
int lqueued; /* queued in llist */
struct blkg_iostat cur;
struct blkg_iostat last;
};
@ -97,6 +101,12 @@ struct blkcg {
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
/*
* List of updated percpu blkg_iostat_set's since the last flush.
*/
struct llist_head __percpu *lhead;
#ifdef CONFIG_BLK_CGROUP_FC_APPID
char fc_app_id[FC_APPID_LEN];
#endif

View File

@ -59,13 +59,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
DEFINE_IDA(blk_queue_ida);
static DEFINE_IDA(blk_queue_ida);
/*
* For queue allocation
*/
struct kmem_cache *blk_requestq_cachep;
struct kmem_cache *blk_requestq_srcu_cachep;
static struct kmem_cache *blk_requestq_cachep;
/*
* Controlling structure to kblockd
@ -253,19 +252,44 @@ void blk_clear_pm_only(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
kmem_cache_free(blk_requestq_cachep,
container_of(rcu_head, struct request_queue, rcu_head));
}
static void blk_free_queue(struct request_queue *q)
{
percpu_ref_exit(&q->q_usage_counter);
if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
if (queue_is_mq(q))
blk_mq_release(q);
ida_free(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
/**
* blk_put_queue - decrement the request_queue refcount
* @q: the request_queue structure to decrement the refcount for
*
* Decrements the refcount of the request_queue kobject. When this reaches 0
* we'll have blk_release_queue() called.
* Decrements the refcount of the request_queue and free it when the refcount
* reaches 0.
*
* Context: Any context, but the last reference must not be dropped from
* atomic context.
* Context: Can sleep.
*/
void blk_put_queue(struct request_queue *q)
{
kobject_put(&q->kobj);
might_sleep();
if (refcount_dec_and_test(&q->refs))
blk_free_queue(q);
}
EXPORT_SYMBOL(blk_put_queue);
@ -373,26 +397,20 @@ static void blk_timeout_work(struct work_struct *work)
{
}
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
struct request_queue *blk_alloc_queue(int node_id)
{
struct request_queue *q;
q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
GFP_KERNEL | __GFP_ZERO, node_id);
q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
node_id);
if (!q)
return NULL;
if (alloc_srcu) {
blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
if (init_srcu_struct(q->srcu) != 0)
goto fail_q;
}
q->last_merge = NULL;
q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
goto fail_srcu;
goto fail_q;
q->stats = blk_alloc_queue_stats();
if (!q->stats)
@ -406,8 +424,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
kobject_init(&q->kobj, &blk_queue_ktype);
refcount_set(&q->refs, 1);
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
@ -434,11 +451,8 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
blk_free_queue_stats(q->stats);
fail_id:
ida_free(&blk_queue_ida, q->id);
fail_srcu:
if (alloc_srcu)
cleanup_srcu_struct(q->srcu);
fail_q:
kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
@ -454,7 +468,7 @@ bool blk_get_queue(struct request_queue *q)
{
if (unlikely(blk_queue_dying(q)))
return false;
kobject_get(&q->kobj);
refcount_inc(&q->refs);
return true;
}
EXPORT_SYMBOL(blk_get_queue);
@ -944,18 +958,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev,
}
EXPORT_SYMBOL(bdev_start_io_acct);
/**
* bio_start_io_acct_time - start I/O accounting for bio based drivers
* @bio: bio to start account for
* @start_time: start time that should be passed back to bio_end_io_acct().
*/
void bio_start_io_acct_time(struct bio *bio, unsigned long start_time)
{
bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
bio_op(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_start_io_acct_time);
/**
* bio_start_io_acct - start I/O accounting for bio based drivers
* @bio: bio to start account for
@ -1183,9 +1185,6 @@ int __init blk_dev_init(void)
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct bio, bi_opf));
BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
__alignof__(struct request_queue)) !=
sizeof(struct request_queue));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
@ -1196,10 +1195,6 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
sizeof(struct request_queue) +
sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
blk_debugfs_root = debugfs_create_dir("block", NULL);
return 0;

View File

@ -21,9 +21,9 @@ extern const struct blk_crypto_mode blk_crypto_modes[];
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
int blk_crypto_sysfs_register(struct request_queue *q);
int blk_crypto_sysfs_register(struct gendisk *disk);
void blk_crypto_sysfs_unregister(struct request_queue *q);
void blk_crypto_sysfs_unregister(struct gendisk *disk);
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@ -65,14 +65,28 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
return rq->crypt_ctx;
}
blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
struct blk_crypto_keyslot **slot_ptr);
void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key);
bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
const struct blk_crypto_config *cfg);
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
static inline int blk_crypto_sysfs_register(struct request_queue *q)
static inline int blk_crypto_sysfs_register(struct gendisk *disk)
{
return 0;
}
static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
static inline void blk_crypto_sysfs_unregister(struct gendisk *disk)
{
}
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)

View File

@ -32,6 +32,7 @@
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include "blk-crypto-internal.h"
struct blk_crypto_keyslot {
atomic_t slot_refs;

View File

@ -126,8 +126,9 @@ static struct kobj_type blk_crypto_ktype = {
* If the request_queue has a blk_crypto_profile, create the "crypto"
* subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
*/
int blk_crypto_sysfs_register(struct request_queue *q)
int blk_crypto_sysfs_register(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blk_crypto_kobj *obj;
int err;
@ -139,8 +140,8 @@ int blk_crypto_sysfs_register(struct request_queue *q)
return -ENOMEM;
obj->profile = q->crypto_profile;
err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
"crypto");
err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype,
&disk->queue_kobj, "crypto");
if (err) {
kobject_put(&obj->kobj);
return err;
@ -149,9 +150,9 @@ int blk_crypto_sysfs_register(struct request_queue *q)
return 0;
}
void blk_crypto_sysfs_unregister(struct request_queue *q)
void blk_crypto_sysfs_unregister(struct gendisk *disk)
{
kobject_put(q->crypto_kobject);
kobject_put(disk->queue->crypto_kobject);
}
static int __init blk_crypto_sysfs_init(void)

View File

@ -273,7 +273,6 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
struct blk_crypto_profile *profile;
/* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) {
@ -290,10 +289,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
if (blk_crypto_config_supported_natively(bio->bi_bdev,
&bc_key->crypto_cfg))
return true;
if (blk_crypto_fallback_bio_prep(bio_ptr))
return true;
fail:
@ -358,22 +356,29 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
return 0;
}
bool blk_crypto_config_supported_natively(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile,
cfg);
}
/*
* Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the
* request queue it's submitted to supports inline crypto, or the
* block_device it's submitted to supports inline crypto, or the
* blk-crypto-fallback is enabled and supports the cfg).
*/
bool blk_crypto_config_supported(struct request_queue *q,
bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
__blk_crypto_cfg_supported(q->crypto_profile, cfg);
blk_crypto_config_supported_natively(bdev, cfg);
}
/**
* blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
* @bdev: block device to operate on
* @key: A key to use on the device
* @q: the request queue for the device
*
* Upper layers must call this function to ensure that either the hardware
* supports the key's crypto settings, or the crypto API fallback has transforms
@ -385,10 +390,10 @@ bool blk_crypto_config_supported(struct request_queue *q,
* blk-crypto-fallback is either disabled or the needed algorithm
* is disabled in the crypto API; or another -errno code.
*/
int blk_crypto_start_using_key(const struct blk_crypto_key *key,
struct request_queue *q)
int blk_crypto_start_using_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@ -396,7 +401,7 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
/**
* blk_crypto_evict_key() - Evict a key from any inline encryption hardware
* it may have been programmed into
* @q: The request queue who's associated inline encryption hardware this key
* @bdev: The block_device who's associated inline encryption hardware this key
* might have been programmed into
* @key: The key to evict
*
@ -406,14 +411,16 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
*
* Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/
int blk_crypto_evict_key(struct request_queue *q,
int blk_crypto_evict_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
struct request_queue *q = bdev_get_queue(bdev);
if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return __blk_crypto_evict_key(q->crypto_profile, key);
/*
* If the request_queue didn't support the key, then blk-crypto-fallback
* If the block_device didn't support the key, then blk-crypto-fallback
* may have been used, so try to evict the key from blk-crypto-fallback.
*/
return blk_crypto_fallback_evict_key(key);

View File

@ -123,7 +123,8 @@ int disk_register_independent_access_ranges(struct gendisk *disk)
*/
WARN_ON(iars->sysfs_registered);
ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
&q->kobj, "%s", "independent_access_ranges");
&disk->queue_kobj, "%s",
"independent_access_ranges");
if (ret) {
disk->ia_ranges = NULL;
kobject_put(&iars->kobj);

View File

@ -111,7 +111,7 @@
* busy signal.
*
* As devices can have deep queues and be unfair in how the queued commands
* are executed, soley depending on rq wait may not result in satisfactory
* are executed, solely depending on rq wait may not result in satisfactory
* control quality. For a better control quality, completion latency QoS
* parameters can be configured so that the device is considered saturated
* if N'th percentile completion latency rises above the set point.
@ -556,7 +556,6 @@ struct ioc_now {
u64 now_ns;
u64 now;
u64 vnow;
u64 vrate;
};
struct iocg_wait {
@ -906,8 +905,10 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
if (idx == ioc->autop_idx && !force)
return false;
if (idx != ioc->autop_idx)
if (idx != ioc->autop_idx) {
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
ioc->vtime_base_rate = VTIME_PER_USEC;
}
ioc->autop_idx = idx;
ioc->autop_too_fast_at = 0;
@ -975,7 +976,7 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
if (ioc->busy_level != prev_busy_level || nr_lagging)
trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
trace_iocost_ioc_vrate_adj(ioc, vrate,
missed_ppm, rq_wait_pct,
nr_lagging, nr_shortages);
@ -1018,10 +1019,11 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
unsigned seq;
u64 vrate;
now->now_ns = ktime_get();
now->now = ktime_to_us(now->now_ns);
now->vrate = atomic64_read(&ioc->vtime_rate);
vrate = atomic64_read(&ioc->vtime_rate);
/*
* The current vtime is
@ -1034,7 +1036,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
do {
seq = read_seqcount_begin(&ioc->period_seqcount);
now->vnow = ioc->period_at_vtime +
(now->now - ioc->period_at) * now->vrate;
(now->now - ioc->period_at) * vrate;
} while (read_seqcount_retry(&ioc->period_seqcount, seq));
}
@ -2203,8 +2205,8 @@ static void ioc_timer_fn(struct timer_list *timer)
LIST_HEAD(surpluses);
int nr_debtors, nr_shortages = 0, nr_lagging = 0;
u64 usage_us_sum = 0;
u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
u32 ppm_rthr;
u32 ppm_wthr;
u32 missed_ppm[2], rq_wait_pct;
u64 period_vtime;
int prev_busy_level;
@ -2215,6 +2217,8 @@ static void ioc_timer_fn(struct timer_list *timer)
/* take care of active iocgs */
spin_lock_irq(&ioc->lock);
ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
ioc_now(ioc, &now);
period_vtime = now.vnow - ioc->period_at_vtime;
@ -2878,7 +2882,7 @@ static int blk_iocost_init(struct gendisk *disk)
spin_unlock_irq(&ioc->lock);
/*
* rqos must be added before activation to allow iocg_pd_init() to
* rqos must be added before activation to allow ioc_pd_init() to
* lookup the ioc from q. This means that the rqos methods may get
* called before policy activation completion, can't assume that the
* target bio has an iocg associated and need to test for NULL iocg.
@ -3187,11 +3191,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(disk->queue);
}
blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
spin_lock_irq(&ioc->lock);
memcpy(qos, ioc->params.qos, sizeof(qos));
enable = ioc->enabled;
user = ioc->user_qos_params;
spin_unlock_irq(&ioc->lock);
while ((p = strsep(&input, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
@ -3258,15 +3264,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (qos[QOS_MIN] > qos[QOS_MAX])
goto einval;
spin_lock_irq(&ioc->lock);
if (enable) {
blk_stat_enable_accounting(disk->queue);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = true;
wbt_disable_default(disk->queue);
} else {
blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = false;
wbt_enable_default(disk->queue);
}
if (user) {
@ -3279,9 +3285,17 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(disk->queue);
blk_mq_unfreeze_queue(disk->queue);
blkdev_put_no_open(bdev);
return nbytes;
einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(disk->queue);
blk_mq_unfreeze_queue(disk->queue);
ret = -EINVAL;
err:
blkdev_put_no_open(bdev);
@ -3336,6 +3350,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
struct block_device *bdev;
struct request_queue *q;
struct ioc *ioc;
u64 u[NR_I_LCOEFS];
bool user;
@ -3346,18 +3361,21 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
ioc = q_to_ioc(bdev_get_queue(bdev));
q = bdev_get_queue(bdev);
ioc = q_to_ioc(q);
if (!ioc) {
ret = blk_iocost_init(bdev->bd_disk);
if (ret)
goto err;
ioc = q_to_ioc(bdev_get_queue(bdev));
ioc = q_to_ioc(q);
}
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
spin_lock_irq(&ioc->lock);
memcpy(u, ioc->params.i_lcoefs, sizeof(u));
user = ioc->user_cost_model;
spin_unlock_irq(&ioc->lock);
while ((p = strsep(&input, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
@ -3394,7 +3412,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
user = true;
}
spin_lock_irq(&ioc->lock);
if (user) {
memcpy(ioc->params.i_lcoefs, u, sizeof(u));
ioc->user_cost_model = true;
@ -3404,10 +3421,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
blkdev_put_no_open(bdev);
return nbytes;
einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
ret = -EINVAL;
err:
blkdev_put_no_open(bdev);

View File

@ -141,7 +141,7 @@ struct iolatency_grp {
struct latency_stat __percpu *stats;
struct latency_stat cur_stat;
struct blk_iolatency *blkiolat;
struct rq_depth rq_depth;
unsigned int max_depth;
struct rq_wait rq_wait;
atomic64_t window_start;
atomic_t scale_cookie;
@ -280,7 +280,7 @@ static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
struct iolatency_grp *iolat = private_data;
return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
return rq_wait_inc_below(rqw, iolat->max_depth);
}
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
@ -364,15 +364,17 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
}
/*
* Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the
* Change the queue depth of the iolatency_grp. We add 1/16th of the
* queue depth at a time so we don't get wild swings and hopefully dial in to
* fairer distribution of the overall queue depth.
* fairer distribution of the overall queue depth. We halve the queue depth
* at a time so we can scale down queue depth quickly from default unlimited
* to target.
*/
static void scale_change(struct iolatency_grp *iolat, bool up)
{
unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
unsigned long scale = scale_amount(qd, up);
unsigned long old = iolat->rq_depth.max_depth;
unsigned long old = iolat->max_depth;
if (old > qd)
old = qd;
@ -384,12 +386,12 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
if (old < qd) {
old += scale;
old = min(old, qd);
iolat->rq_depth.max_depth = old;
iolat->max_depth = old;
wake_up_all(&iolat->rq_wait.wait);
}
} else {
old >>= 1;
iolat->rq_depth.max_depth = max(old, 1UL);
iolat->max_depth = max(old, 1UL);
}
}
@ -403,9 +405,6 @@ static void check_scale_change(struct iolatency_grp *iolat)
u64 scale_lat;
int direction = 0;
if (lat_to_blkg(iolat)->parent == NULL)
return;
parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
if (!parent)
return;
@ -445,7 +444,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
}
/* We're as low as we can go. */
if (iolat->rq_depth.max_depth == 1 && direction < 0) {
if (iolat->max_depth == 1 && direction < 0) {
blkcg_use_delay(lat_to_blkg(iolat));
return;
}
@ -453,7 +452,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
/* We're back to the default cookie, unthrottle all the things. */
if (cur_cookie == DEFAULT_SCALE_COOKIE) {
blkcg_clear_delay(lat_to_blkg(iolat));
iolat->rq_depth.max_depth = UINT_MAX;
iolat->max_depth = UINT_MAX;
wake_up_all(&iolat->rq_wait.wait);
return;
}
@ -508,7 +507,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
* We don't want to count issue_as_root bio's in the cgroups latency
* statistics as it could skew the numbers downwards.
*/
if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) {
u64 sub = iolat->min_lat_nsec;
if (req_time < sub)
blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
@ -920,7 +919,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
}
preempt_enable();
if (iolat->rq_depth.max_depth == UINT_MAX)
if (iolat->max_depth == UINT_MAX)
seq_printf(s, " missed=%llu total=%llu depth=max",
(unsigned long long)stat.ps.missed,
(unsigned long long)stat.ps.total);
@ -928,7 +927,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
seq_printf(s, " missed=%llu total=%llu depth=%u",
(unsigned long long)stat.ps.missed,
(unsigned long long)stat.ps.total,
iolat->rq_depth.max_depth);
iolat->max_depth);
}
static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
@ -945,12 +944,12 @@ static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
if (iolat->rq_depth.max_depth == UINT_MAX)
if (iolat->max_depth == UINT_MAX)
seq_printf(s, " depth=max avg_lat=%llu win=%llu",
avg_lat, cur_win);
else
seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
iolat->rq_depth.max_depth, avg_lat, cur_win);
iolat->max_depth, avg_lat, cur_win);
}
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
@ -994,9 +993,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
latency_stat_init(iolat, &iolat->cur_stat);
rq_wait_init(&iolat->rq_wait);
spin_lock_init(&iolat->child_lat.lock);
iolat->rq_depth.queue_depth = blkg->q->nr_requests;
iolat->rq_depth.max_depth = UINT_MAX;
iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
iolat->max_depth = UINT_MAX;
iolat->blkiolat = blkiolat;
iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
atomic64_set(&iolat->window_start, now);

View File

@ -267,6 +267,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
{
unsigned int max_sectors = queue_max_hw_sectors(rq->q);
unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
unsigned int gup_flags = 0;
struct bio *bio;
int ret;
int j;
@ -278,6 +279,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (bio == NULL)
return -ENOMEM;
if (blk_queue_pci_p2pdma(rq->q))
gup_flags |= FOLL_PCI_P2PDMA;
while (iov_iter_count(iter)) {
struct page **pages, *stack_pages[UIO_FASTIOV];
ssize_t bytes;
@ -286,11 +290,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (nr_vecs <= ARRAY_SIZE(stack_pages)) {
pages = stack_pages;
bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
nr_vecs, &offs);
bytes = iov_iter_get_pages(iter, pages, LONG_MAX,
nr_vecs, &offs, gup_flags);
} else {
bytes = iov_iter_get_pages_alloc2(iter, &pages,
LONG_MAX, &offs);
bytes = iov_iter_get_pages_alloc(iter, &pages,
LONG_MAX, &offs, gup_flags);
}
if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT;
@ -555,7 +559,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
size_t nr_iter = iov_iter_count(iter);
size_t nr_segs = iter->nr_segs;
struct bio_vec *bvecs, *bvprvp = NULL;
struct queue_limits *lim = &q->limits;
const struct queue_limits *lim = &q->limits;
unsigned int nsegs = 0, bytes = 0;
struct bio *bio;
size_t i;

View File

@ -100,13 +100,14 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
* is defined as 'unsigned int', meantime it has to be aligned to with the
* logical block size, which is the minimum accepted unit by hardware.
*/
static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
{
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
static struct bio *bio_split_discard(struct bio *bio,
const struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
{
unsigned int max_discard_sectors, granularity;
sector_t tmp;
@ -146,7 +147,8 @@ static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
}
static struct bio *bio_split_write_zeroes(struct bio *bio,
struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
const struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
{
*nsegs = 0;
if (!lim->max_write_zeroes_sectors)
@ -165,7 +167,7 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
* aligned to a physical block boundary.
*/
static inline unsigned get_max_io_size(struct bio *bio,
struct queue_limits *lim)
const struct queue_limits *lim)
{
unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
@ -184,7 +186,15 @@ static inline unsigned get_max_io_size(struct bio *bio,
return max_sectors & ~(lbs - 1);
}
static inline unsigned get_max_segment_size(struct queue_limits *lim,
/**
* get_max_segment_size() - maximum number of bytes to add as a single segment
* @lim: Request queue limits.
* @start_page: See below.
* @offset: Offset from @start_page where to add a segment.
*
* Returns the maximum number of bytes that can be added as a single segment.
*/
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
struct page *start_page, unsigned long offset)
{
unsigned long mask = lim->seg_boundary_mask;
@ -192,11 +202,10 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim,
offset = mask & (page_to_phys(start_page) + offset);
/*
* overflow may be triggered in case of zero page physical address
* on 32bit arch, use queue's max segment size when that happens.
* Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
* after having calculated the minimum.
*/
return min_not_zero(mask - offset + 1,
(unsigned long)lim->max_segment_size);
return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1;
}
/**
@ -219,9 +228,9 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim,
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
unsigned *nsegs, unsigned *bytes, unsigned max_segs,
unsigned max_bytes)
static bool bvec_split_segs(const struct queue_limits *lim,
const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
unsigned max_segs, unsigned max_bytes)
{
unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
@ -267,7 +276,7 @@ static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
@ -331,8 +340,9 @@ static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
* The split bio is allocated from @q->bio_split, which is provided by the
* block layer.
*/
struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
unsigned int *nr_segs)
struct bio *__bio_split_to_limits(struct bio *bio,
const struct queue_limits *lim,
unsigned int *nr_segs)
{
struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
struct bio *split;
@ -377,7 +387,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
*/
struct bio *bio_split_to_limits(struct bio *bio)
{
struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
if (bio_may_exceed_limits(bio, lim))

View File

@ -555,6 +555,7 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
return 0;
}
/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
unsigned int flags = q->tag_set->flags;
@ -563,13 +564,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
unsigned long i;
int ret;
if (!e) {
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = NULL;
q->nr_requests = q->tag_set->queue_depth;
return 0;
}
/*
* Default to double of smaller one between hw queue_depth and 128,
* since we don't split into sync/async like the old code did.

View File

@ -185,7 +185,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct blk_mq_ctx *ctx;
int i, ret;
int i, j, ret;
if (!hctx->nr_ctx)
return 0;
@ -197,9 +197,16 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
hctx_for_each_ctx(hctx, ctx, i) {
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
break;
goto out;
}
return 0;
out:
hctx_for_each_ctx(hctx, ctx, j) {
if (j < i)
kobject_del(&ctx->kobj);
}
kobject_del(&hctx->kobj);
return ret;
}

View File

@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
* blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
* @q: request queue.
* @set: tag_set to wait on
*
* Note: it is driver's responsibility for making sure that quiesce has
* been started.
* been started on or more of the request_queues of the tag_set. This
* function only waits for the quiesce on those request_queues that had
* the quiesce flag set using blk_mq_quiesce_queue_nowait.
*/
void blk_mq_wait_quiesce_done(struct request_queue *q)
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
if (blk_queue_has_srcu(q))
synchronize_srcu(q->srcu);
if (set->flags & BLK_MQ_F_BLOCKING)
synchronize_srcu(set->srcu);
else
synchronize_rcu();
}
@ -280,7 +282,9 @@ EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
void blk_mq_quiesce_queue(struct request_queue *q)
{
blk_mq_quiesce_queue_nowait(q);
blk_mq_wait_quiesce_done(q);
/* nothing to wait for non-mq queues */
if (queue_is_mq(q))
blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
@ -311,6 +315,33 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
{
struct request_queue *q;
mutex_lock(&set->tag_list_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
if (!blk_queue_skip_tagset_quiesce(q))
blk_mq_quiesce_queue_nowait(q);
}
blk_mq_wait_quiesce_done(set);
mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
{
struct request_queue *q;
mutex_lock(&set->tag_list_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
if (!blk_queue_skip_tagset_quiesce(q))
blk_mq_unquiesce_queue(q);
}
mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
@ -544,25 +575,26 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
if (rq_list_empty(plug->cached_rq)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
if (rq)
goto got_it;
return NULL;
if (!rq)
return NULL;
} else {
rq = rq_list_peek(&plug->cached_rq);
if (!rq || rq->q != q)
return NULL;
if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
return NULL;
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
plug->cached_rq = rq_list_next(rq);
}
rq = rq_list_peek(&plug->cached_rq);
if (!rq || rq->q != q)
return NULL;
if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
return NULL;
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
plug->cached_rq = rq_list_next(rq);
got_it:
rq->cmd_flags = opf;
INIT_LIST_HEAD(&rq->queuelist);
return rq;
@ -1529,7 +1561,13 @@ static void blk_mq_rq_timed_out(struct request *req)
blk_add_timer(req);
}
static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
struct blk_expired_data {
bool has_timedout_rq;
unsigned long next;
unsigned long timeout_start;
};
static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
{
unsigned long deadline;
@ -1539,13 +1577,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
return false;
deadline = READ_ONCE(rq->deadline);
if (time_after_eq(jiffies, deadline))
if (time_after_eq(expired->timeout_start, deadline))
return true;
if (*next == 0)
*next = deadline;
else if (time_after(*next, deadline))
*next = deadline;
if (expired->next == 0)
expired->next = deadline;
else if (time_after(expired->next, deadline))
expired->next = deadline;
return false;
}
@ -1561,7 +1599,7 @@ void blk_mq_put_rq_ref(struct request *rq)
static bool blk_mq_check_expired(struct request *rq, void *priv)
{
unsigned long *next = priv;
struct blk_expired_data *expired = priv;
/*
* blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
@ -1570,7 +1608,18 @@ static bool blk_mq_check_expired(struct request *rq, void *priv)
* it was completed and reallocated as a new request after returning
* from blk_mq_check_expired().
*/
if (blk_mq_req_expired(rq, next))
if (blk_mq_req_expired(rq, expired)) {
expired->has_timedout_rq = true;
return false;
}
return true;
}
static bool blk_mq_handle_expired(struct request *rq, void *priv)
{
struct blk_expired_data *expired = priv;
if (blk_mq_req_expired(rq, expired))
blk_mq_rq_timed_out(rq);
return true;
}
@ -1579,7 +1628,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, timeout_work);
unsigned long next = 0;
struct blk_expired_data expired = {
.timeout_start = jiffies,
};
struct blk_mq_hw_ctx *hctx;
unsigned long i;
@ -1599,10 +1650,23 @@ static void blk_mq_timeout_work(struct work_struct *work)
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
/* check if there is any timed-out request */
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
if (expired.has_timedout_rq) {
/*
* Before walking tags, we must ensure any submit started
* before the current time has finished. Since the submit
* uses srcu or rcu, wait for a synchronization point to
* ensure all running submits have finished
*/
blk_mq_wait_quiesce_done(q->tag_set);
if (next != 0) {
mod_timer(&q->timeout, next);
expired.next = 0;
blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
}
if (expired.next != 0) {
mod_timer(&q->timeout, expired.next);
} else {
/*
* Request timeouts are handled as a forward rolling timer. If
@ -3248,21 +3312,22 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
if (!tags->rqs) {
blk_mq_free_tags(tags);
return NULL;
}
if (!tags->rqs)
goto err_free_tags;
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
if (!tags->static_rqs) {
kfree(tags->rqs);
blk_mq_free_tags(tags);
return NULL;
}
if (!tags->static_rqs)
goto err_free_rqs;
return tags;
err_free_rqs:
kfree(tags->rqs);
err_free_tags:
blk_mq_free_tags(tags);
return NULL;
}
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
@ -3975,7 +4040,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
@ -4011,14 +4076,11 @@ void blk_mq_destroy_queue(struct request_queue *q)
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
blk_queue_start_drain(q);
blk_freeze_queue(q);
blk_mq_freeze_queue_wait(q);
blk_sync_queue(q);
blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
@ -4035,6 +4097,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
blk_mq_destroy_queue(q);
blk_put_queue(q);
return ERR_PTR(-ENOMEM);
}
set_bit(GD_OWNS_QUEUE, &disk->state);
@ -4147,9 +4210,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q)
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
WARN_ON_ONCE(blk_queue_has_srcu(q) !=
!!(set->flags & BLK_MQ_F_BLOCKING));
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@ -4325,12 +4385,12 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
}
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
int cur_nr_hw_queues, int new_nr_hw_queues)
int new_nr_hw_queues)
{
struct blk_mq_tags **new_tags;
if (cur_nr_hw_queues >= new_nr_hw_queues)
return 0;
if (set->nr_hw_queues >= new_nr_hw_queues)
goto done;
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
@ -4338,21 +4398,15 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
return -ENOMEM;
if (set->tags)
memcpy(new_tags, set->tags, cur_nr_hw_queues *
memcpy(new_tags, set->tags, set->nr_hw_queues *
sizeof(*set->tags));
kfree(set->tags);
set->tags = new_tags;
done:
set->nr_hw_queues = new_nr_hw_queues;
return 0;
}
static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
int new_nr_hw_queues)
{
return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
}
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@ -4406,10 +4460,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
return -ENOMEM;
if (set->flags & BLK_MQ_F_BLOCKING) {
set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
if (!set->srcu)
return -ENOMEM;
ret = init_srcu_struct(set->srcu);
if (ret)
goto out_free_srcu;
}
ret = -ENOMEM;
set->tags = kcalloc_node(set->nr_hw_queues,
sizeof(struct blk_mq_tags *), GFP_KERNEL,
set->numa_node);
if (!set->tags)
goto out_cleanup_srcu;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
sizeof(set->map[i].mq_map[0]),
@ -4437,6 +4503,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
}
kfree(set->tags);
set->tags = NULL;
out_cleanup_srcu:
if (set->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(set->srcu);
out_free_srcu:
if (set->flags & BLK_MQ_F_BLOCKING)
kfree(set->srcu);
return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
@ -4476,6 +4548,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
if (set->flags & BLK_MQ_F_BLOCKING) {
cleanup_srcu_struct(set->srcu);
kfree(set->srcu);
}
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
@ -4564,17 +4640,10 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
/* keep a reference to the elevator module as we'll switch back */
__elevator_get(qe->type);
list_add(&qe->node, head);
/*
* After elevator_switch, the previous elevator_queue will be
* released by elevator_release. The reference of the io scheduler
* module get by elevator_get will also be put. So we need to get
* a reference of the io scheduler module here to prevent it to be
* removed.
*/
__module_get(qe->type->elevator_owner);
elevator_switch(q, NULL);
elevator_disable(q);
mutex_unlock(&q->sysfs_lock);
return true;
@ -4607,6 +4676,8 @@ static void blk_mq_elv_switch_back(struct list_head *head,
mutex_lock(&q->sysfs_lock);
elevator_switch(q, t);
/* drop the reference acquired in blk_mq_elv_switch_none */
elevator_put(t);
mutex_unlock(&q->sysfs_lock);
}
@ -4643,11 +4714,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
}
prev_nr_hw_queues = set->nr_hw_queues;
if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
0)
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
goto reregister;
set->nr_hw_queues = nr_hw_queues;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
@ -4867,15 +4936,13 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
void blk_mq_cancel_work_sync(struct request_queue *q)
{
if (queue_is_mq(q)) {
struct blk_mq_hw_ctx *hctx;
unsigned long i;
struct blk_mq_hw_ctx *hctx;
unsigned long i;
cancel_delayed_work_sync(&q->requeue_work);
cancel_delayed_work_sync(&q->requeue_work);
queue_for_each_hw_ctx(q, hctx, i)
cancel_delayed_work_sync(&hctx->run_work);
}
queue_for_each_hw_ctx(q, hctx, i)
cancel_delayed_work_sync(&hctx->run_work);
}
static int __init blk_mq_init(void)

View File

@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
if (!blk_queue_has_srcu(q)) { \
rcu_read_lock(); \
(dispatch_ops); \
rcu_read_unlock(); \
} else { \
if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \
int srcu_idx; \
\
might_sleep_if(check_sleep); \
srcu_idx = srcu_read_lock((q)->srcu); \
srcu_idx = srcu_read_lock((q)->tag_set->srcu); \
(dispatch_ops); \
srcu_read_unlock((q)->srcu, srcu_idx); \
srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \
} else { \
rcu_read_lock(); \
(dispatch_ops); \
rcu_read_unlock(); \
} \
} while (0)

View File

@ -481,7 +481,7 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
}
EXPORT_SYMBOL(blk_queue_io_opt);
static int queue_limit_alignment_offset(struct queue_limits *lim,
static int queue_limit_alignment_offset(const struct queue_limits *lim,
sector_t sector)
{
unsigned int granularity = max(lim->physical_block_size, lim->io_min);
@ -491,8 +491,8 @@ static int queue_limit_alignment_offset(struct queue_limits *lim,
return (granularity + lim->alignment_offset - alignment) % granularity;
}
static unsigned int queue_limit_discard_alignment(struct queue_limits *lim,
sector_t sector)
static unsigned int queue_limit_discard_alignment(
const struct queue_limits *lim, sector_t sector)
{
unsigned int alignment, granularity, offset;

View File

@ -470,6 +470,9 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
if (!wbt_rq_qos(q))
return -EINVAL;
if (wbt_disabled(q))
return sprintf(page, "0\n");
return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
}
@ -680,8 +683,8 @@ static struct attribute *queue_attrs[] = {
static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
int n)
{
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
if (attr == &queue_io_timeout_entry.attr &&
(!q->mq_ops || !q->mq_ops->timeout))
@ -707,8 +710,8 @@ static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->show)
@ -724,68 +727,19 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct request_queue *q;
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->store)
return -EIO;
q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head);
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}
/**
* blk_release_queue - releases all allocated resources of the request_queue
* @kobj: pointer to a kobject, whose container is a request_queue
*
* This function releases all allocated resources of the request queue.
*
* The struct request_queue refcount is incremented with blk_get_queue() and
* decremented with blk_put_queue(). Once the refcount reaches 0 this function
* is called.
*
* Drivers exist which depend on the release of the request_queue to be
* synchronous, it should not be deferred.
*
* Context: can sleep
*/
static void blk_release_queue(struct kobject *kobj)
{
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
might_sleep();
percpu_ref_exit(&q->q_usage_counter);
if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
if (queue_is_mq(q))
blk_mq_release(q);
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);
ida_free(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
static const struct sysfs_ops queue_sysfs_ops = {
.show = queue_attr_show,
.store = queue_attr_store,
@ -796,12 +750,30 @@ static const struct attribute_group *blk_queue_attr_groups[] = {
NULL
};
struct kobj_type blk_queue_ktype = {
static void blk_queue_release(struct kobject *kobj)
{
/* nothing to do here, all data is associated with the parent gendisk */
}
static struct kobj_type blk_queue_ktype = {
.default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
.release = blk_release_queue,
.release = blk_queue_release,
};
static void blk_debugfs_remove(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
mutex_lock(&q->debugfs_mutex);
blk_trace_shutdown(q);
debugfs_remove_recursive(q->debugfs_dir);
q->debugfs_dir = NULL;
q->sched_debugfs_dir = NULL;
q->rqos_debugfs_dir = NULL;
mutex_unlock(&q->debugfs_mutex);
}
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
@ -812,47 +784,47 @@ int blk_register_queue(struct gendisk *disk)
int ret;
mutex_lock(&q->sysfs_dir_lock);
ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue");
kobject_init(&disk->queue_kobj, &blk_queue_ktype);
ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
goto unlock;
goto out_put_queue_kobj;
if (queue_is_mq(q))
blk_mq_sysfs_register(disk);
if (queue_is_mq(q)) {
ret = blk_mq_sysfs_register(disk);
if (ret)
goto out_put_queue_kobj;
}
mutex_lock(&q->sysfs_lock);
mutex_lock(&q->debugfs_mutex);
q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
blk_debugfs_root);
q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
if (queue_is_mq(q))
blk_mq_debugfs_register(q);
mutex_unlock(&q->debugfs_mutex);
ret = disk_register_independent_access_ranges(disk);
if (ret)
goto put_dev;
goto out_debugfs_remove;
if (q->elevator) {
ret = elv_register_queue(q, false);
if (ret)
goto put_dev;
goto out_unregister_ia_ranges;
}
ret = blk_crypto_sysfs_register(q);
ret = blk_crypto_sysfs_register(disk);
if (ret)
goto put_dev;
goto out_elv_unregister;
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&q->kobj, KOBJ_ADD);
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
if (q->elevator)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock);
unlock:
mutex_unlock(&q->sysfs_dir_lock);
/*
@ -871,13 +843,16 @@ int blk_register_queue(struct gendisk *disk)
return ret;
put_dev:
out_elv_unregister:
elv_unregister_queue(q);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
blk_debugfs_remove(disk);
mutex_unlock(&q->sysfs_lock);
out_put_queue_kobj:
kobject_put(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
return ret;
}
@ -915,7 +890,7 @@ void blk_unregister_queue(struct gendisk *disk)
*/
if (queue_is_mq(q))
blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(q);
blk_crypto_sysfs_unregister(disk);
mutex_lock(&q->sysfs_lock);
elv_unregister_queue(q);
@ -923,15 +898,9 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_unlock(&q->sysfs_lock);
/* Now that we've deleted all child objects, we can delete the queue. */
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
kobject_del(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
mutex_lock(&q->debugfs_mutex);
blk_trace_shutdown(q);
debugfs_remove_recursive(q->debugfs_dir);
q->debugfs_dir = NULL;
q->sched_debugfs_dir = NULL;
q->rqos_debugfs_dir = NULL;
mutex_unlock(&q->debugfs_mutex);
blk_debugfs_remove(disk);
}

View File

@ -129,7 +129,7 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
/*
* cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
* make the IO dispatch more smooth.
* Scale up: linearly scale up according to lapsed time since upgrade. For
* Scale up: linearly scale up according to elapsed time since upgrade. For
* every throtl_slice, the limit scales up 1/2 .low limit till the
* limit hits .max limit
* Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
@ -395,8 +395,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
* If on the default hierarchy, we switch to properly hierarchical
* behavior where limits on a given throtl_grp are applied to the
* whole subtree rather than just the group itself. e.g. If 16M
* read_bps limit is set on the root group, the whole system can't
* exceed 16M for the device.
* read_bps limit is set on a parent group, summary bps of
* parent group and its subtree groups can't exceed 16M for the
* device.
*
* If not on the default hierarchy, the broken flat hierarchy
* behavior is retained where all throtl_grps are treated as if
@ -644,7 +645,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
* that bandwidth. Do try to make use of that bandwidth while giving
* credit.
*/
if (time_after_eq(start, tg->slice_start[rw]))
if (time_after(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
@ -821,17 +822,15 @@ static void tg_update_carryover(struct throtl_grp *tg)
tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
}
static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
u32 iops_limit, unsigned long *wait)
static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
u32 iops_limit)
{
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
if (iops_limit == UINT_MAX) {
if (wait)
*wait = 0;
return true;
return 0;
}
jiffy_elapsed = jiffies - tg->slice_start[rw];
@ -841,21 +840,16 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
tg->carryover_ios[rw];
if (tg->io_disp[rw] + 1 <= io_allowed) {
if (wait)
*wait = 0;
return true;
return 0;
}
/* Calc approx time to dispatch */
jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
if (wait)
*wait = jiffy_wait;
return false;
return jiffy_wait;
}
static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
u64 bps_limit, unsigned long *wait)
static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
u64 bps_limit)
{
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes;
@ -864,9 +858,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* no need to throttle if this bio's bytes have been accounted */
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
if (wait)
*wait = 0;
return true;
return 0;
}
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
@ -879,9 +871,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
tg->carryover_bytes[rw];
if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
if (wait)
*wait = 0;
return true;
return 0;
}
/* Calc approx time to dispatch */
@ -896,9 +886,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
* up we did. Add that time also.
*/
jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
if (wait)
*wait = jiffy_wait;
return false;
return jiffy_wait;
}
/*
@ -946,8 +934,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
if (bps_wait + iops_wait == 0) {
if (wait)
*wait = 0;
return true;
@ -1066,7 +1055,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio);
bio_set_flag(bio, BIO_BPS_THROTTLED);
/*
* If our parent is another tg, we just need to transfer @bio to
@ -1079,6 +1067,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
start_parent_slice_with_credit(tg, parent_tg, rw);
} else {
bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
&parent_sq->queued[rw]);
BUG_ON(tg->td->nr_queued[rw] <= 0);
@ -1737,7 +1726,18 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
* Set the flag to make sure throtl_pending_timer_fn() won't
* stop until all throttled bios are dispatched.
*/
blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
tg->flags |= THROTL_TG_CANCELING;
/*
* Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
* will be inserted to service queue without THROTL_TG_PENDING
* set in tg_update_disptime below. Then IO dispatched from
* child in tg_dispatch_one_bio will trigger double insertion
* and corrupt the tree.
*/
if (!(tg->flags & THROTL_TG_PENDING))
continue;
/*
* Update disptime after setting the above flag to make sure
* throtl_select_dispatch() won't exit without dispatching.
@ -1762,7 +1762,6 @@ static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
return min(rtime, wtime);
}
/* tg should not be an intermediate node */
static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
{
struct throtl_service_queue *parent_sq;
@ -1816,24 +1815,29 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
return ret;
}
static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
{
struct throtl_service_queue *sq = &tg->service_queue;
bool read_limit, write_limit;
bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];
/*
* if cgroup reaches low limit (if low limit is 0, the cgroup always
* reaches), it's ok to upgrade to next limit
* if low limit is zero, low limit is always reached.
* if low limit is non-zero, we can check if there is any request
* is queued to determine if low limit is reached as we throttle
* request according to limit.
*/
read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
if (!read_limit && !write_limit)
return true;
if (read_limit && sq->nr_queued[READ] &&
(!write_limit || sq->nr_queued[WRITE]))
return true;
if (write_limit && sq->nr_queued[WRITE] &&
(!read_limit || sq->nr_queued[READ]))
return !limit || sq->nr_queued[rw];
}
static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
{
/*
* cgroup reaches low limit when low limit of READ and WRITE are
* both reached, it's ok to upgrade to next limit if cgroup reaches
* low limit
*/
if (throtl_low_limit_reached(tg, READ) &&
throtl_low_limit_reached(tg, WRITE))
return true;
if (time_after_eq(jiffies,
@ -1951,8 +1955,7 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
* If cgroup is below low limit, consider downgrade and throttle other
* cgroups
*/
if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
time_after_eq(now, tg_last_low_overflow_time(tg) +
if (time_after_eq(now, tg_last_low_overflow_time(tg) +
td->throtl_slice) &&
(!throtl_tg_is_idle(tg) ||
!list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
@ -1962,6 +1965,11 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
{
struct throtl_data *td = tg->td;
if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
return false;
while (true) {
if (!throtl_tg_can_downgrade(tg))
return false;

View File

@ -27,6 +27,7 @@
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@ -422,6 +423,14 @@ static void wbt_update_limits(struct rq_wb *rwb)
rwb_wake_all(rwb);
}
bool wbt_disabled(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT ||
RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL;
}
u64 wbt_get_min_lat(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@ -435,8 +444,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val)
struct rq_qos *rqos = wbt_rq_qos(q);
if (!rqos)
return;
RQWB(rqos)->min_lat_nsec = val;
RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
if (val)
RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
else
RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;
wbt_update_limits(RQWB(rqos));
}
@ -638,11 +652,15 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
*/
void wbt_enable_default(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
struct rq_qos *rqos;
bool disable_flag = q->elevator &&
test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags);
/* Throttling already enabled? */
rqos = wbt_rq_qos(q);
if (rqos) {
if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
if (!disable_flag &&
RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
return;
}
@ -651,7 +669,7 @@ void wbt_enable_default(struct request_queue *q)
if (!blk_queue_registered(q))
return;
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
if (queue_is_mq(q) && !disable_flag)
wbt_init(q);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

View File

@ -28,13 +28,15 @@ enum {
};
/*
* Enable states. Either off, or on by default (done at init time),
* or on through manual setup in sysfs.
* If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other
* state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered
* to WBT_STATE_OFF/ON_MANUAL.
*/
enum {
WBT_STATE_ON_DEFAULT = 1,
WBT_STATE_ON_MANUAL = 2,
WBT_STATE_OFF_DEFAULT
WBT_STATE_ON_DEFAULT = 1, /* on by default */
WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */
WBT_STATE_OFF_DEFAULT = 3, /* off by default */
WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */
};
struct rq_wb {
@ -94,6 +96,7 @@ void wbt_enable_default(struct request_queue *);
u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
bool wbt_disabled(struct request_queue *);
void wbt_set_write_cache(struct request_queue *, bool);
@ -125,6 +128,10 @@ static inline u64 wbt_default_latency_nsec(struct request_queue *q)
{
return 0;
}
static inline bool wbt_disabled(struct request_queue *q)
{
return true;
}
#endif /* CONFIG_BLK_WBT */

View File

@ -26,11 +26,6 @@ struct blk_flush_queue {
spinlock_t mq_flush_lock;
};
extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
@ -104,7 +99,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
return true;
}
static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
static inline bool __bvec_gap_to_prev(const struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
return (offset & lim->virt_boundary_mask) ||
@ -115,7 +110,7 @@ static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
*/
static inline bool bvec_gap_to_prev(struct queue_limits *lim,
static inline bool bvec_gap_to_prev(const struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
if (!lim->virt_boundary_mask)
@ -278,6 +273,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
void blk_insert_flush(struct request *rq);
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
void elevator_disable(struct request_queue *q);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
@ -297,7 +293,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
static inline bool bio_may_exceed_limits(struct bio *bio,
struct queue_limits *lim)
const struct queue_limits *lim)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
@ -320,8 +316,9 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
unsigned int *nr_segs);
struct bio *__bio_split_to_limits(struct bio *bio,
const struct queue_limits *lim,
unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@ -428,15 +425,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
{
if (srcu)
return blk_requestq_srcu_cachep;
return blk_requestq_cachep;
}
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
struct request_queue *blk_alloc_queue(int node_id);
int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner);
int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);

View File

@ -325,6 +325,7 @@ void bsg_remove_queue(struct request_queue *q)
bsg_unregister_queue(bset->bd);
blk_mq_destroy_queue(q);
blk_put_queue(q);
blk_mq_free_tag_set(&bset->tag_set);
kfree(bset);
}
@ -400,6 +401,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
return q;
out_cleanup_queue:
blk_mq_destroy_queue(q);
blk_put_queue(q);
out_queue:
blk_mq_free_tag_set(set);
out_tag_set:

View File

@ -175,8 +175,10 @@ static void bsg_device_release(struct device *dev)
void bsg_unregister_queue(struct bsg_device *bd)
{
if (bd->queue->kobj.sd)
sysfs_remove_link(&bd->queue->kobj, "bsg");
struct gendisk *disk = bd->queue->disk;
if (disk && disk->queue_kobj.sd)
sysfs_remove_link(&disk->queue_kobj, "bsg");
cdev_device_del(&bd->cdev, &bd->device);
put_device(&bd->device);
}
@ -216,8 +218,9 @@ struct bsg_device *bsg_register_queue(struct request_queue *q,
if (ret)
goto out_put_device;
if (q->kobj.sd) {
ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg");
if (q->disk && q->disk->queue_kobj.sd) {
ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj,
"bsg");
if (ret)
goto out_device_del;
}

View File

@ -57,7 +57,7 @@ static LIST_HEAD(elv_list);
* Query io scheduler to see if the current process issuing bio may be
* merged with rq.
*/
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
@ -65,7 +65,7 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
return 1;
return true;
}
/*
@ -83,78 +83,45 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
static inline bool elv_support_features(unsigned int elv_features,
unsigned int required_features)
static inline bool elv_support_features(struct request_queue *q,
const struct elevator_type *e)
{
return (required_features & elv_features) == required_features;
return (q->required_elevator_features & e->elevator_features) ==
q->required_elevator_features;
}
/**
* elevator_match - Test an elevator name and features
* elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
* @name: Elevator name to test
* @required_features: Features that the elevator must provide
*
* Return true if the elevator @e name matches @name and if @e provides all
* the features specified by @required_features.
* Return true if the elevator @e's name or alias matches @name.
*/
static bool elevator_match(const struct elevator_type *e, const char *name,
unsigned int required_features)
static bool elevator_match(const struct elevator_type *e, const char *name)
{
if (!elv_support_features(e->elevator_features, required_features))
return false;
if (!strcmp(e->elevator_name, name))
return true;
if (e->elevator_alias && !strcmp(e->elevator_alias, name))
return true;
return false;
return !strcmp(e->elevator_name, name) ||
(e->elevator_alias && !strcmp(e->elevator_alias, name));
}
/**
* elevator_find - Find an elevator
* @name: Name of the elevator to find
* @required_features: Features that the elevator must provide
*
* Return the first registered scheduler with name @name and supporting the
* features @required_features and NULL otherwise.
*/
static struct elevator_type *elevator_find(const char *name,
unsigned int required_features)
static struct elevator_type *__elevator_find(const char *name)
{
struct elevator_type *e;
list_for_each_entry(e, &elv_list, list) {
if (elevator_match(e, name, required_features))
list_for_each_entry(e, &elv_list, list)
if (elevator_match(e, name))
return e;
}
return NULL;
}
static void elevator_put(struct elevator_type *e)
{
module_put(e->elevator_owner);
}
static struct elevator_type *elevator_get(struct request_queue *q,
const char *name, bool try_loading)
static struct elevator_type *elevator_find_get(struct request_queue *q,
const char *name)
{
struct elevator_type *e;
spin_lock(&elv_list_lock);
e = elevator_find(name, q->required_elevator_features);
if (!e && try_loading) {
spin_unlock(&elv_list_lock);
request_module("%s-iosched", name);
spin_lock(&elv_list_lock);
e = elevator_find(name, q->required_elevator_features);
}
if (e && !try_module_get(e->elevator_owner))
e = __elevator_find(name);
if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
e = NULL;
spin_unlock(&elv_list_lock);
return e;
}
@ -170,6 +137,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
if (unlikely(!eq))
return NULL;
__elevator_get(e);
eq->type = e;
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
@ -499,7 +467,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
lockdep_assert_held(&q->sysfs_lock);
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
if (attr) {
@ -512,7 +480,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
e->registered = 1;
set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
}
return error;
}
@ -523,13 +491,9 @@ void elv_unregister_queue(struct request_queue *q)
lockdep_assert_held(&q->sysfs_lock);
if (e && e->registered) {
struct elevator_queue *e = q->elevator;
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
e->registered = 0;
}
}
@ -555,7 +519,7 @@ int elv_register(struct elevator_type *e)
/* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
if (elevator_find(e->elevator_name, 0)) {
if (__elevator_find(e->elevator_name)) {
spin_unlock(&elv_list_lock);
kmem_cache_destroy(e->icq_cache);
return -EBUSY;
@ -588,39 +552,6 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
static int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e)
{
int ret;
lockdep_assert_held(&q->sysfs_lock);
if (q->elevator) {
elv_unregister_queue(q);
elevator_exit(q);
}
ret = blk_mq_init_sched(q, new_e);
if (ret)
goto out;
if (new_e) {
ret = elv_register_queue(q, true);
if (ret) {
elevator_exit(q);
goto out;
}
}
if (new_e)
blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
else
blk_add_trace_msg(q, "elv switch: none");
out:
return ret;
}
static inline bool elv_support_iosched(struct request_queue *q)
{
if (!queue_is_mq(q) ||
@ -642,7 +573,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
!blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
return elevator_get(q, "mq-deadline", false);
return elevator_find_get(q, "mq-deadline");
}
/*
@ -656,14 +587,13 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q)
spin_lock(&elv_list_lock);
list_for_each_entry(e, &elv_list, list) {
if (elv_support_features(e->elevator_features,
q->required_elevator_features)) {
if (elv_support_features(q, e)) {
found = e;
break;
}
}
if (found && !try_module_get(found->elevator_owner))
if (found && !elevator_tryget(found))
found = NULL;
spin_unlock(&elv_list_lock);
@ -713,115 +643,147 @@ void elevator_init_mq(struct request_queue *q)
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
"falling back to \"none\"\n", e->elevator_name);
elevator_put(e);
}
elevator_put(e);
}
/*
* switch to new_e io scheduler. be careful not to introduce deadlocks -
* we don't free the old io scheduler, before we have allocated what we
* need for the new one. this way we have a chance of going back to the old
* one, if the new one fails init for some reason.
* Switch to new_e io scheduler.
*
* If switching fails, we are most likely running out of memory and not able
* to restore the old io scheduler, so leaving the io scheduler being none.
*/
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
int err;
int ret;
lockdep_assert_held(&q->sysfs_lock);
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
err = elevator_switch_mq(q, new_e);
if (q->elevator) {
elv_unregister_queue(q);
elevator_exit(q);
}
ret = blk_mq_init_sched(q, new_e);
if (ret)
goto out_unfreeze;
ret = elv_register_queue(q, true);
if (ret) {
elevator_exit(q);
goto out_unfreeze;
}
blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
out_unfreeze:
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return err;
if (ret) {
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
new_e->elevator_name);
}
return ret;
}
void elevator_disable(struct request_queue *q)
{
lockdep_assert_held(&q->sysfs_lock);
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
elv_unregister_queue(q);
elevator_exit(q);
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = NULL;
q->nr_requests = q->tag_set->queue_depth;
blk_add_trace_msg(q, "elv switch: none");
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
}
/*
* Switch this queue to the given IO scheduler.
*/
static int __elevator_change(struct request_queue *q, const char *name)
static int elevator_change(struct request_queue *q, const char *elevator_name)
{
char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
int ret;
/* Make sure queue is not in the middle of being removed */
if (!blk_queue_registered(q))
return -ENOENT;
/*
* Special case for mq, turn off scheduling
*/
if (!strncmp(name, "none", 4)) {
if (!q->elevator)
return 0;
return elevator_switch(q, NULL);
}
strlcpy(elevator_name, name, sizeof(elevator_name));
e = elevator_get(q, strstrip(elevator_name), true);
if (!e)
return -EINVAL;
if (q->elevator &&
elevator_match(q->elevator->type, elevator_name, 0)) {
elevator_put(e);
if (!strncmp(elevator_name, "none", 4)) {
if (q->elevator)
elevator_disable(q);
return 0;
}
return elevator_switch(q, e);
if (q->elevator && elevator_match(q->elevator->type, elevator_name))
return 0;
e = elevator_find_get(q, elevator_name);
if (!e) {
request_module("%s-iosched", elevator_name);
e = elevator_find_get(q, elevator_name);
if (!e)
return -EINVAL;
}
ret = elevator_switch(q, e);
elevator_put(e);
return ret;
}
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
ssize_t elv_iosched_store(struct request_queue *q, const char *buf,
size_t count)
{
char elevator_name[ELV_NAME_MAX];
int ret;
if (!elv_support_iosched(q))
return count;
ret = __elevator_change(q, name);
strlcpy(elevator_name, buf, sizeof(elevator_name));
ret = elevator_change(q, strstrip(elevator_name));
if (!ret)
return count;
return ret;
}
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
struct elevator_queue *e = q->elevator;
struct elevator_type *elv = NULL;
struct elevator_type *__e;
struct elevator_queue *eq = q->elevator;
struct elevator_type *cur = NULL, *e;
int len = 0;
if (!queue_is_mq(q))
if (!elv_support_iosched(q))
return sprintf(name, "none\n");
if (!q->elevator)
if (!q->elevator) {
len += sprintf(name+len, "[none] ");
else
elv = e->type;
} else {
len += sprintf(name+len, "none ");
cur = eq->type;
}
spin_lock(&elv_list_lock);
list_for_each_entry(__e, &elv_list, list) {
if (elv && elevator_match(elv, __e->elevator_name, 0)) {
len += sprintf(name+len, "[%s] ", elv->elevator_name);
continue;
}
if (elv_support_iosched(q) &&
elevator_match(__e, __e->elevator_name,
q->required_elevator_features))
len += sprintf(name+len, "%s ", __e->elevator_name);
list_for_each_entry(e, &elv_list, list) {
if (e == cur)
len += sprintf(name+len, "[%s] ", e->elevator_name);
else if (elv_support_features(q, e))
len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
if (q->elevator)
len += sprintf(name+len, "none");
len += sprintf(len+name, "\n");
len += sprintf(name+len, "\n");
return len;
}

View File

@ -84,6 +84,21 @@ struct elevator_type
struct list_head list;
};
static inline bool elevator_tryget(struct elevator_type *e)
{
return try_module_get(e->elevator_owner);
}
static inline void __elevator_get(struct elevator_type *e)
{
__module_get(e->elevator_owner);
}
static inline void elevator_put(struct elevator_type *e)
{
module_put(e->elevator_owner);
}
#define ELV_HASH_BITS 6
void elv_rqhash_del(struct request_queue *q, struct request *rq);
@ -100,10 +115,13 @@ struct elevator_queue
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
unsigned int registered:1;
unsigned long flags;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};
#define ELEVATOR_FLAG_REGISTERED 0
#define ELEVATOR_FLAG_DISABLE_WBT 1
/*
* block elevator interface
*/

View File

@ -405,12 +405,6 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
return ret;
}
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return generic_writepages(mapping, wbc);
}
const struct address_space_operations def_blk_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
@ -419,7 +413,6 @@ const struct address_space_operations def_blk_aops = {
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.writepages = blkdev_writepages,
.direct_IO = blkdev_direct_IO,
.migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,

View File

@ -356,7 +356,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
}
EXPORT_SYMBOL_GPL(disk_uevent);
int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner)
{
struct block_device *bdev;
@ -366,6 +366,9 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
/* Someone else has bdev exclusively open? */
if (disk->part0->bd_holder && disk->part0->bd_holder != owner)
return -EBUSY;
set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
@ -479,10 +482,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
goto out_put_holder_dir;
}
ret = bd_register_pending_holders(disk);
if (ret < 0)
goto out_put_slave_dir;
ret = blk_register_queue(disk);
if (ret)
goto out_put_slave_dir;
@ -500,7 +499,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
bdev_add(disk->part0, ddev->devt);
if (get_capacity(disk))
disk_scan_partitions(disk, FMODE_READ);
disk_scan_partitions(disk, FMODE_READ, NULL);
/*
* Announce the disk and partitions after all partitions are
@ -530,6 +529,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
rq_qos_exit(disk->queue);
out_put_slave_dir:
kobject_put(disk->slave_dir);
disk->slave_dir = NULL;
out_put_holder_dir:
kobject_put(disk->part0->bd_holder_dir);
out_del_integrity:
@ -560,6 +560,11 @@ void blk_mark_disk_dead(struct gendisk *disk)
{
set_bit(GD_DEAD, &disk->state);
blk_queue_start_drain(disk->queue);
/*
* Stop buffered writers from dirtying pages that can't be written out.
*/
set_capacity_and_notify(disk, 0);
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
@ -629,6 +634,7 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0->bd_holder_dir);
kobject_put(disk->slave_dir);
disk->slave_dir = NULL;
part_stat_set_all(disk->part0, 0);
disk->part0->bd_stamp = 0;
@ -643,7 +649,9 @@ void del_gendisk(struct gendisk *disk)
blk_sync_queue(q);
blk_flush_integrity();
blk_mq_cancel_work_sync(q);
if (queue_is_mq(q))
blk_mq_cancel_work_sync(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
@ -1193,21 +1201,10 @@ struct class block_class = {
.dev_uevent = block_uevent,
};
static char *block_devnode(struct device *dev, umode_t *mode,
kuid_t *uid, kgid_t *gid)
{
struct gendisk *disk = dev_to_disk(dev);
if (disk->fops->devnode)
return disk->fops->devnode(disk, mode);
return NULL;
}
const struct device_type disk_type = {
.name = "disk",
.groups = disk_attr_groups,
.release = disk_release,
.devnode = block_devnode,
};
#ifdef CONFIG_PROC_FS
@ -1412,7 +1409,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q;
struct gendisk *disk;
q = blk_alloc_queue(node, false);
q = blk_alloc_queue(node);
if (!q)
return NULL;

View File

@ -4,7 +4,7 @@
struct bd_holder_disk {
struct list_head list;
struct block_device *bdev;
struct kobject *holder_dir;
int refcnt;
};
@ -14,7 +14,7 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
struct bd_holder_disk *holder;
list_for_each_entry(holder, &disk->slave_bdevs, list)
if (holder->bdev == bdev)
if (holder->holder_dir == bdev->bd_holder_dir)
return holder;
return NULL;
}
@ -29,19 +29,6 @@ static void del_symlink(struct kobject *from, struct kobject *to)
sysfs_remove_link(from, kobject_name(to));
}
static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
int ret;
ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
if (ret)
return ret;
ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
if (ret)
del_symlink(disk->slave_dir, bdev_kobj(bdev));
return ret;
}
/**
* bd_link_disk_holder - create symlinks between holding disk and slave bdev
* @bdev: the claimed slave bdev
@ -75,12 +62,30 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
struct bd_holder_disk *holder;
int ret = 0;
mutex_lock(&disk->open_mutex);
if (WARN_ON_ONCE(!disk->slave_dir))
return -EINVAL;
if (bdev->bd_disk == disk)
return -EINVAL;
/*
* del_gendisk drops the initial reference to bd_holder_dir, so we
* need to keep our own here to allow for cleanup past that point.
*/
mutex_lock(&bdev->bd_disk->open_mutex);
if (!disk_live(bdev->bd_disk)) {
mutex_unlock(&bdev->bd_disk->open_mutex);
return -ENODEV;
}
kobject_get(bdev->bd_holder_dir);
mutex_unlock(&bdev->bd_disk->open_mutex);
mutex_lock(&disk->open_mutex);
WARN_ON_ONCE(!bdev->bd_holder);
holder = bd_find_holder_disk(bdev, disk);
if (holder) {
kobject_put(bdev->bd_holder_dir);
holder->refcnt++;
goto out_unlock;
}
@ -92,36 +97,32 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
}
INIT_LIST_HEAD(&holder->list);
holder->bdev = bdev;
holder->refcnt = 1;
if (disk->slave_dir) {
ret = __link_disk_holder(bdev, disk);
if (ret) {
kfree(holder);
goto out_unlock;
}
}
holder->holder_dir = bdev->bd_holder_dir;
ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
if (ret)
goto out_free_holder;
ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
if (ret)
goto out_del_symlink;
list_add(&holder->list, &disk->slave_bdevs);
/*
* del_gendisk drops the initial reference to bd_holder_dir, so we need
* to keep our own here to allow for cleanup past that point.
*/
kobject_get(bdev->bd_holder_dir);
mutex_unlock(&disk->open_mutex);
return 0;
out_del_symlink:
del_symlink(disk->slave_dir, bdev_kobj(bdev));
out_free_holder:
kfree(holder);
out_unlock:
mutex_unlock(&disk->open_mutex);
if (ret)
kobject_put(bdev->bd_holder_dir);
return ret;
}
EXPORT_SYMBOL_GPL(bd_link_disk_holder);
static void __unlink_disk_holder(struct block_device *bdev,
struct gendisk *disk)
{
del_symlink(disk->slave_dir, bdev_kobj(bdev));
del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
}
/**
* bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
* @bdev: the calimed slave bdev
@ -136,36 +137,18 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
struct bd_holder_disk *holder;
if (WARN_ON_ONCE(!disk->slave_dir))
return;
mutex_lock(&disk->open_mutex);
holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
if (disk->slave_dir)
__unlink_disk_holder(bdev, disk);
kobject_put(bdev->bd_holder_dir);
del_symlink(disk->slave_dir, bdev_kobj(bdev));
del_symlink(holder->holder_dir, &disk_to_dev(disk)->kobj);
kobject_put(holder->holder_dir);
list_del_init(&holder->list);
kfree(holder);
}
mutex_unlock(&disk->open_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
int bd_register_pending_holders(struct gendisk *disk)
{
struct bd_holder_disk *holder;
int ret;
mutex_lock(&disk->open_mutex);
list_for_each_entry(holder, &disk->slave_bdevs, list) {
ret = __link_disk_holder(holder->bdev, disk);
if (ret)
goto out_undo;
}
mutex_unlock(&disk->open_mutex);
return 0;
out_undo:
list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list)
__unlink_disk_holder(holder->bdev, disk);
mutex_unlock(&disk->open_mutex);
return ret;
}

View File

@ -467,9 +467,10 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
* user space. Note the separate arg/argp parameters that are needed
* to deal with the compat_ptr() conversion.
*/
static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg, void __user *argp)
static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd,
unsigned long arg, void __user *argp)
{
struct block_device *bdev = I_BDEV(file->f_mapping->host);
unsigned int max_sectors;
switch (cmd) {
@ -527,7 +528,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
return -EACCES;
if (bdev_is_partition(bdev))
return -EINVAL;
return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL,
file);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
@ -605,7 +607,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
}
ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
ret = blkdev_common_ioctl(file, mode, cmd, arg, argp);
if (ret != -ENOIOCTLCMD)
return ret;
@ -674,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
}
ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
ret = blkdev_common_ioctl(file, mode, cmd, arg, argp);
if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl)
ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);

View File

@ -130,6 +130,20 @@ static u8 dd_rq_ioclass(struct request *rq)
return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
}
/*
* get the request before `rq' in sector-sorted order
*/
static inline struct request *
deadline_earlier_request(struct request *rq)
{
struct rb_node *node = rb_prev(&rq->rb_node);
if (node)
return rb_entry_rq(node);
return NULL;
}
/*
* get the request after `rq' in sector-sorted order
*/
@ -277,6 +291,39 @@ static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
return 0;
}
/*
* Check if rq has a sequential request preceding it.
*/
static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
{
struct request *prev = deadline_earlier_request(rq);
if (!prev)
return false;
return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
}
/*
* Skip all write requests that are sequential from @rq, even if we cross
* a zone boundary.
*/
static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
struct request *rq)
{
sector_t pos = blk_rq_pos(rq);
sector_t skipped_sectors = 0;
while (rq) {
if (blk_rq_pos(rq) != pos + skipped_sectors)
break;
skipped_sectors += blk_rq_sectors(rq);
rq = deadline_latter_request(rq);
}
return rq;
}
/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
@ -297,11 +344,16 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
* an unlocked target zone. For some HDDs, breaking a sequential
* write stream can lead to lower throughput, so make sure to preserve
* sequential write streams, even if that stream crosses into the next
* zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
if (blk_req_can_dispatch_to_zone(rq))
if (blk_req_can_dispatch_to_zone(rq) &&
(blk_queue_nonrot(rq->q) ||
!deadline_is_seq_write(dd, rq)))
goto out;
}
rq = NULL;
@ -331,13 +383,19 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
* an unlocked target zone. For some HDDs, breaking a sequential
* write stream can lead to lower throughput, so make sure to preserve
* sequential write streams, even if that stream crosses into the next
* zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
break;
rq = deadline_latter_request(rq);
if (blk_queue_nonrot(rq->q))
rq = deadline_latter_request(rq);
else
rq = deadline_skip_seq_writes(dd, rq);
}
spin_unlock_irqrestore(&dd->zone_lock, flags);
@ -789,6 +847,18 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
enum dd_prio p;
for (p = 0; p <= DD_PRIO_MAX; p++)
if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
return true;
return false;
}
/*
* Callback from inside blk_mq_free_request().
*
@ -828,9 +898,10 @@ static void dd_finish_request(struct request *rq)
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
spin_unlock_irqrestore(&dd->zone_lock, flags);
if (dd_has_write_work(rq->mq_hctx))
blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
}
}

View File

@ -2461,6 +2461,44 @@ static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key)
return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step));
}
static void opal_lock_check_for_saved_key(struct opal_dev *dev,
struct opal_lock_unlock *lk_unlk)
{
struct opal_suspend_data *iter;
if (lk_unlk->l_state != OPAL_LK ||
lk_unlk->session.opal_key.key_len > 0)
return;
/*
* Usually when closing a crypto device (eg: dm-crypt with LUKS) the
* volume key is not required, as it requires root privileges anyway,
* and root can deny access to a disk in many ways regardless.
* Requiring the volume key to lock the device is a peculiarity of the
* OPAL specification. Given we might already have saved the key if
* the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use
* that key to lock the device if no key was provided here, the
* locking range matches and the appropriate flag was passed with
* 'IOC_OPAL_SAVE'.
* This allows integrating OPAL with tools and libraries that are used
* to the common behaviour and do not ask for the volume key when
* closing a device.
*/
setup_opal_dev(dev);
list_for_each_entry(iter, &dev->unlk_lst, node) {
if ((iter->unlk.flags & OPAL_SAVE_FOR_LOCK) &&
iter->lr == lk_unlk->session.opal_key.lr &&
iter->unlk.session.opal_key.key_len > 0) {
lk_unlk->session.opal_key.key_len =
iter->unlk.session.opal_key.key_len;
memcpy(lk_unlk->session.opal_key.key,
iter->unlk.session.opal_key.key,
iter->unlk.session.opal_key.key_len);
break;
}
}
}
static int opal_lock_unlock(struct opal_dev *dev,
struct opal_lock_unlock *lk_unlk)
{
@ -2470,6 +2508,7 @@ static int opal_lock_unlock(struct opal_dev *dev,
return -EINVAL;
mutex_lock(&dev->dev_lock);
opal_lock_check_for_saved_key(dev, lk_unlk);
ret = __opal_lock_unlock(dev, lk_unlk);
mutex_unlock(&dev->dev_lock);

View File

@ -285,49 +285,6 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know
what you are doing.
config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media (DEPRECATED)"
depends on !UML
depends on SCSI
select CDROM
help
Note: This driver is deprecated and will be removed from the
kernel in the near future!
If you have a CDROM/DVD drive that supports packet writing, say
Y to include support. It should work with any MMC/Mt Fuji
compliant ATAPI or SCSI drive, which is just about any newer
DVD/CD writer.
Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs
is possible.
DVD-RW disks must be in restricted overwrite mode.
See the file <file:Documentation/cdrom/packet-writing.rst>
for further information on the use of this driver.
To compile this driver as a module, choose M here: the
module will be called pktcdvd.
config CDROM_PKTCDVD_BUFFERS
int "Free buffers for data gathering"
depends on CDROM_PKTCDVD
default "8"
help
This controls the maximum number of active concurrent packets. More
concurrent packets can increase write performance, but also require
more memory. Each concurrent packet will require approximately 64Kb
of non-swappable kernel memory, memory which will be allocated when
a disc is opened for writing.
config CDROM_PKTCDVD_WCACHE
bool "Enable write caching"
depends on CDROM_PKTCDVD
help
If enabled, write caching will be set for the CD-R/W device. For now
this option is dangerous unless the CD-RW media is known good, as we
don't do deferred write error handling yet.
config ATA_OVER_ETH
tristate "ATA over Ethernet support"
depends on NET

View File

@ -20,7 +20,6 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
obj-$(CONFIG_N64CART) += n64cart.o
obj-$(CONFIG_BLK_DEV_RAM) += brd.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
obj-$(CONFIG_SUNVDC) += sunvdc.o
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o

View File

@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
# SPDX-License-Identifier: GPL-2.0-only
#
# DRBD device driver configuration
#

View File

@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
# SPDX-License-Identifier: GPL-2.0-only
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_actlog.c
@ -868,9 +868,9 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
nr_sectors = get_capacity(device->vdisk);
esector = sector + (size >> 9) - 1;
if (!expect(sector < nr_sectors))
if (!expect(device, sector < nr_sectors))
goto out;
if (!expect(esector < nr_sectors))
if (!expect(device, esector < nr_sectors))
esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
@ -1143,7 +1143,7 @@ void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (!bm_ext) {
spin_unlock_irqrestore(&device->al_lock, flags);
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
return;
}

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_bitmap.c
@ -113,7 +113,7 @@ struct drbd_bitmap {
static void __bm_print_lock_info(struct drbd_device *device, const char *func)
{
struct drbd_bitmap *b = device->bitmap;
if (!__ratelimit(&drbd_ratelimit_state))
if (!drbd_ratelimit())
return;
drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
current->comm, task_pid_nr(current),
@ -448,7 +448,7 @@ int drbd_bm_init(struct drbd_device *device)
sector_t drbd_bm_capacity(struct drbd_device *device)
{
if (!expect(device->bitmap))
if (!expect(device, device->bitmap))
return 0;
return device->bitmap->bm_dev_capacity;
}
@ -457,7 +457,7 @@ sector_t drbd_bm_capacity(struct drbd_device *device)
*/
void drbd_bm_cleanup(struct drbd_device *device)
{
if (!expect(device->bitmap))
if (!expect(device, device->bitmap))
return;
bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
bm_vk_free(device->bitmap->bm_pages);
@ -636,7 +636,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
int err = 0;
bool growing;
if (!expect(b))
if (!expect(device, b))
return -ENOMEM;
drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
@ -757,9 +757,9 @@ unsigned long _drbd_bm_total_weight(struct drbd_device *device)
unsigned long s;
unsigned long flags;
if (!expect(b))
if (!expect(device, b))
return 0;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
@ -783,9 +783,9 @@ unsigned long drbd_bm_total_weight(struct drbd_device *device)
size_t drbd_bm_words(struct drbd_device *device)
{
struct drbd_bitmap *b = device->bitmap;
if (!expect(b))
if (!expect(device, b))
return 0;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return 0;
return b->bm_words;
@ -794,7 +794,7 @@ size_t drbd_bm_words(struct drbd_device *device)
unsigned long drbd_bm_bits(struct drbd_device *device)
{
struct drbd_bitmap *b = device->bitmap;
if (!expect(b))
if (!expect(device, b))
return 0;
return b->bm_bits;
@ -816,9 +816,9 @@ void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
end = offset + number;
if (!expect(b))
if (!expect(device, b))
return;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return;
if (number == 0)
return;
@ -863,9 +863,9 @@ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
end = offset + number;
if (!expect(b))
if (!expect(device, b))
return;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
@ -894,9 +894,9 @@ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
void drbd_bm_set_all(struct drbd_device *device)
{
struct drbd_bitmap *b = device->bitmap;
if (!expect(b))
if (!expect(device, b))
return;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
@ -910,9 +910,9 @@ void drbd_bm_set_all(struct drbd_device *device)
void drbd_bm_clear_all(struct drbd_device *device)
{
struct drbd_bitmap *b = device->bitmap;
if (!expect(b))
if (!expect(device, b))
return;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
@ -952,7 +952,7 @@ static void drbd_bm_endio(struct bio *bio)
bm_set_page_io_err(b->bm_pages[idx]);
/* Not identical to on disk version of it.
* Is BM_PAGE_IO_ERROR enough? */
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
bio->bi_status, idx);
} else {
@ -1013,7 +1013,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
else
len = PAGE_SIZE;
} else {
if (__ratelimit(&drbd_ratelimit_state)) {
if (drbd_ratelimit()) {
drbd_err(device, "Invalid offset during on-disk bitmap access: "
"page idx %u, sector %llu\n", page_nr, on_disk_sector);
}
@ -1332,9 +1332,9 @@ static unsigned long bm_find_next(struct drbd_device *device,
struct drbd_bitmap *b = device->bitmap;
unsigned long i = DRBD_END_OF_BITMAP;
if (!expect(b))
if (!expect(device, b))
return i;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return i;
spin_lock_irq(&b->bm_lock);
@ -1436,9 +1436,9 @@ static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
struct drbd_bitmap *b = device->bitmap;
int c = 0;
if (!expect(b))
if (!expect(device, b))
return 1;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
@ -1582,9 +1582,9 @@ int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
unsigned long *p_addr;
int i;
if (!expect(b))
if (!expect(device, b))
return 0;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
@ -1619,9 +1619,9 @@ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
if (!expect(b))
if (!expect(device, b))
return 1;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return 1;
spin_lock_irqsave(&b->bm_lock, flags);
@ -1635,7 +1635,7 @@ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const
bm_unmap(p_addr);
p_addr = bm_map_pidx(b, idx);
}
if (expect(bitnr < b->bm_bits))
if (expect(device, bitnr < b->bm_bits))
c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
else
drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
@ -1668,9 +1668,9 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
unsigned long flags;
unsigned long *p_addr, *bm;
if (!expect(b))
if (!expect(device, b))
return 0;
if (!expect(b->bm_pages))
if (!expect(device, b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "drbd debugfs: " fmt
#include <linux/kernel.h>
#include <linux/module.h>

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: GPL-2.0-only */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/debugfs.h>

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
drbd_int.h
@ -37,6 +37,7 @@
#include "drbd_strings.h"
#include "drbd_state.h"
#include "drbd_protocol.h"
#include "drbd_polymorph_printk.h"
#ifdef __CHECKER__
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
@ -75,71 +76,6 @@ extern int drbd_proc_details;
struct drbd_device;
struct drbd_connection;
#define __drbd_printk_device(level, device, fmt, args...) \
dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args)
#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \
dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args)
#define __drbd_printk_resource(level, resource, fmt, args...) \
printk(level "drbd %s: " fmt, (resource)->name, ## args)
#define __drbd_printk_connection(level, connection, fmt, args...) \
printk(level "drbd %s: " fmt, (connection)->resource->name, ## args)
void drbd_printk_with_wrong_object_type(void);
#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \
(__builtin_types_compatible_p(typeof(obj), type) || \
__builtin_types_compatible_p(typeof(obj), const type)), \
func(level, (const type)(obj), fmt, ## args)
#define drbd_printk(level, obj, fmt, args...) \
__builtin_choose_expr( \
__drbd_printk_if_same_type(obj, struct drbd_device *, \
__drbd_printk_device, level, fmt, ## args), \
__builtin_choose_expr( \
__drbd_printk_if_same_type(obj, struct drbd_resource *, \
__drbd_printk_resource, level, fmt, ## args), \
__builtin_choose_expr( \
__drbd_printk_if_same_type(obj, struct drbd_connection *, \
__drbd_printk_connection, level, fmt, ## args), \
__builtin_choose_expr( \
__drbd_printk_if_same_type(obj, struct drbd_peer_device *, \
__drbd_printk_peer_device, level, fmt, ## args), \
drbd_printk_with_wrong_object_type()))))
#define drbd_dbg(obj, fmt, args...) \
drbd_printk(KERN_DEBUG, obj, fmt, ## args)
#define drbd_alert(obj, fmt, args...) \
drbd_printk(KERN_ALERT, obj, fmt, ## args)
#define drbd_err(obj, fmt, args...) \
drbd_printk(KERN_ERR, obj, fmt, ## args)
#define drbd_warn(obj, fmt, args...) \
drbd_printk(KERN_WARNING, obj, fmt, ## args)
#define drbd_info(obj, fmt, args...) \
drbd_printk(KERN_INFO, obj, fmt, ## args)
#define drbd_emerg(obj, fmt, args...) \
drbd_printk(KERN_EMERG, obj, fmt, ## args)
#define dynamic_drbd_dbg(device, fmt, args...) \
dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args)
#define D_ASSERT(device, exp) do { \
if (!(exp)) \
drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
} while (0)
/**
* expect - Make an assertion
*
* Unlike the assert macro, this macro returns a boolean result.
*/
#define expect(exp) ({ \
bool _bool = (exp); \
if (!_bool) \
drbd_err(device, "ASSERTION %s FAILED in %s\n", \
#exp, __func__); \
_bool; \
})
/* Defines to control fault insertion */
enum {
DRBD_FAULT_MD_WR = 0, /* meta data write */
@ -395,6 +331,7 @@ struct drbd_peer_request {
struct drbd_peer_device *peer_device;
struct drbd_epoch *epoch; /* for writes */
struct page *pages;
blk_opf_t opf;
atomic_t pending_bios;
struct drbd_interval i;
/* see comments on ee flag bits below */
@ -406,6 +343,10 @@ struct drbd_peer_request {
};
};
/* Equivalent to bio_op and req_op. */
#define peer_req_op(peer_req) \
((peer_req)->opf & REQ_OP_MASK)
/* ee flag bits.
* While corresponding bios are in flight, the only modification will be
* set_bit WAS_ERROR, which has to be atomic.
@ -1545,8 +1486,7 @@ extern void drbd_send_acks_wf(struct work_struct *ws);
extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
bool throttle_if_app_is_waiting);
extern int drbd_submit_peer_request(struct drbd_device *,
struct drbd_peer_request *, blk_opf_t, int);
extern int drbd_submit_peer_request(struct drbd_peer_request *peer_req);
extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
sector_t, unsigned int,
@ -1718,7 +1658,7 @@ static inline void __drbd_chk_io_error_(struct drbd_device *device,
switch (ep) {
case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "Local IO failed in %s.\n", where);
if (device->state.disk > D_INCONSISTENT)
_drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
// SPDX-License-Identifier: GPL-2.0-only
#include <asm/bug.h>
#include <linux/rbtree_augmented.h>
#include "drbd_interval.h"

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_INTERVAL_H
#define __DRBD_INTERVAL_H

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd.c
@ -1259,7 +1259,7 @@ static int _drbd_send_bitmap(struct drbd_device *device)
struct bm_xfer_ctx c;
int err;
if (!expect(device->bitmap))
if (!expect(device, device->bitmap))
return false;
if (get_ldev(device)) {
@ -2217,7 +2217,8 @@ void drbd_destroy_device(struct kref *kref)
kref_put(&peer_device->connection->kref, drbd_destroy_connection);
kfree(peer_device);
}
memset(device, 0xfd, sizeof(*device));
if (device->submit.wq)
destroy_workqueue(device->submit.wq);
kfree(device);
kref_put(&resource->kref, drbd_destroy_resource);
}
@ -2249,9 +2250,9 @@ static void do_retry(struct work_struct *ws)
bool expected;
expected =
expect(atomic_read(&req->completion_ref) == 0) &&
expect(req->rq_state & RQ_POSTPONED) &&
expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
expect(device, atomic_read(&req->completion_ref) == 0) &&
expect(device, req->rq_state & RQ_POSTPONED) &&
expect(device, (req->rq_state & RQ_LOCAL_PENDING) == 0 ||
(req->rq_state & RQ_LOCAL_ABORTED) != 0);
if (!expected)
@ -2309,7 +2310,6 @@ void drbd_destroy_resource(struct kref *kref)
idr_destroy(&resource->devices);
free_cpumask_var(resource->cpu_mask);
kfree(resource->name);
memset(resource, 0xf2, sizeof(*resource));
kfree(resource);
}
@ -2650,7 +2650,6 @@ void drbd_destroy_connection(struct kref *kref)
drbd_free_socket(&connection->data);
kfree(connection->int_dig_in);
kfree(connection->int_dig_vv);
memset(connection, 0xfc, sizeof(*connection));
kfree(connection);
kref_put(&resource->kref, drbd_destroy_resource);
}
@ -2774,7 +2773,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
err = add_disk(disk);
if (err)
goto out_idr_remove_from_resource;
goto out_destroy_workqueue;
/* inherit the connection state */
device->state.conn = first_connection(resource)->cstate;
@ -2788,6 +2787,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_debugfs_device_add(device);
return NO_ERROR;
out_destroy_workqueue:
destroy_workqueue(device->submit.wq);
out_idr_remove_from_resource:
for_each_connection_safe(connection, n, resource) {
peer_device = idr_remove(&connection->peer_devices, vnr);
@ -3766,7 +3767,7 @@ _drbd_insert_fault(struct drbd_device *device, unsigned int type)
if (ret) {
drbd_fault_count++;
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_warn(device, "***Simulating %s failure\n",
_drbd_fault_str(type));
}

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_nl.c
@ -1210,6 +1210,7 @@ static void decide_on_discard_support(struct drbd_device *device,
struct drbd_connection *connection =
first_peer_device(device)->connection;
struct request_queue *q = device->rq_queue;
unsigned int max_discard_sectors;
if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev))
goto not_supported;
@ -1230,15 +1231,14 @@ static void decide_on_discard_support(struct drbd_device *device,
* topology on all peers.
*/
blk_queue_discard_granularity(q, 512);
q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
q->limits.max_write_zeroes_sectors =
drbd_max_discard_sectors(connection);
max_discard_sectors = drbd_max_discard_sectors(connection);
blk_queue_max_discard_sectors(q, max_discard_sectors);
blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
return;
not_supported:
blk_queue_discard_granularity(q, 0);
q->limits.max_discard_sectors = 0;
q->limits.max_write_zeroes_sectors = 0;
blk_queue_max_discard_sectors(q, 0);
}
static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q)
@ -1256,6 +1256,18 @@ static void fixup_write_zeroes(struct drbd_device *device, struct request_queue
q->limits.max_write_zeroes_sectors = 0;
}
static void fixup_discard_support(struct drbd_device *device, struct request_queue *q)
{
unsigned int max_discard = device->rq_queue->limits.max_discard_sectors;
unsigned int discard_granularity =
device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT;
if (discard_granularity > max_discard) {
blk_queue_discard_granularity(q, 0);
blk_queue_max_discard_sectors(q, 0);
}
}
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
unsigned int max_bio_size, struct o_qlim *o)
{
@ -1288,6 +1300,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
disk_update_readahead(device->vdisk);
}
fixup_write_zeroes(device, q);
fixup_discard_support(device, q);
}
void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
@ -1530,7 +1543,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
goto fail_unlock;
}
if (!expect(new_disk_conf->resync_rate >= 1))
if (!expect(device, new_disk_conf->resync_rate >= 1))
new_disk_conf->resync_rate = 1;
sanitize_disk_conf(device, new_disk_conf, device->ldev);

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <net/netlink.h>
#include <linux/drbd_genl_api.h>

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H

View File

@ -0,0 +1,141 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef DRBD_POLYMORPH_PRINTK_H
#define DRBD_POLYMORPH_PRINTK_H
#if !defined(CONFIG_DYNAMIC_DEBUG)
#undef DEFINE_DYNAMIC_DEBUG_METADATA
#undef __dynamic_pr_debug
#undef DYNAMIC_DEBUG_BRANCH
#define DEFINE_DYNAMIC_DEBUG_METADATA(D, F) const char *D = F; ((void)D)
#define __dynamic_pr_debug(D, F, args...) do { (void)(D); if (0) printk(F, ## args); } while (0)
#define DYNAMIC_DEBUG_BRANCH(D) false
#endif
#define __drbd_printk_drbd_device_prep(device) \
const struct drbd_device *__d = (device); \
const struct drbd_resource *__r = __d->resource
#define __drbd_printk_drbd_device_fmt(fmt) "drbd %s/%u drbd%u: " fmt
#define __drbd_printk_drbd_device_args() __r->name, __d->vnr, __d->minor
#define __drbd_printk_drbd_device_unprep()
#define __drbd_printk_drbd_peer_device_prep(peer_device) \
const struct drbd_device *__d; \
const struct drbd_resource *__r; \
__d = (peer_device)->device; \
__r = __d->resource
#define __drbd_printk_drbd_peer_device_fmt(fmt) \
"drbd %s/%u drbd%u: " fmt
#define __drbd_printk_drbd_peer_device_args() \
__r->name, __d->vnr, __d->minor
#define __drbd_printk_drbd_peer_device_unprep()
#define __drbd_printk_drbd_resource_prep(resource) \
const struct drbd_resource *__r = resource
#define __drbd_printk_drbd_resource_fmt(fmt) "drbd %s: " fmt
#define __drbd_printk_drbd_resource_args() __r->name
#define __drbd_printk_drbd_resource_unprep(resource)
#define __drbd_printk_drbd_connection_prep(connection) \
const struct drbd_connection *__c = (connection); \
const struct drbd_resource *__r = __c->resource
#define __drbd_printk_drbd_connection_fmt(fmt) \
"drbd %s: " fmt
#define __drbd_printk_drbd_connection_args() \
__r->name
#define __drbd_printk_drbd_connection_unprep()
void drbd_printk_with_wrong_object_type(void);
void drbd_dyn_dbg_with_wrong_object_type(void);
#define __drbd_printk_choose_cond(obj, struct_name) \
(__builtin_types_compatible_p(typeof(obj), struct struct_name *) || \
__builtin_types_compatible_p(typeof(obj), const struct struct_name *))
#define __drbd_printk_if_same_type(obj, struct_name, level, fmt, args...) \
__drbd_printk_choose_cond(obj, struct_name), \
({ \
__drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \
printk(level __drbd_printk_ ## struct_name ## _fmt(fmt), \
__drbd_printk_ ## struct_name ## _args(), ## args); \
__drbd_printk_ ## struct_name ## _unprep(); \
})
#define drbd_printk(level, obj, fmt, args...) \
__builtin_choose_expr( \
__drbd_printk_if_same_type(obj, drbd_device, level, fmt, ## args), \
__builtin_choose_expr( \
__drbd_printk_if_same_type(obj, drbd_resource, level, fmt, ## args), \
__builtin_choose_expr( \
__drbd_printk_if_same_type(obj, drbd_connection, level, fmt, ## args), \
__builtin_choose_expr( \
__drbd_printk_if_same_type(obj, drbd_peer_device, level, fmt, ## args), \
drbd_printk_with_wrong_object_type()))))
#define __drbd_dyn_dbg_if_same_type(obj, struct_name, fmt, args...) \
__drbd_printk_choose_cond(obj, struct_name), \
({ \
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
if (DYNAMIC_DEBUG_BRANCH(descriptor)) { \
__drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \
__dynamic_pr_debug(&descriptor, __drbd_printk_ ## struct_name ## _fmt(fmt), \
__drbd_printk_ ## struct_name ## _args(), ## args); \
__drbd_printk_ ## struct_name ## _unprep(); \
} \
})
#define dynamic_drbd_dbg(obj, fmt, args...) \
__builtin_choose_expr( \
__drbd_dyn_dbg_if_same_type(obj, drbd_device, fmt, ## args), \
__builtin_choose_expr( \
__drbd_dyn_dbg_if_same_type(obj, drbd_resource, fmt, ## args), \
__builtin_choose_expr( \
__drbd_dyn_dbg_if_same_type(obj, drbd_connection, fmt, ## args), \
__builtin_choose_expr( \
__drbd_dyn_dbg_if_same_type(obj, drbd_peer_device, fmt, ## args), \
drbd_dyn_dbg_with_wrong_object_type()))))
#define drbd_emerg(device, fmt, args...) \
drbd_printk(KERN_EMERG, device, fmt, ## args)
#define drbd_alert(device, fmt, args...) \
drbd_printk(KERN_ALERT, device, fmt, ## args)
#define drbd_crit(device, fmt, args...) \
drbd_printk(KERN_CRIT, device, fmt, ## args)
#define drbd_err(device, fmt, args...) \
drbd_printk(KERN_ERR, device, fmt, ## args)
#define drbd_warn(device, fmt, args...) \
drbd_printk(KERN_WARNING, device, fmt, ## args)
#define drbd_notice(device, fmt, args...) \
drbd_printk(KERN_NOTICE, device, fmt, ## args)
#define drbd_info(device, fmt, args...) \
drbd_printk(KERN_INFO, device, fmt, ## args)
#define drbd_ratelimit() \
({ \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
__ratelimit(&_rs); \
})
#define D_ASSERT(x, exp) \
do { \
if (!(exp)) \
drbd_err(x, "ASSERTION %s FAILED in %s\n", \
#exp, __func__); \
} while (0)
/**
* expect - Make an assertion
*
* Unlike the assert macro, this macro returns a boolean result.
*/
#define expect(x, exp) ({ \
bool _bool = (exp); \
if (!_bool && drbd_ratelimit()) \
drbd_err(x, "ASSERTION %s FAILED in %s\n", \
#exp, __func__); \
_bool; \
})
#endif

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_proc.c

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_PROTOCOL_H
#define __DRBD_PROTOCOL_H

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_receiver.c
@ -413,7 +413,7 @@ void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *
drbd_free_pages(device, peer_req->pages, is_net);
D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
D_ASSERT(device, drbd_interval_empty(&peer_req->i));
if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
drbd_al_complete_io(device, &peer_req->i);
}
@ -1603,9 +1603,19 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru
drbd_endio_write_sec_final(peer_req);
}
static int peer_request_fault_type(struct drbd_peer_request *peer_req)
{
if (peer_req_op(peer_req) == REQ_OP_READ) {
return peer_req->flags & EE_APPLICATION ?
DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD;
} else {
return peer_req->flags & EE_APPLICATION ?
DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR;
}
}
/**
* drbd_submit_peer_request()
* @device: DRBD device.
* @peer_req: peer request
*
* May spread the pages to multiple bios,
@ -1619,10 +1629,9 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru
* on certain Xen deployments.
*/
/* TODO allocate from our own bio_set. */
int drbd_submit_peer_request(struct drbd_device *device,
struct drbd_peer_request *peer_req,
const blk_opf_t opf, const int fault_type)
int drbd_submit_peer_request(struct drbd_peer_request *peer_req)
{
struct drbd_device *device = peer_req->peer_device->device;
struct bio *bios = NULL;
struct bio *bio;
struct page *page = peer_req->pages;
@ -1667,7 +1676,18 @@ int drbd_submit_peer_request(struct drbd_device *device,
* generated bio, but a bio allocated on behalf of the peer.
*/
next_bio:
bio = bio_alloc(device->ldev->backing_bdev, nr_pages, opf, GFP_NOIO);
/* _DISCARD, _WRITE_ZEROES handled above.
* REQ_OP_FLUSH (empty flush) not expected,
* should have been mapped to a "drbd protocol barrier".
* REQ_OP_SECURE_ERASE: I don't see how we could ever support that.
*/
if (!(peer_req_op(peer_req) == REQ_OP_WRITE ||
peer_req_op(peer_req) == REQ_OP_READ)) {
drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf);
return -EINVAL;
}
bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO);
/* > peer_req->i.sector, unless this is the first bio */
bio->bi_iter.bi_sector = sector;
bio->bi_private = peer_req;
@ -1697,7 +1717,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
bios = bios->bi_next;
bio->bi_next = NULL;
drbd_submit_bio_noacct(device, fault_type, bio);
drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio);
} while (bios);
return 0;
}
@ -1853,21 +1873,21 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
/* assume request_size == data_size, but special case trim. */
ds = data_size;
if (trim) {
if (!expect(data_size == 0))
if (!expect(peer_device, data_size == 0))
return NULL;
ds = be32_to_cpu(trim->size);
} else if (zeroes) {
if (!expect(data_size == 0))
if (!expect(peer_device, data_size == 0))
return NULL;
ds = be32_to_cpu(zeroes->size);
}
if (!expect(IS_ALIGNED(ds, 512)))
if (!expect(peer_device, IS_ALIGNED(ds, 512)))
return NULL;
if (trim || zeroes) {
if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
return NULL;
} else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
} else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE))
return NULL;
/* even though we trust out peer,
@ -2051,6 +2071,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
* respective _drbd_clear_done_ee */
peer_req->w.cb = e_end_resync_block;
peer_req->opf = REQ_OP_WRITE;
peer_req->submit_jif = jiffies;
spin_lock_irq(&device->resource->req_lock);
@ -2058,8 +2079,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev);
if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE,
DRBD_FAULT_RS_WR) == 0)
if (drbd_submit_peer_request(peer_req) == 0)
return 0;
/* don't care for the reason here */
@ -2145,7 +2165,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet
* or in drbd_peer_request_endio. */
err = recv_resync_read(peer_device, sector, pi);
} else {
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "Can not write resync data to local disk.\n");
err = drbd_drain_block(peer_device, pi->size);
@ -2375,16 +2395,6 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
return ret;
}
/* see also bio_flags_to_wire()
* DRBD_REQ_*, because we need to semantically map the flags to data packet
* flags and back. We may replicate to other kernel versions. */
static blk_opf_t wire_flags_to_bio_flags(u32 dpf)
{
return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
(dpf & DP_FUA ? REQ_FUA : 0) |
(dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
}
static enum req_op wire_flags_to_bio_op(u32 dpf)
{
if (dpf & DP_ZEROES)
@ -2395,6 +2405,15 @@ static enum req_op wire_flags_to_bio_op(u32 dpf)
return REQ_OP_WRITE;
}
/* see also bio_flags_to_wire() */
static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf)
{
return wire_flags_to_bio_op(dpf) |
(dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
(dpf & DP_FUA ? REQ_FUA : 0) |
(dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
}
static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
unsigned int size)
{
@ -2538,8 +2557,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
struct drbd_peer_request *peer_req;
struct p_data *p = pi->data;
u32 peer_seq = be32_to_cpu(p->seq_num);
enum req_op op;
blk_opf_t op_flags;
u32 dp_flags;
int err, tp;
@ -2578,11 +2595,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
peer_req->flags |= EE_APPLICATION;
dp_flags = be32_to_cpu(p->dp_flags);
op = wire_flags_to_bio_op(dp_flags);
op_flags = wire_flags_to_bio_flags(dp_flags);
peer_req->opf = wire_flags_to_bio(connection, dp_flags);
if (pi->cmd == P_TRIM) {
D_ASSERT(peer_device, peer_req->i.size > 0);
D_ASSERT(peer_device, op == REQ_OP_DISCARD);
D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD);
D_ASSERT(peer_device, peer_req->pages == NULL);
/* need to play safe: an older DRBD sender
* may mean zero-out while sending P_TRIM. */
@ -2590,7 +2606,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
peer_req->flags |= EE_ZEROOUT;
} else if (pi->cmd == P_ZEROES) {
D_ASSERT(peer_device, peer_req->i.size > 0);
D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES);
D_ASSERT(peer_device, peer_req->pages == NULL);
/* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
if (dp_flags & DP_DISCARD)
@ -2677,8 +2693,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
}
err = drbd_submit_peer_request(device, peer_req, op | op_flags,
DRBD_FAULT_DT_WR);
err = drbd_submit_peer_request(peer_req);
if (!err)
return 0;
@ -2789,7 +2804,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
struct drbd_peer_request *peer_req;
struct digest_info *di = NULL;
int size, verb;
unsigned int fault_type;
struct p_block_req *p = pi->data;
peer_device = conn_peer_device(connection, pi->vnr);
@ -2832,7 +2846,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
default:
BUG();
}
if (verb && __ratelimit(&drbd_ratelimit_state))
if (verb && drbd_ratelimit())
drbd_err(device, "Can not satisfy peer's read request, "
"no local data.\n");
@ -2849,11 +2863,11 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
put_ldev(device);
return -ENOMEM;
}
peer_req->opf = REQ_OP_READ;
switch (pi->cmd) {
case P_DATA_REQUEST:
peer_req->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
/* application IO, don't drbd_rs_begin_io */
peer_req->flags |= EE_APPLICATION;
goto submit;
@ -2867,14 +2881,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
fallthrough;
case P_RS_DATA_REQUEST:
peer_req->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
/* used in the sector offset progress display */
device->bm_resync_fo = BM_SECT_TO_BIT(sector);
break;
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
fault_type = DRBD_FAULT_RS_RD;
di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
if (!di)
goto out_free_e;
@ -2923,7 +2935,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
(unsigned long long)sector);
}
peer_req->w.cb = w_e_end_ov_req;
fault_type = DRBD_FAULT_RS_RD;
break;
default:
@ -2975,8 +2986,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
submit:
update_receiver_timing_details(connection, drbd_submit_peer_request);
inc_unacked(device);
if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ,
fault_type) == 0)
if (drbd_submit_peer_request(peer_req) == 0)
return 0;
/* don't care for the reason here */
@ -4947,7 +4957,6 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
if (get_ldev(device)) {
struct drbd_peer_request *peer_req;
const enum req_op op = REQ_OP_WRITE_ZEROES;
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
size, 0, GFP_NOIO);
@ -4957,6 +4966,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
}
peer_req->w.cb = e_end_resync_block;
peer_req->opf = REQ_OP_DISCARD;
peer_req->submit_jif = jiffies;
peer_req->flags |= EE_TRIM;
@ -4965,8 +4975,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev);
err = drbd_submit_peer_request(device, peer_req, op,
DRBD_FAULT_RS_WR);
err = drbd_submit_peer_request(peer_req);
if (err) {
spin_lock_irq(&device->resource->req_lock);

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_req.c
@ -144,7 +144,7 @@ void drbd_req_destroy(struct kref *kref)
if (get_ldev_if_state(device, D_FAILED)) {
drbd_al_complete_io(device, &req->i);
put_ldev(device);
} else if (__ratelimit(&drbd_ratelimit_state)) {
} else if (drbd_ratelimit()) {
drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
"but my Disk seems to have failed :(\n",
(unsigned long long) req->i.sector, req->i.size);
@ -518,7 +518,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
{
if (!__ratelimit(&drbd_ratelimit_state))
if (!drbd_ratelimit())
return;
drbd_warn(device, "local %s IO error sector %llu+%u on %pg\n",
@ -1402,7 +1402,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
submit_private_bio = true;
} else if (no_remote) {
nodata:
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
(unsigned long long)req->i.sector, req->i.size >> 9);
/* A write may have been queued for send_oos, however.

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
drbd_req.h

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_state.c

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef DRBD_STATE_H
#define DRBD_STATE_H

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef DRBD_STATE_CHANGE_H
#define DRBD_STATE_CHANGE_H

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd.h

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_STRINGS_H
#define __DRBD_STRINGS_H

View File

@ -1,4 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
-*- linux-c -*-
drbd_receiver.c

View File

@ -1,4 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_worker.c
@ -176,7 +176,7 @@ void drbd_peer_request_endio(struct bio *bio)
bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
bio_op(bio) == REQ_OP_DISCARD;
if (bio->bi_status && __ratelimit(&drbd_ratelimit_state))
if (bio->bi_status && drbd_ratelimit())
drbd_warn(device, "%s: error=%d s=%llus\n",
is_write ? (is_discard ? "discard" : "write")
: "read", bio->bi_status,
@ -240,7 +240,7 @@ void drbd_request_endio(struct bio *bio)
* though we still will complain noisily about it.
*/
if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
if (!bio->bi_status)
@ -400,13 +400,13 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
goto defer;
peer_req->w.cb = w_e_send_csum;
peer_req->opf = REQ_OP_READ;
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->read_ee);
spin_unlock_irq(&device->resource->req_lock);
atomic_add(size >> 9, &device->rs_sect_ev);
if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ,
DRBD_FAULT_RS_RD) == 0)
if (drbd_submit_peer_request(peer_req) == 0)
return 0;
/* If it failed because of ENOMEM, retry should help. If it failed
@ -1062,7 +1062,7 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "Sending NegDReply. sector=%llus.\n",
(unsigned long long)peer_req->i.sector);
@ -1135,13 +1135,13 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
else
err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "Not sending RSDataReply, "
"partner DISKLESS!\n");
err = 0;
}
} else {
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
(unsigned long long)peer_req->i.sector);
@ -1212,7 +1212,7 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
}
} else {
err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
if (__ratelimit(&drbd_ratelimit_state))
if (drbd_ratelimit())
drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
}

View File

@ -4593,8 +4593,10 @@ static int __init do_floppy_init(void)
goto out_put_disk;
err = floppy_alloc_disk(drive, 0);
if (err)
if (err) {
blk_mq_free_tag_set(&tag_sets[drive]);
goto out_put_disk;
}
timer_setup(&motor_off_timer[drive], motor_off_callback, 0);
}

View File

@ -523,6 +523,24 @@ static ssize_t nullb_device_badblocks_store(struct config_item *item,
}
CONFIGFS_ATTR(nullb_device_, badblocks);
static ssize_t nullb_device_zone_readonly_store(struct config_item *item,
const char *page, size_t count)
{
struct nullb_device *dev = to_nullb_device(item);
return zone_cond_store(dev, page, count, BLK_ZONE_COND_READONLY);
}
CONFIGFS_ATTR_WO(nullb_device_, zone_readonly);
static ssize_t nullb_device_zone_offline_store(struct config_item *item,
const char *page, size_t count)
{
struct nullb_device *dev = to_nullb_device(item);
return zone_cond_store(dev, page, count, BLK_ZONE_COND_OFFLINE);
}
CONFIGFS_ATTR_WO(nullb_device_, zone_offline);
static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_size,
&nullb_device_attr_completion_nsec,
@ -549,6 +567,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_nr_conv,
&nullb_device_attr_zone_max_open,
&nullb_device_attr_zone_max_active,
&nullb_device_attr_zone_readonly,
&nullb_device_attr_zone_offline,
&nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched,
&nullb_device_attr_shared_tag_bitmap,
@ -614,7 +634,7 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page)
"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
"zone_capacity,zone_max_active,zone_max_open,"
"zone_nr_conv,zone_size\n");
"zone_nr_conv,zone_offline,zone_readonly,zone_size\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);

View File

@ -151,6 +151,8 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, sector_t nr_sectors);
size_t null_zone_valid_read_len(struct nullb *nullb,
sector_t sector, unsigned int len);
ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
size_t count, enum blk_zone_cond cond);
#else
static inline int null_init_zoned_dev(struct nullb_device *dev,
struct request_queue *q)
@ -174,6 +176,12 @@ static inline size_t null_zone_valid_read_len(struct nullb *nullb,
{
return len;
}
static inline ssize_t zone_cond_store(struct nullb_device *dev,
const char *page, size_t count,
enum blk_zone_cond cond)
{
return -EOPNOTSUPP;
}
#define null_report_zones NULL
#endif /* CONFIG_BLK_DEV_ZONED */
#endif /* __NULL_BLK_H */

View File

@ -384,8 +384,10 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
null_lock_zone(dev, zone);
if (zone->cond == BLK_ZONE_COND_FULL) {
/* Cannot write to a full zone */
if (zone->cond == BLK_ZONE_COND_FULL ||
zone->cond == BLK_ZONE_COND_READONLY ||
zone->cond == BLK_ZONE_COND_OFFLINE) {
/* Cannot write to the zone */
ret = BLK_STS_IOERR;
goto unlock;
}
@ -613,7 +615,9 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
zone = &dev->zones[i];
null_lock_zone(dev, zone);
if (zone->cond != BLK_ZONE_COND_EMPTY) {
if (zone->cond != BLK_ZONE_COND_EMPTY &&
zone->cond != BLK_ZONE_COND_READONLY &&
zone->cond != BLK_ZONE_COND_OFFLINE) {
null_reset_zone(dev, zone);
trace_nullb_zone_op(cmd, i, zone->cond);
}
@ -627,6 +631,12 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
null_lock_zone(dev, zone);
if (zone->cond == BLK_ZONE_COND_READONLY ||
zone->cond == BLK_ZONE_COND_OFFLINE) {
ret = BLK_STS_IOERR;
goto unlock;
}
switch (op) {
case REQ_OP_ZONE_RESET:
ret = null_reset_zone(dev, zone);
@ -648,6 +658,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
if (ret == BLK_STS_OK)
trace_nullb_zone_op(cmd, zone_no, zone->cond);
unlock:
null_unlock_zone(dev, zone);
return ret;
@ -674,6 +685,8 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
default:
dev = cmd->nq->dev;
zone = &dev->zones[null_zone_no(dev, sector)];
if (zone->cond == BLK_ZONE_COND_OFFLINE)
return BLK_STS_IOERR;
null_lock_zone(dev, zone);
sts = null_process_cmd(cmd, op, sector, nr_sectors);
@ -681,3 +694,79 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
return sts;
}
}
/*
* Set a zone in the read-only or offline condition.
*/
static void null_set_zone_cond(struct nullb_device *dev,
struct nullb_zone *zone, enum blk_zone_cond cond)
{
if (WARN_ON_ONCE(cond != BLK_ZONE_COND_READONLY &&
cond != BLK_ZONE_COND_OFFLINE))
return;
null_lock_zone(dev, zone);
/*
* If the read-only condition is requested again to zones already in
* read-only condition, restore back normal empty condition. Do the same
* if the offline condition is requested for offline zones. Otherwise,
* set the specified zone condition to the zones. Finish the zones
* beforehand to free up zone resources.
*/
if (zone->cond == cond) {
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
if (dev->memory_backed)
null_handle_discard(dev, zone->start, zone->len);
} else {
if (zone->cond != BLK_ZONE_COND_READONLY &&
zone->cond != BLK_ZONE_COND_OFFLINE)
null_finish_zone(dev, zone);
zone->cond = cond;
zone->wp = (sector_t)-1;
}
null_unlock_zone(dev, zone);
}
/*
* Identify a zone from the sector written to configfs file. Then set zone
* condition to the zone.
*/
ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
size_t count, enum blk_zone_cond cond)
{
unsigned long long sector;
unsigned int zone_no;
int ret;
if (!dev->zoned) {
pr_err("null_blk device is not zoned\n");
return -EINVAL;
}
if (!dev->zones) {
pr_err("null_blk device is not yet powered\n");
return -EINVAL;
}
ret = kstrtoull(page, 0, &sector);
if (ret < 0)
return ret;
zone_no = null_zone_no(dev, sector);
if (zone_no >= dev->nr_zones) {
pr_err("Sector out of range\n");
return -EINVAL;
}
if (dev->zones[zone_no].type == BLK_ZONE_TYPE_CONVENTIONAL) {
pr_err("Can not change condition of conventional zones\n");
return -EINVAL;
}
null_set_zone_cond(dev, &dev->zones[zone_no], cond);
return count;
}

File diff suppressed because it is too large Load Diff

View File

@ -512,7 +512,7 @@ static void virtblk_free_disk(struct gendisk *disk)
{
struct virtio_blk *vblk = disk->private_data;
ida_simple_remove(&vd_index_ida, vblk->index);
ida_free(&vd_index_ida, vblk->index);
mutex_destroy(&vblk->vdev_mutex);
kfree(vblk);
}
@ -902,8 +902,8 @@ static int virtblk_probe(struct virtio_device *vdev)
return -EINVAL;
}
err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
GFP_KERNEL);
err = ida_alloc_range(&vd_index_ida, 0,
minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
if (err < 0)
goto out;
index = err;
@ -1163,7 +1163,7 @@ static int virtblk_probe(struct virtio_device *vdev)
out_free_vblk:
kfree(vblk);
out_free_index:
ida_simple_remove(&vd_index_ida, index);
ida_free(&vd_index_ida, index);
out:
return err;
}

View File

@ -2129,7 +2129,6 @@ static void blkfront_closing(struct blkfront_info *info)
if (info->rq && info->gd) {
blk_mq_stop_hw_queues(info->rq);
blk_mark_disk_dead(info->gd);
set_capacity(info->gd, 0);
}
for_each_rinfo(info, rinfo, i) {

View File

@ -160,7 +160,7 @@ static void read_moving(struct cache_set *c)
moving_init(io);
bio = &io->bio.bio;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bio->bi_opf = REQ_OP_READ;
bio->bi_end_io = read_moving_endio;
if (bch_bio_alloc_pages(bio, GFP_KERNEL))

View File

@ -244,7 +244,7 @@ static void bch_data_insert_start(struct closure *cl)
trace_bcache_cache_insert(k);
bch_keylist_push(&op->insert_keys);
bio_set_op_attrs(n, REQ_OP_WRITE, 0);
n->bi_opf = REQ_OP_WRITE;
bch_submit_bbio(n, op->c, k, 0);
} while (n != bio);

View File

@ -434,7 +434,7 @@ static void write_dirty(struct closure *cl)
*/
if (KEY_DIRTY(&w->key)) {
dirty_init(w);
bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
io->bio.bi_opf = REQ_OP_WRITE;
io->bio.bi_iter.bi_sector = KEY_START(&w->key);
bio_set_dev(&io->bio, io->dc->bdev);
io->bio.bi_end_io = dirty_endio;
@ -547,7 +547,7 @@ static void read_dirty(struct cached_dev *dc)
io->sequence = sequence++;
dirty_init(w);
bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
io->bio.bi_opf = REQ_OP_READ;
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
io->bio.bi_end_io = read_dirty_endio;

View File

@ -1215,7 +1215,7 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
struct dm_keyslot_evict_args *args = data;
int err;
err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key);
err = blk_crypto_evict_key(dev->bdev, args->key);
if (!args->err)
args->err = err;
/* Always try to evict the key from all devices. */

View File

@ -410,7 +410,7 @@ static void end_discard(struct discard_op *op, int r)
* need to wait for the chain to complete.
*/
bio_chain(op->bio, op->parent_bio);
bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
op->bio->bi_opf = REQ_OP_DISCARD;
submit_bio(op->bio);
}

View File

@ -732,28 +732,48 @@ static char *_dm_claim_ptr = "I belong to device-mapper";
/*
* Open a table device so we can use it as a map destination.
*/
static int open_table_device(struct table_device *td, dev_t dev,
struct mapped_device *md)
static struct table_device *open_table_device(struct mapped_device *md,
dev_t dev, fmode_t mode)
{
struct table_device *td;
struct block_device *bdev;
u64 part_off;
int r;
BUG_ON(td->dm_dev.bdev);
td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
if (!td)
return ERR_PTR(-ENOMEM);
refcount_set(&td->count, 1);
bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
r = bd_link_disk_holder(bdev, dm_disk(md));
if (r) {
blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
return r;
bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr);
if (IS_ERR(bdev)) {
r = PTR_ERR(bdev);
goto out_free_td;
}
/*
* We can be called before the dm disk is added. In that case we can't
* register the holder relation here. It will be done once add_disk was
* called.
*/
if (md->disk->slave_dir) {
r = bd_link_disk_holder(bdev, md->disk);
if (r)
goto out_blkdev_put;
}
td->dm_dev.mode = mode;
td->dm_dev.bdev = bdev;
td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
return 0;
format_dev_t(td->dm_dev.name, dev);
list_add(&td->list, &md->table_devices);
return td;
out_blkdev_put:
blkdev_put(bdev, mode | FMODE_EXCL);
out_free_td:
kfree(td);
return ERR_PTR(r);
}
/*
@ -761,14 +781,12 @@ static int open_table_device(struct table_device *td, dev_t dev,
*/
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
if (!td->dm_dev.bdev)
return;
bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
if (md->disk->slave_dir)
bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
put_dax(td->dm_dev.dax_dev);
td->dm_dev.bdev = NULL;
td->dm_dev.dax_dev = NULL;
list_del(&td->list);
kfree(td);
}
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
@ -786,31 +804,16 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev,
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
struct dm_dev **result)
{
int r;
struct table_device *td;
mutex_lock(&md->table_devices_lock);
td = find_table_device(&md->table_devices, dev, mode);
if (!td) {
td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
if (!td) {
td = open_table_device(md, dev, mode);
if (IS_ERR(td)) {
mutex_unlock(&md->table_devices_lock);
return -ENOMEM;
return PTR_ERR(td);
}
td->dm_dev.mode = mode;
td->dm_dev.bdev = NULL;
if ((r = open_table_device(td, dev, md))) {
mutex_unlock(&md->table_devices_lock);
kfree(td);
return r;
}
format_dev_t(td->dm_dev.name, dev);
refcount_set(&td->count, 1);
list_add(&td->list, &md->table_devices);
} else {
refcount_inc(&td->count);
}
@ -825,27 +828,11 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
struct table_device *td = container_of(d, struct table_device, dm_dev);
mutex_lock(&md->table_devices_lock);
if (refcount_dec_and_test(&td->count)) {
if (refcount_dec_and_test(&td->count))
close_table_device(td, md);
list_del(&td->list);
kfree(td);
}
mutex_unlock(&md->table_devices_lock);
}
static void free_table_devices(struct list_head *devices)
{
struct list_head *tmp, *next;
list_for_each_safe(tmp, next, devices) {
struct table_device *td = list_entry(tmp, struct table_device, list);
DMWARN("dm_destroy: %s still exists with %d references",
td->dm_dev.name, refcount_read(&td->count));
kfree(td);
}
}
/*
* Get the geometry associated with a dm device
*/
@ -1972,8 +1959,21 @@ static void cleanup_mapped_device(struct mapped_device *md)
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
if (dm_get_md_type(md) != DM_TYPE_NONE) {
struct table_device *td;
dm_sysfs_exit(md);
list_for_each_entry(td, &md->table_devices, list) {
bd_unlink_disk_holder(td->dm_dev.bdev,
md->disk);
}
/*
* Hold lock to make sure del_gendisk() won't concurrent
* with open/close_table_device().
*/
mutex_lock(&md->table_devices_lock);
del_gendisk(md->disk);
mutex_unlock(&md->table_devices_lock);
}
dm_queue_destroy_crypto_profile(md->queue);
put_disk(md->disk);
@ -2122,7 +2122,7 @@ static void free_dev(struct mapped_device *md)
cleanup_mapped_device(md);
free_table_devices(&md->table_devices);
WARN_ON_ONCE(!list_empty(&md->table_devices));
dm_stats_cleanup(&md->stats);
free_minor(minor);
@ -2305,6 +2305,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
enum dm_queue_mode type = dm_table_get_type(t);
struct queue_limits limits;
struct table_device *td;
int r;
switch (type) {
@ -2333,17 +2334,40 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
if (r)
return r;
/*
* Hold lock to make sure add_disk() and del_gendisk() won't concurrent
* with open_table_device() and close_table_device().
*/
mutex_lock(&md->table_devices_lock);
r = add_disk(md->disk);
mutex_unlock(&md->table_devices_lock);
if (r)
return r;
r = dm_sysfs_init(md);
if (r) {
del_gendisk(md->disk);
return r;
/*
* Register the holder relationship for devices added before the disk
* was live.
*/
list_for_each_entry(td, &md->table_devices, list) {
r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
if (r)
goto out_undo_holders;
}
r = dm_sysfs_init(md);
if (r)
goto out_undo_holders;
md->type = type;
return 0;
out_undo_holders:
list_for_each_entry_continue_reverse(td, &md->table_devices, list)
bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
mutex_lock(&md->table_devices_lock);
del_gendisk(md->disk);
mutex_unlock(&md->table_devices_lock);
return r;
}
struct mapped_device *dm_get_md(dev_t dev)

View File

@ -486,7 +486,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
sb = kmap_atomic(bitmap->storage.sb_page);
pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
pr_debug(" version: %d\n", le32_to_cpu(sb->version));
pr_debug(" version: %u\n", le32_to_cpu(sb->version));
pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
le32_to_cpu(*(__le32 *)(sb->uuid+0)),
le32_to_cpu(*(__le32 *)(sb->uuid+4)),
@ -497,11 +497,11 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
pr_debug("events cleared: %llu\n",
(unsigned long long) le64_to_cpu(sb->events_cleared));
pr_debug(" state: %08x\n", le32_to_cpu(sb->state));
pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize));
pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize));
pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep));
pr_debug(" sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
kunmap_atomic(sb);
}
@ -2105,7 +2105,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bytes = DIV_ROUND_UP(chunks, 8);
if (!bitmap->mddev->bitmap_info.external)
bytes += sizeof(bitmap_super_t);
} while (bytes > (space << 9));
} while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) <
(BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1));
} else
chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
@ -2150,7 +2151,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bitmap->counts.missing_pages = pages;
bitmap->counts.chunkshift = chunkshift;
bitmap->counts.chunks = chunks;
bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift +
BITMAP_BLOCK_SHIFT);
blocks = min(old_counts.chunks << old_counts.chunkshift,
@ -2176,8 +2177,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bitmap->counts.missing_pages = old_counts.pages;
bitmap->counts.chunkshift = old_counts.chunkshift;
bitmap->counts.chunks = old_counts.chunks;
bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
BITMAP_BLOCK_SHIFT);
bitmap->mddev->bitmap_info.chunksize =
1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT);
blocks = old_counts.chunks << old_counts.chunkshift;
pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
break;
@ -2195,20 +2196,23 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
if (set) {
bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
if (*bmc_new == 0) {
/* need to set on-disk bits too. */
sector_t end = block + new_blocks;
sector_t start = block >> chunkshift;
start <<= chunkshift;
while (start < end) {
md_bitmap_file_set_bit(bitmap, block);
start += 1 << chunkshift;
if (bmc_new) {
if (*bmc_new == 0) {
/* need to set on-disk bits too. */
sector_t end = block + new_blocks;
sector_t start = block >> chunkshift;
start <<= chunkshift;
while (start < end) {
md_bitmap_file_set_bit(bitmap, block);
start += 1 << chunkshift;
}
*bmc_new = 2;
md_bitmap_count_page(&bitmap->counts, block, 1);
md_bitmap_set_pending(&bitmap->counts, block);
}
*bmc_new = 2;
md_bitmap_count_page(&bitmap->counts, block, 1);
md_bitmap_set_pending(&bitmap->counts, block);
*bmc_new |= NEEDED_MASK;
}
*bmc_new |= NEEDED_MASK;
if (new_blocks < old_blocks)
old_blocks = new_blocks;
}
@ -2534,6 +2538,9 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
if (csize < 512 ||
!is_power_of_2(csize))
return -EINVAL;
if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE *
sizeof(((bitmap_super_t *)0)->chunksize))))
return -EOVERFLOW;
mddev->bitmap_info.chunksize = csize;
return len;
}

View File

@ -93,6 +93,18 @@ static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
enum md_ro_state {
MD_RDWR,
MD_RDONLY,
MD_AUTO_READ,
MD_MAX_STATE
};
static bool md_is_rdwr(struct mddev *mddev)
{
return (mddev->ro == MD_RDWR);
}
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@ -444,7 +456,7 @@ static void md_submit_bio(struct bio *bio)
bio = bio_split_to_limits(bio);
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
@ -509,13 +521,14 @@ static void md_end_flush(struct bio *bio)
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
bio_put(bio);
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) {
/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
}
bio_put(bio);
}
static void md_submit_flush_data(struct work_struct *ws);
@ -913,10 +926,12 @@ static void super_written(struct bio *bio)
} else
clear_bit(LastDev, &rdev->flags);
bio_put(bio);
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->pending_writes))
wake_up(&mddev->sb_wait);
rdev_dec_pending(rdev, mddev);
bio_put(bio);
}
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
@ -2453,7 +2468,22 @@ static void rdev_delayed_delete(struct work_struct *ws)
kobject_put(&rdev->kobj);
}
static void unbind_rdev_from_array(struct md_rdev *rdev)
void md_autodetect_dev(dev_t dev);
static void export_rdev(struct md_rdev *rdev)
{
pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
md_rdev_clear(rdev);
#ifndef MODULE
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
#endif
blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
rdev->bdev = NULL;
kobject_put(&rdev->kobj);
}
static void md_kick_rdev_from_array(struct md_rdev *rdev)
{
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
@ -2476,56 +2506,8 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
INIT_WORK(&rdev->del_work, rdev_delayed_delete);
kobject_get(&rdev->kobj);
queue_work(md_rdev_misc_wq, &rdev->del_work);
}
/*
* prevent the device from being mounted, repartitioned or
* otherwise reused by a RAID array (or any other kernel
* subsystem), by bd_claiming the device.
*/
static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
{
int err = 0;
struct block_device *bdev;
bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
shared ? (struct md_rdev *)lock_rdev : rdev);
if (IS_ERR(bdev)) {
pr_warn("md: could not open device unknown-block(%u,%u).\n",
MAJOR(dev), MINOR(dev));
return PTR_ERR(bdev);
}
rdev->bdev = bdev;
return err;
}
static void unlock_rdev(struct md_rdev *rdev)
{
struct block_device *bdev = rdev->bdev;
rdev->bdev = NULL;
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
void md_autodetect_dev(dev_t dev);
static void export_rdev(struct md_rdev *rdev)
{
pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
md_rdev_clear(rdev);
#ifndef MODULE
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
#endif
unlock_rdev(rdev);
kobject_put(&rdev->kobj);
}
void md_kick_rdev_from_array(struct md_rdev *rdev)
{
unbind_rdev_from_array(rdev);
export_rdev(rdev);
}
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
static void export_array(struct mddev *mddev)
{
@ -2639,7 +2621,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
int any_badblocks_changed = 0;
int ret = -1;
if (mddev->ro) {
if (!md_is_rdwr(mddev)) {
if (force_change)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
return;
@ -3660,9 +3642,10 @@ EXPORT_SYMBOL_GPL(md_rdev_init);
*/
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
int err;
static struct md_rdev *claim_rdev; /* just for claiming the bdev */
struct md_rdev *rdev;
sector_t size;
int err;
rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
if (!rdev)
@ -3670,14 +3653,20 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
err = md_rdev_init(rdev);
if (err)
goto abort_free;
goto out_free_rdev;
err = alloc_disk_sb(rdev);
if (err)
goto abort_free;
goto out_clear_rdev;
err = lock_rdev(rdev, newdev, super_format == -2);
if (err)
goto abort_free;
rdev->bdev = blkdev_get_by_dev(newdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL,
super_format == -2 ? claim_rdev : rdev);
if (IS_ERR(rdev->bdev)) {
pr_warn("md: could not open device unknown-block(%u,%u).\n",
MAJOR(newdev), MINOR(newdev));
err = PTR_ERR(rdev->bdev);
goto out_clear_rdev;
}
kobject_init(&rdev->kobj, &rdev_ktype);
@ -3686,7 +3675,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
rdev->bdev);
err = -EINVAL;
goto abort_free;
goto out_blkdev_put;
}
if (super_format >= 0) {
@ -3696,21 +3685,22 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
rdev->bdev,
super_format, super_minor);
goto abort_free;
goto out_blkdev_put;
}
if (err < 0) {
pr_warn("md: could not read %pg's sb, not importing!\n",
rdev->bdev);
goto abort_free;
goto out_blkdev_put;
}
}
return rdev;
abort_free:
if (rdev->bdev)
unlock_rdev(rdev);
out_blkdev_put:
blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
out_clear_rdev:
md_rdev_clear(rdev);
out_free_rdev:
kfree(rdev);
return ERR_PTR(err);
}
@ -3901,7 +3891,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
goto out_unlock;
}
rv = -EROFS;
if (mddev->ro)
if (!md_is_rdwr(mddev))
goto out_unlock;
/* request to change the personality. Need to ensure:
@ -4107,7 +4097,7 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers) {
if (mddev->pers->check_reshape == NULL)
err = -EBUSY;
else if (mddev->ro)
else if (!md_is_rdwr(mddev))
err = -EROFS;
else {
mddev->new_layout = n;
@ -4216,7 +4206,7 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers) {
if (mddev->pers->check_reshape == NULL)
err = -EBUSY;
else if (mddev->ro)
else if (!md_is_rdwr(mddev))
err = -EROFS;
else {
mddev->new_chunk_sectors = n >> 9;
@ -4339,13 +4329,13 @@ array_state_show(struct mddev *mddev, char *page)
if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
switch(mddev->ro) {
case 1:
case MD_RDONLY:
st = readonly;
break;
case 2:
case MD_AUTO_READ:
st = read_auto;
break;
case 0:
case MD_RDWR:
spin_lock(&mddev->lock);
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
st = write_pending;
@ -4381,7 +4371,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
int err = 0;
enum array_state st = match_word(buf, array_states);
if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
if (mddev->pers && (st == active || st == clean) &&
mddev->ro != MD_RDONLY) {
/* don't take reconfig_mutex when toggling between
* clean and active
*/
@ -4425,23 +4416,23 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers)
err = md_set_readonly(mddev, NULL);
else {
mddev->ro = 1;
mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
err = do_md_run(mddev);
}
break;
case read_auto:
if (mddev->pers) {
if (mddev->ro == 0)
if (md_is_rdwr(mddev))
err = md_set_readonly(mddev, NULL);
else if (mddev->ro == 1)
else if (mddev->ro == MD_RDONLY)
err = restart_array(mddev);
if (err == 0) {
mddev->ro = 2;
mddev->ro = MD_AUTO_READ;
set_disk_ro(mddev->gendisk, 0);
}
} else {
mddev->ro = 2;
mddev->ro = MD_AUTO_READ;
err = do_md_run(mddev);
}
break;
@ -4466,7 +4457,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
wake_up(&mddev->sb_wait);
err = 0;
} else {
mddev->ro = 0;
mddev->ro = MD_RDWR;
set_disk_ro(mddev->gendisk, 0);
err = do_md_run(mddev);
}
@ -4765,7 +4756,7 @@ action_show(struct mddev *mddev, char *page)
if (test_bit(MD_RECOVERY_FROZEN, &recovery))
type = "frozen";
else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
(!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
(md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
type = "reshape";
else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
@ -4851,11 +4842,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
if (mddev->ro == 2) {
if (mddev->ro == MD_AUTO_READ) {
/* A write to sync_action is enough to justify
* canceling read-auto mode
*/
mddev->ro = 0;
mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread);
}
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@ -5083,8 +5074,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len)
goto out_unlock;
err = -EBUSY;
if (max < mddev->resync_max &&
mddev->ro == 0 &&
if (max < mddev->resync_max && md_is_rdwr(mddev) &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
goto out_unlock;
@ -5813,8 +5803,8 @@ int md_run(struct mddev *mddev)
continue;
sync_blockdev(rdev->bdev);
invalidate_bdev(rdev->bdev);
if (mddev->ro != 1 && rdev_read_only(rdev)) {
mddev->ro = 1;
if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
mddev->ro = MD_RDONLY;
if (mddev->gendisk)
set_disk_ro(mddev->gendisk, 1);
}
@ -5917,8 +5907,8 @@ int md_run(struct mddev *mddev)
mddev->ok_start_degraded = start_dirty_degraded;
if (start_readonly && mddev->ro == 0)
mddev->ro = 2; /* read-only, but switch on first write */
if (start_readonly && md_is_rdwr(mddev))
mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
err = pers->run(mddev);
if (err)
@ -5996,8 +5986,8 @@ int md_run(struct mddev *mddev)
mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
} else if (mddev->ro == 2) /* auto-readonly not meaningful */
mddev->ro = 0;
} else if (mddev->ro == MD_AUTO_READ)
mddev->ro = MD_RDWR;
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
@ -6015,7 +6005,7 @@ int md_run(struct mddev *mddev)
if (rdev->raid_disk >= 0)
sysfs_link_rdev(mddev, rdev); /* failure here is OK */
if (mddev->degraded && !mddev->ro)
if (mddev->degraded && md_is_rdwr(mddev))
/* This ensures that recovering status is reported immediately
* via sysfs - until a lack of spares is confirmed.
*/
@ -6105,7 +6095,7 @@ static int restart_array(struct mddev *mddev)
return -ENXIO;
if (!mddev->pers)
return -EINVAL;
if (!mddev->ro)
if (md_is_rdwr(mddev))
return -EBUSY;
rcu_read_lock();
@ -6124,7 +6114,7 @@ static int restart_array(struct mddev *mddev)
return -EROFS;
mddev->safemode = 0;
mddev->ro = 0;
mddev->ro = MD_RDWR;
set_disk_ro(disk, 0);
pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
/* Kick recovery or resync if necessary */
@ -6151,7 +6141,7 @@ static void md_clean(struct mddev *mddev)
mddev->clevel[0] = 0;
mddev->flags = 0;
mddev->sb_flags = 0;
mddev->ro = 0;
mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
mddev->chunk_sectors = 0;
mddev->ctime = mddev->utime = 0;
@ -6203,7 +6193,7 @@ static void __md_stop_writes(struct mddev *mddev)
}
md_bitmap_flush(mddev);
if (mddev->ro == 0 &&
if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
mddev->sb_flags)) {
/* mark array as shutdown cleanly */
@ -6312,9 +6302,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
__md_stop_writes(mddev);
err = -ENXIO;
if (mddev->ro==1)
if (mddev->ro == MD_RDONLY)
goto out;
mddev->ro = 1;
mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@ -6371,7 +6361,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
return -EBUSY;
}
if (mddev->pers) {
if (mddev->ro)
if (!md_is_rdwr(mddev))
set_disk_ro(disk, 0);
__md_stop_writes(mddev);
@ -6388,8 +6378,8 @@ static int do_md_stop(struct mddev *mddev, int mode,
mutex_unlock(&mddev->open_mutex);
mddev->changed = 1;
if (mddev->ro)
mddev->ro = 0;
if (!md_is_rdwr(mddev))
mddev->ro = MD_RDWR;
} else
mutex_unlock(&mddev->open_mutex);
/*
@ -7204,7 +7194,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->sync_thread)
return -EBUSY;
if (mddev->ro)
if (!md_is_rdwr(mddev))
return -EROFS;
rdev_for_each(rdev, mddev) {
@ -7234,7 +7224,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
/* change the number of raid disks */
if (mddev->pers->check_reshape == NULL)
return -EINVAL;
if (mddev->ro)
if (!md_is_rdwr(mddev))
return -EROFS;
if (raid_disks <= 0 ||
(mddev->max_disks && raid_disks >= mddev->max_disks))
@ -7464,6 +7454,40 @@ static inline bool md_ioctl_valid(unsigned int cmd)
}
}
static int __md_set_array_info(struct mddev *mddev, void __user *argp)
{
mdu_array_info_t info;
int err;
if (!argp)
memset(&info, 0, sizeof(info));
else if (copy_from_user(&info, argp, sizeof(info)))
return -EFAULT;
if (mddev->pers) {
err = update_array_info(mddev, &info);
if (err)
pr_warn("md: couldn't update array info. %d\n", err);
return err;
}
if (!list_empty(&mddev->disks)) {
pr_warn("md: array %s already has disks!\n", mdname(mddev));
return -EBUSY;
}
if (mddev->raid_disks) {
pr_warn("md: array %s already initialised!\n", mdname(mddev));
return -EBUSY;
}
err = md_set_array_info(mddev, &info);
if (err)
pr_warn("md: couldn't set array info. %d\n", err);
return err;
}
static int md_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
@ -7569,36 +7593,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
}
if (cmd == SET_ARRAY_INFO) {
mdu_array_info_t info;
if (!arg)
memset(&info, 0, sizeof(info));
else if (copy_from_user(&info, argp, sizeof(info))) {
err = -EFAULT;
goto unlock;
}
if (mddev->pers) {
err = update_array_info(mddev, &info);
if (err) {
pr_warn("md: couldn't update array info. %d\n", err);
goto unlock;
}
goto unlock;
}
if (!list_empty(&mddev->disks)) {
pr_warn("md: array %s already has disks!\n", mdname(mddev));
err = -EBUSY;
goto unlock;
}
if (mddev->raid_disks) {
pr_warn("md: array %s already initialised!\n", mdname(mddev));
err = -EBUSY;
goto unlock;
}
err = md_set_array_info(mddev, &info);
if (err) {
pr_warn("md: couldn't set array info. %d\n", err);
goto unlock;
}
err = __md_set_array_info(mddev, argp);
goto unlock;
}
@ -7658,26 +7653,25 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
* The remaining ioctls are changing the state of the
* superblock, so we do not allow them on read-only arrays.
*/
if (mddev->ro && mddev->pers) {
if (mddev->ro == 2) {
mddev->ro = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
/* mddev_unlock will wake thread */
/* If a device failed while we were read-only, we
* need to make sure the metadata is updated now.
*/
if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
mddev_unlock(mddev);
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev);
}
} else {
if (!md_is_rdwr(mddev) && mddev->pers) {
if (mddev->ro != MD_AUTO_READ) {
err = -EROFS;
goto unlock;
}
mddev->ro = MD_RDWR;
sysfs_notify_dirent_safe(mddev->sysfs_state);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
/* mddev_unlock will wake thread */
/* If a device failed while we were read-only, we
* need to make sure the metadata is updated now.
*/
if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
mddev_unlock(mddev);
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev);
}
}
switch (cmd) {
@ -7763,11 +7757,11 @@ static int md_set_read_only(struct block_device *bdev, bool ro)
* Transitioning to read-auto need only happen for arrays that call
* md_write_start and which are not ready for writes yet.
*/
if (!ro && mddev->ro == 1 && mddev->pers) {
if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
err = restart_array(mddev);
if (err)
goto out_unlock;
mddev->ro = 2;
mddev->ro = MD_AUTO_READ;
}
out_unlock:
@ -8241,9 +8235,9 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%s : %sactive", mdname(mddev),
mddev->pers ? "" : "in");
if (mddev->pers) {
if (mddev->ro==1)
if (mddev->ro == MD_RDONLY)
seq_printf(seq, " (read-only)");
if (mddev->ro==2)
if (mddev->ro == MD_AUTO_READ)
seq_printf(seq, " (auto-read-only)");
seq_printf(seq, " %s", mddev->pers->name);
}
@ -8502,10 +8496,10 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
if (bio_data_dir(bi) != WRITE)
return true;
BUG_ON(mddev->ro == 1);
if (mddev->ro == 2) {
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
mddev->ro = 0;
mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread);
@ -8556,7 +8550,7 @@ void md_write_inc(struct mddev *mddev, struct bio *bi)
{
if (bio_data_dir(bi) != WRITE)
return;
WARN_ON_ONCE(mddev->in_sync || mddev->ro);
WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
percpu_ref_get(&mddev->writes_pending);
}
EXPORT_SYMBOL(md_write_inc);
@ -8661,7 +8655,7 @@ void md_allow_write(struct mddev *mddev)
{
if (!mddev->pers)
return;
if (mddev->ro)
if (!md_is_rdwr(mddev))
return;
if (!mddev->pers->sync_request)
return;
@ -8709,7 +8703,7 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
return;
if (mddev->ro) {/* never try to sync a read-only array */
if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return;
}
@ -9178,9 +9172,9 @@ static int remove_and_add_spares(struct mddev *mddev,
if (test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(Journal, &rdev->flags)) {
if (mddev->ro &&
! (rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
if (!md_is_rdwr(mddev) &&
!(rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
continue;
rdev->recovery_offset = 0;
@ -9278,7 +9272,8 @@ void md_check_recovery(struct mddev *mddev)
flush_signals(current);
}
if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
if (!md_is_rdwr(mddev) &&
!test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return;
if ( ! (
(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
@ -9297,7 +9292,7 @@ void md_check_recovery(struct mddev *mddev)
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
if (mddev->ro) {
if (!md_is_rdwr(mddev)) {
struct md_rdev *rdev;
if (!mddev->external && mddev->in_sync)
/* 'Blocked' flag not needed as failed devices

View File

@ -782,7 +782,6 @@ extern void mddev_resume(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend);
extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,

View File

@ -398,7 +398,6 @@ static int raid0_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,

View File

@ -1321,7 +1321,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r1_bio->sector +
mirror->rdev->data_offset;
read_bio->bi_end_io = raid1_end_read_request;
bio_set_op_attrs(read_bio, op, do_sync);
read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &mirror->rdev->flags) &&
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@ -2254,7 +2254,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
continue;
}
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
wbio->bi_opf = REQ_OP_WRITE;
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
wbio->bi_opf |= MD_FAILFAST;
@ -2419,7 +2419,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
GFP_NOIO, &mddev->bio_set);
}
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
wbio->bi_opf = REQ_OP_WRITE;
wbio->bi_iter.bi_sector = r1_bio->sector;
wbio->bi_iter.bi_size = r1_bio->sectors << 9;
@ -2770,7 +2770,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (i < conf->raid_disks)
still_degraded = 1;
} else if (!test_bit(In_sync, &rdev->flags)) {
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_opf = REQ_OP_WRITE;
bio->bi_end_io = end_sync_write;
write_targets ++;
} else {
@ -2797,7 +2797,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (disk < 0)
disk = i;
}
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bio->bi_opf = REQ_OP_READ;
bio->bi_end_io = end_sync_read;
read_targets++;
} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
@ -2809,7 +2809,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* if we are doing resync or repair. Otherwise, leave
* this device alone for this sync request.
*/
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_opf = REQ_OP_WRITE;
bio->bi_end_io = end_sync_write;
write_targets++;
}
@ -3159,6 +3159,7 @@ static int raid1_run(struct mddev *mddev)
* RAID1 needs at least one disk in active
*/
if (conf->raid_disks - mddev->degraded < 1) {
md_unregister_thread(&conf->thread);
ret = -EINVAL;
goto abort;
}

View File

@ -1254,7 +1254,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
read_bio->bi_end_io = raid10_end_read_request;
bio_set_op_attrs(read_bio, op, do_sync);
read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &rdev->flags) &&
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@ -1301,7 +1301,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev));
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua);
mbio->bi_opf = op | do_sync | do_fua;
if (!replacement && test_bit(FailFast,
&conf->mirrors[devnum].rdev->flags)
&& enough(conf, devnum))
@ -2933,7 +2933,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
wbio->bi_iter.bi_sector = wsector +
choose_data_offset(r10_bio, rdev);
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
wbio->bi_opf = REQ_OP_WRITE;
if (submit_bio_wait(wbio) < 0)
/* Failure! */
@ -3542,7 +3542,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bio->bi_opf = REQ_OP_READ;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
from_addr = r10_bio->devs[j].addr;
@ -3567,7 +3567,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = to_addr
+ mrdev->data_offset;
bio_set_dev(bio, mrdev->bdev);
@ -3588,7 +3588,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = to_addr +
mreplace->data_offset;
bio_set_dev(bio, mreplace->bdev);
@ -3742,7 +3742,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bio->bi_opf = REQ_OP_READ;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
@ -3764,7 +3764,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_opf = REQ_OP_WRITE;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
@ -4145,8 +4145,6 @@ static int raid10_run(struct mddev *mddev)
conf->thread = NULL;
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
UINT_MAX);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
raid10_set_io_opt(conf);
@ -4972,7 +4970,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
rdev2->new_data_offset;
b->bi_end_io = end_reshape_write;
bio_set_op_attrs(b, REQ_OP_WRITE, 0);
b->bi_opf = REQ_OP_WRITE;
b->bi_next = blist;
blist = b;
}

View File

@ -1565,11 +1565,12 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
if (!log)
return;
target = READ_ONCE(log->reclaim_target);
do {
target = log->reclaim_target;
if (new < target)
return;
} while (cmpxchg(&log->reclaim_target, target, new) != target);
} while (!try_cmpxchg(&log->reclaim_target, &target, new));
md_wakeup_thread(log->reclaim_thread);
}
@ -3061,7 +3062,6 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct request_queue *q = bdev_get_queue(rdev->bdev);
struct r5l_log *log;
int ret;
@ -3090,9 +3090,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
if (!log)
return -ENOMEM;
log->rdev = rdev;
log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
log->need_cache_flush = bdev_write_cache(rdev->bdev);
log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
sizeof(rdev->mddev->uuid));

View File

@ -1301,8 +1301,6 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
{
struct request_queue *q;
if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
PPL_HEADER_SIZE) * 2) {
log->use_multippl = true;
@ -1316,8 +1314,7 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
}
log->next_io_sector = rdev->ppl.sector;
q = bdev_get_queue(rdev->bdev);
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
if (bdev_write_cache(rdev->bdev))
log->wb_cache_on = true;
}

View File

@ -763,7 +763,7 @@ static blk_status_t apple_nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto out_free_cmd;
}
blk_mq_start_request(req);
nvme_start_request(req);
apple_nvme_submit_cmd(q, cmnd);
return BLK_STS_OK;
@ -821,7 +821,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
if (!dead && shutdown && freeze)
nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT);
nvme_stop_queues(&anv->ctrl);
nvme_quiesce_io_queues(&anv->ctrl);
if (!dead) {
if (READ_ONCE(anv->ioq.enabled)) {
@ -829,15 +829,13 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
apple_nvme_remove_cq(anv);
}
if (shutdown)
nvme_shutdown_ctrl(&anv->ctrl);
nvme_disable_ctrl(&anv->ctrl);
nvme_disable_ctrl(&anv->ctrl, shutdown);
}
WRITE_ONCE(anv->ioq.enabled, false);
WRITE_ONCE(anv->adminq.enabled, false);
mb(); /* ensure that nvme_queue_rq() sees that enabled is cleared */
nvme_stop_admin_queue(&anv->ctrl);
nvme_quiesce_admin_queue(&anv->ctrl);
/* last chance to complete any requests before nvme_cancel_request */
spin_lock_irqsave(&anv->lock, flags);
@ -854,8 +852,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
* deadlocking blk-mq hot-cpu notifier.
*/
if (shutdown) {
nvme_start_queues(&anv->ctrl);
nvme_start_admin_queue(&anv->ctrl);
nvme_unquiesce_io_queues(&anv->ctrl);
nvme_unquiesce_admin_queue(&anv->ctrl);
}
}
@ -1093,7 +1091,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
dev_dbg(anv->dev, "Starting admin queue");
apple_nvme_init_queue(&anv->adminq);
nvme_start_admin_queue(&anv->ctrl);
nvme_unquiesce_admin_queue(&anv->ctrl);
if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(anv->ctrl.device,
@ -1102,7 +1100,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
goto out;
}
ret = nvme_init_ctrl_finish(&anv->ctrl);
ret = nvme_init_ctrl_finish(&anv->ctrl, false);
if (ret)
goto out;
@ -1127,7 +1125,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
anv->ctrl.queue_count = nr_io_queues + 1;
nvme_start_queues(&anv->ctrl);
nvme_unquiesce_io_queues(&anv->ctrl);
nvme_wait_freeze(&anv->ctrl);
blk_mq_update_nr_hw_queues(&anv->tagset, 1);
nvme_unfreeze(&anv->ctrl);
@ -1153,7 +1151,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_DELETING);
nvme_get_ctrl(&anv->ctrl);
apple_nvme_disable(anv, false);
nvme_kill_queues(&anv->ctrl);
nvme_mark_namespaces_dead(&anv->ctrl);
if (!queue_work(nvme_wq, &anv->remove_work))
nvme_put_ctrl(&anv->ctrl);
}
@ -1507,14 +1505,6 @@ static int apple_nvme_probe(struct platform_device *pdev)
goto put_dev;
}
if (!blk_get_queue(anv->ctrl.admin_q)) {
nvme_start_admin_queue(&anv->ctrl);
blk_mq_destroy_queue(anv->ctrl.admin_q);
anv->ctrl.admin_q = NULL;
ret = -ENODEV;
goto put_dev;
}
nvme_reset_ctrl(&anv->ctrl);
async_schedule(apple_nvme_async_probe, anv);

View File

@ -13,6 +13,10 @@
#include "fabrics.h"
#include <linux/nvme-auth.h>
#define CHAP_BUF_SIZE 4096
static struct kmem_cache *nvme_chap_buf_cache;
static mempool_t *nvme_chap_buf_pool;
struct nvme_dhchap_queue_context {
struct list_head entry;
struct work_struct auth_work;
@ -20,7 +24,6 @@ struct nvme_dhchap_queue_context {
struct crypto_shash *shash_tfm;
struct crypto_kpp *dh_tfm;
void *buf;
size_t buf_size;
int qid;
int error;
u32 s1;
@ -47,6 +50,12 @@ struct nvme_dhchap_queue_context {
#define nvme_auth_queue_from_qid(ctrl, qid) \
(qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q
static inline int ctrl_max_dhchaps(struct nvme_ctrl *ctrl)
{
return ctrl->opts->nr_io_queues + ctrl->opts->nr_write_queues +
ctrl->opts->nr_poll_queues + 1;
}
static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
void *data, size_t data_len, bool auth_send)
{
@ -112,7 +121,7 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
if (chap->buf_size < size) {
if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return -EINVAL;
}
@ -147,7 +156,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
const char *gid_name = nvme_auth_dhgroup_name(data->dhgid);
const char *hmac_name, *kpp_name;
if (chap->buf_size < size) {
if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return NVME_SC_INVALID_FIELD;
}
@ -197,12 +206,6 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
return NVME_SC_AUTH_REQUIRED;
}
/* Reset host response if the hash had been changed */
if (chap->hash_id != data->hashid) {
kfree(chap->host_response);
chap->host_response = NULL;
}
chap->hash_id = data->hashid;
chap->hash_len = data->hl;
dev_dbg(ctrl->device, "qid %d: selected hash %s\n",
@ -219,14 +222,6 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
return NVME_SC_AUTH_REQUIRED;
}
/* Clear host and controller key to avoid accidental reuse */
kfree_sensitive(chap->host_key);
chap->host_key = NULL;
chap->host_key_len = 0;
kfree_sensitive(chap->ctrl_key);
chap->ctrl_key = NULL;
chap->ctrl_key_len = 0;
if (chap->dhgroup_id == data->dhgid &&
(data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) {
dev_dbg(ctrl->device,
@ -302,7 +297,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
if (chap->host_key_len)
size += chap->host_key_len;
if (chap->buf_size < size) {
if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return -EINVAL;
}
@ -344,10 +339,10 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
struct nvmf_auth_dhchap_success1_data *data = chap->buf;
size_t size = sizeof(*data);
if (ctrl->ctrl_key)
if (chap->ctrl_key)
size += chap->hash_len;
if (chap->buf_size < size) {
if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return NVME_SC_INVALID_FIELD;
}
@ -521,6 +516,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
ret = PTR_ERR(ctrl_response);
return ret;
}
ret = crypto_shash_setkey(chap->shash_tfm,
ctrl_response, ctrl->ctrl_key->len);
if (ret) {
@ -621,9 +617,6 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
if (ret) {
dev_dbg(ctrl->device,
"failed to generate public key, error %d\n", ret);
kfree(chap->host_key);
chap->host_key = NULL;
chap->host_key_len = 0;
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return ret;
}
@ -643,9 +636,6 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
if (ret) {
dev_dbg(ctrl->device,
"failed to generate shared secret, error %d\n", ret);
kfree_sensitive(chap->sess_key);
chap->sess_key = NULL;
chap->sess_key_len = 0;
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return ret;
}
@ -654,7 +644,7 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
return 0;
}
static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
{
kfree_sensitive(chap->host_response);
chap->host_response = NULL;
@ -674,24 +664,20 @@ static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
chap->transaction = 0;
memset(chap->c1, 0, sizeof(chap->c1));
memset(chap->c2, 0, sizeof(chap->c2));
mempool_free(chap->buf, nvme_chap_buf_pool);
chap->buf = NULL;
}
static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
__nvme_auth_reset(chap);
nvme_auth_reset_dhchap(chap);
if (chap->shash_tfm)
crypto_free_shash(chap->shash_tfm);
if (chap->dh_tfm)
crypto_free_kpp(chap->dh_tfm);
kfree_sensitive(chap->ctrl_key);
kfree_sensitive(chap->host_key);
kfree_sensitive(chap->sess_key);
kfree_sensitive(chap->host_response);
kfree(chap->buf);
kfree(chap);
}
static void __nvme_auth_work(struct work_struct *work)
static void nvme_queue_auth_work(struct work_struct *work)
{
struct nvme_dhchap_queue_context *chap =
container_of(work, struct nvme_dhchap_queue_context, auth_work);
@ -699,6 +685,16 @@ static void __nvme_auth_work(struct work_struct *work)
size_t tl;
int ret = 0;
/*
* Allocate a large enough buffer for the entire negotiation:
* 4k is enough to ffdhe8192.
*/
chap->buf = mempool_alloc(nvme_chap_buf_pool, GFP_KERNEL);
if (!chap->buf) {
chap->error = -ENOMEM;
return;
}
chap->transaction = ctrl->transaction++;
/* DH-HMAC-CHAP Step 1: send negotiate */
@ -720,8 +716,9 @@ static void __nvme_auth_work(struct work_struct *work)
dev_dbg(ctrl->device, "%s: qid %d receive challenge\n",
__func__, chap->qid);
memset(chap->buf, 0, chap->buf_size);
ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
memset(chap->buf, 0, CHAP_BUF_SIZE);
ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE,
false);
if (ret) {
dev_warn(ctrl->device,
"qid %d failed to receive challenge, %s %d\n",
@ -757,11 +754,14 @@ static void __nvme_auth_work(struct work_struct *work)
dev_dbg(ctrl->device, "%s: qid %d host response\n",
__func__, chap->qid);
mutex_lock(&ctrl->dhchap_auth_mutex);
ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
if (ret) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
chap->error = ret;
goto fail2;
}
mutex_unlock(&ctrl->dhchap_auth_mutex);
/* DH-HMAC-CHAP Step 3: send reply */
dev_dbg(ctrl->device, "%s: qid %d send reply\n",
@ -783,8 +783,9 @@ static void __nvme_auth_work(struct work_struct *work)
dev_dbg(ctrl->device, "%s: qid %d receive success1\n",
__func__, chap->qid);
memset(chap->buf, 0, chap->buf_size);
ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
memset(chap->buf, 0, CHAP_BUF_SIZE);
ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE,
false);
if (ret) {
dev_warn(ctrl->device,
"qid %d failed to receive success1, %s %d\n",
@ -801,16 +802,19 @@ static void __nvme_auth_work(struct work_struct *work)
return;
}
mutex_lock(&ctrl->dhchap_auth_mutex);
if (ctrl->ctrl_key) {
dev_dbg(ctrl->device,
"%s: qid %d controller response\n",
__func__, chap->qid);
ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap);
if (ret) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
chap->error = ret;
goto fail2;
}
}
mutex_unlock(&ctrl->dhchap_auth_mutex);
ret = nvme_auth_process_dhchap_success1(ctrl, chap);
if (ret) {
@ -819,7 +823,7 @@ static void __nvme_auth_work(struct work_struct *work)
goto fail2;
}
if (ctrl->ctrl_key) {
if (chap->ctrl_key) {
/* DH-HMAC-CHAP Step 5: send success2 */
dev_dbg(ctrl->device, "%s: qid %d send success2\n",
__func__, chap->qid);
@ -860,42 +864,8 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
return -ENOKEY;
}
mutex_lock(&ctrl->dhchap_auth_mutex);
/* Check if the context is already queued */
list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
WARN_ON(!chap->buf);
if (chap->qid == qid) {
dev_dbg(ctrl->device, "qid %d: re-using context\n", qid);
mutex_unlock(&ctrl->dhchap_auth_mutex);
flush_work(&chap->auth_work);
__nvme_auth_reset(chap);
queue_work(nvme_wq, &chap->auth_work);
return 0;
}
}
chap = kzalloc(sizeof(*chap), GFP_KERNEL);
if (!chap) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
return -ENOMEM;
}
chap->qid = (qid == NVME_QID_ANY) ? 0 : qid;
chap->ctrl = ctrl;
/*
* Allocate a large enough buffer for the entire negotiation:
* 4k should be enough to ffdhe8192.
*/
chap->buf_size = 4096;
chap->buf = kzalloc(chap->buf_size, GFP_KERNEL);
if (!chap->buf) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
kfree(chap);
return -ENOMEM;
}
INIT_WORK(&chap->auth_work, __nvme_auth_work);
list_add(&chap->entry, &ctrl->dhchap_auth_list);
mutex_unlock(&ctrl->dhchap_auth_mutex);
chap = &ctrl->dhchap_ctxs[qid];
cancel_work_sync(&chap->auth_work);
queue_work(nvme_wq, &chap->auth_work);
return 0;
}
@ -906,40 +876,28 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
struct nvme_dhchap_queue_context *chap;
int ret;
mutex_lock(&ctrl->dhchap_auth_mutex);
list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
if (chap->qid != qid)
continue;
mutex_unlock(&ctrl->dhchap_auth_mutex);
flush_work(&chap->auth_work);
ret = chap->error;
return ret;
}
mutex_unlock(&ctrl->dhchap_auth_mutex);
return -ENXIO;
chap = &ctrl->dhchap_ctxs[qid];
flush_work(&chap->auth_work);
ret = chap->error;
/* clear sensitive info */
nvme_auth_reset_dhchap(chap);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_wait);
void nvme_auth_reset(struct nvme_ctrl *ctrl)
{
struct nvme_dhchap_queue_context *chap;
mutex_lock(&ctrl->dhchap_auth_mutex);
list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
flush_work(&chap->auth_work);
__nvme_auth_reset(chap);
}
mutex_unlock(&ctrl->dhchap_auth_mutex);
}
EXPORT_SYMBOL_GPL(nvme_auth_reset);
static void nvme_dhchap_auth_work(struct work_struct *work)
static void nvme_ctrl_auth_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, dhchap_auth_work);
int ret, q;
/*
* If the ctrl is no connected, bail as reconnect will handle
* authentication.
*/
if (ctrl->state != NVME_CTRL_LIVE)
return;
/* Authenticate admin queue first */
ret = nvme_auth_negotiate(ctrl, 0);
if (ret) {
@ -968,43 +926,75 @@ static void nvme_dhchap_auth_work(struct work_struct *work)
* Failure is a soft-state; credentials remain valid until
* the controller terminates the connection.
*/
for (q = 1; q < ctrl->queue_count; q++) {
ret = nvme_auth_wait(ctrl, q);
if (ret)
dev_warn(ctrl->device,
"qid %d: authentication failed\n", q);
}
}
void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
{
INIT_LIST_HEAD(&ctrl->dhchap_auth_list);
INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work);
struct nvme_dhchap_queue_context *chap;
int i, ret;
mutex_init(&ctrl->dhchap_auth_mutex);
INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work);
if (!ctrl->opts)
return;
nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key);
return 0;
ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret,
&ctrl->host_key);
if (ret)
return ret;
ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret,
&ctrl->ctrl_key);
if (ret)
goto err_free_dhchap_secret;
if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret)
return ret;
ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl),
sizeof(*chap), GFP_KERNEL);
if (!ctrl->dhchap_ctxs) {
ret = -ENOMEM;
goto err_free_dhchap_ctrl_secret;
}
for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) {
chap = &ctrl->dhchap_ctxs[i];
chap->qid = i;
chap->ctrl = ctrl;
INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
}
return 0;
err_free_dhchap_ctrl_secret:
nvme_auth_free_key(ctrl->ctrl_key);
ctrl->ctrl_key = NULL;
err_free_dhchap_secret:
nvme_auth_free_key(ctrl->host_key);
ctrl->host_key = NULL;
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl);
void nvme_auth_stop(struct nvme_ctrl *ctrl)
{
struct nvme_dhchap_queue_context *chap = NULL, *tmp;
cancel_work_sync(&ctrl->dhchap_auth_work);
mutex_lock(&ctrl->dhchap_auth_mutex);
list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry)
cancel_work_sync(&chap->auth_work);
mutex_unlock(&ctrl->dhchap_auth_mutex);
}
EXPORT_SYMBOL_GPL(nvme_auth_stop);
void nvme_auth_free(struct nvme_ctrl *ctrl)
{
struct nvme_dhchap_queue_context *chap = NULL, *tmp;
int i;
mutex_lock(&ctrl->dhchap_auth_mutex);
list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) {
list_del_init(&chap->entry);
flush_work(&chap->auth_work);
__nvme_auth_free(chap);
if (ctrl->dhchap_ctxs) {
for (i = 0; i < ctrl_max_dhchaps(ctrl); i++)
nvme_auth_free_dhchap(&ctrl->dhchap_ctxs[i]);
kfree(ctrl->dhchap_ctxs);
}
mutex_unlock(&ctrl->dhchap_auth_mutex);
if (ctrl->host_key) {
nvme_auth_free_key(ctrl->host_key);
ctrl->host_key = NULL;
@ -1015,3 +1005,27 @@ void nvme_auth_free(struct nvme_ctrl *ctrl)
}
}
EXPORT_SYMBOL_GPL(nvme_auth_free);
int __init nvme_init_auth(void)
{
nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache",
CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL);
if (!nvme_chap_buf_cache)
return -ENOMEM;
nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab,
mempool_free_slab, nvme_chap_buf_cache);
if (!nvme_chap_buf_pool)
goto err_destroy_chap_buf_cache;
return 0;
err_destroy_chap_buf_cache:
kmem_cache_destroy(nvme_chap_buf_cache);
return -ENOMEM;
}
void __exit nvme_exit_auth(void)
{
mempool_destroy(nvme_chap_buf_pool);
kmem_cache_destroy(nvme_chap_buf_cache);
}

View File

@ -384,6 +384,8 @@ static inline void nvme_end_req(struct request *req)
nvme_log_error(req);
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
if (req->cmd_flags & REQ_NVME_MPATH)
nvme_mpath_end_request(req);
blk_mq_end_request(req, status);
}
@ -851,8 +853,11 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
if (nvme_ns_has_pi(ns)) {
cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
@ -1118,11 +1123,12 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
mutex_unlock(&ctrl->scan_lock);
}
if (effects & NVME_CMD_EFFECTS_CCC)
nvme_init_ctrl_finish(ctrl);
if (effects & NVME_CMD_EFFECTS_CCC) {
dev_info(ctrl->device,
"controller capabilities changed, reset may be required to take effect.\n");
}
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
nvme_queue_scan(ctrl);
flush_work(&ctrl->scan_work);
@ -2003,6 +2009,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
}
}
/*
* Only set the DEAC bit if the device guarantees that reads from
* deallocated data return zeroes. While the DEAC bit does not
* require that, it must be a no-op if reads from deallocated data
* do not return zeroes.
*/
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
ns->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
@ -2179,7 +2193,7 @@ const struct pr_ops nvme_pr_ops = {
};
#ifdef CONFIG_BLK_SED_OPAL
int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send)
{
struct nvme_ctrl *ctrl = data;
@ -2196,7 +2210,23 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
NVME_QID_ANY, 1, 0);
}
EXPORT_SYMBOL_GPL(nvme_sec_submit);
static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
{
if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
if (!ctrl->opal_dev)
ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
else if (was_suspended)
opal_unlock_from_suspend(ctrl->opal_dev);
} else {
free_opal_dev(ctrl->opal_dev);
ctrl->opal_dev = NULL;
}
}
#else
static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
{
}
#endif /* CONFIG_BLK_SED_OPAL */
#ifdef CONFIG_BLK_DEV_ZONED
@ -2221,16 +2251,17 @@ static const struct block_device_operations nvme_bdev_ops = {
.pr_ops = &nvme_pr_ops,
};
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
u32 timeout, const char *op)
{
unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies;
u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
unsigned long timeout_jiffies = jiffies + timeout * HZ;
u32 csts;
int ret;
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if (csts == ~0)
return -ENODEV;
if ((csts & NVME_CSTS_RDY) == bit)
if ((csts & mask) == val)
break;
usleep_range(1000, 2000);
@ -2239,7 +2270,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
if (time_after(jiffies, timeout_jiffies)) {
dev_err(ctrl->device,
"Device not ready; aborting %s, CSTS=0x%x\n",
enabled ? "initialisation" : "reset", csts);
op, csts);
return -ENODEV;
}
}
@ -2247,27 +2278,29 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
return ret;
}
/*
* If the device has been passed off to us in an enabled state, just clear
* the enabled bit. The spec says we should set the 'shutdown notification
* bits', but doing so may cause the device to complete commands to the
* admin queue ... and we don't know what memory that might be pointing at!
*/
int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
int ret;
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
if (shutdown)
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
else
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
if (shutdown) {
return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
NVME_CSTS_SHST_CMPLT,
ctrl->shutdown_timeout, "shutdown");
}
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);
return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false);
return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
(NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
@ -2332,41 +2365,11 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
return nvme_wait_ready(ctrl, timeout, true);
return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
(timeout + 1) / 2, "initialisation");
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
{
unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
u32 csts;
int ret;
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
break;
msleep(100);
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout)) {
dev_err(ctrl->device,
"Device shutdown incomplete; abort shutdown\n");
return -ENODEV;
}
}
return ret;
}
EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
{
__le64 ts;
@ -3049,7 +3052,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id)
return 0;
return -ENOMEM;
c.identify.opcode = nvme_admin_identify;
c.identify.cns = NVME_ID_CNS_CS_CTRL;
@ -3229,7 +3232,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
* register in our nvme_ctrl structure. This should be called as soon as
* the admin queue is fully up and running.
*/
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
{
int ret;
@ -3260,6 +3263,8 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;
nvme_configure_opal(ctrl, was_suspended);
if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
/*
* Do not return errors unless we are in a controller reset,
@ -3745,15 +3750,19 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
memcpy(dhchap_secret, buf, count);
nvme_auth_stop(ctrl);
if (strcmp(dhchap_secret, opts->dhchap_secret)) {
struct nvme_dhchap_key *key, *host_key;
int ret;
ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
ret = nvme_auth_generate_key(dhchap_secret, &key);
if (ret)
return ret;
kfree(opts->dhchap_secret);
opts->dhchap_secret = dhchap_secret;
/* Key has changed; re-authentication with new key */
nvme_auth_reset(ctrl);
host_key = ctrl->host_key;
mutex_lock(&ctrl->dhchap_auth_mutex);
ctrl->host_key = key;
mutex_unlock(&ctrl->dhchap_auth_mutex);
nvme_auth_free_key(host_key);
}
/* Start re-authentication */
dev_info(ctrl->device, "re-authenticating controller\n");
@ -3795,15 +3804,19 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
memcpy(dhchap_secret, buf, count);
nvme_auth_stop(ctrl);
if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
struct nvme_dhchap_key *key, *ctrl_key;
int ret;
ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
ret = nvme_auth_generate_key(dhchap_secret, &key);
if (ret)
return ret;
kfree(opts->dhchap_ctrl_secret);
opts->dhchap_ctrl_secret = dhchap_secret;
/* Key has changed; re-authentication with new key */
nvme_auth_reset(ctrl);
ctrl_key = ctrl->ctrl_key;
mutex_lock(&ctrl->dhchap_auth_mutex);
ctrl->ctrl_key = key;
mutex_unlock(&ctrl->dhchap_auth_mutex);
nvme_auth_free_key(ctrl_key);
}
/* Start re-authentication */
dev_info(ctrl->device, "re-authenticating controller\n");
@ -3875,10 +3888,11 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return a->mode;
}
static const struct attribute_group nvme_dev_attrs_group = {
const struct attribute_group nvme_dev_attrs_group = {
.attrs = nvme_dev_attrs,
.is_visible = nvme_dev_attrs_are_visible,
};
EXPORT_SYMBOL_GPL(nvme_dev_attrs_group);
static const struct attribute_group *nvme_dev_attr_groups[] = {
&nvme_dev_attrs_group,
@ -4333,10 +4347,6 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
if (test_bit(NVME_NS_DEAD, &ns->flags))
goto out;
ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
dev_err(ns->ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
@ -4407,7 +4417,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
if (ns->head->ns_id > nsid)
list_move_tail(&ns->list, &rm_list);
}
up_write(&ctrl->namespaces_rwsem);
@ -4424,9 +4434,6 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
u32 prev = 0;
int ret = 0, i;
if (nvme_ctrl_limited_cns(ctrl))
return -EOPNOTSUPP;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
@ -4534,8 +4541,18 @@ static void nvme_scan_work(struct work_struct *work)
}
mutex_lock(&ctrl->scan_lock);
if (nvme_scan_ns_list(ctrl) != 0)
if (nvme_ctrl_limited_cns(ctrl)) {
nvme_scan_ns_sequential(ctrl);
} else {
/*
* Fall back to sequential scan if DNR is set to handle broken
* devices which should support Identify NS List (as per the VS
* they report) but don't actually support it.
*/
ret = nvme_scan_ns_list(ctrl);
if (ret > 0 && ret & NVME_SC_DNR)
nvme_scan_ns_sequential(ctrl);
}
mutex_unlock(&ctrl->scan_lock);
}
@ -4565,8 +4582,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
* removing the namespaces' disks; fail all the queues now to avoid
* potentially having to clean up the failed sync later.
*/
if (ctrl->state == NVME_CTRL_DEAD)
nvme_kill_queues(ctrl);
if (ctrl->state == NVME_CTRL_DEAD) {
nvme_mark_namespaces_dead(ctrl);
nvme_unquiesce_io_queues(ctrl);
}
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
@ -4692,7 +4711,7 @@ static void nvme_fw_act_work(struct work_struct *work)
fw_act_timeout = jiffies +
msecs_to_jiffies(admin_timeout * 1000);
nvme_stop_queues(ctrl);
nvme_quiesce_io_queues(ctrl);
while (nvme_ctrl_pp_status(ctrl)) {
if (time_after(jiffies, fw_act_timeout)) {
dev_warn(ctrl->device,
@ -4706,7 +4725,7 @@ static void nvme_fw_act_work(struct work_struct *work)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
return;
nvme_start_queues(ctrl);
nvme_unquiesce_io_queues(ctrl);
/* read FW slot information to clear the AER */
nvme_get_fw_slot_info(ctrl);
@ -4811,8 +4830,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
unsigned int cmd_size)
const struct blk_mq_ops *ops, unsigned int cmd_size)
{
int ret;
@ -4822,7 +4840,9 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ctrl->ops->flags & NVME_F_FABRICS)
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
set->flags = flags;
set->flags = BLK_MQ_F_NO_SCHED;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
@ -4850,6 +4870,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
out_cleanup_admin_q:
blk_mq_destroy_queue(ctrl->admin_q);
blk_put_queue(ctrl->admin_q);
out_free_tagset:
blk_mq_free_tag_set(ctrl->admin_tagset);
return ret;
@ -4859,14 +4880,17 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
blk_mq_destroy_queue(ctrl->admin_q);
if (ctrl->ops->flags & NVME_F_FABRICS)
blk_put_queue(ctrl->admin_q);
if (ctrl->ops->flags & NVME_F_FABRICS) {
blk_mq_destroy_queue(ctrl->fabrics_q);
blk_put_queue(ctrl->fabrics_q);
}
blk_mq_free_tag_set(ctrl->admin_tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
const struct blk_mq_ops *ops, unsigned int nr_maps,
unsigned int cmd_size)
{
int ret;
@ -4874,15 +4898,23 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = ctrl->sqsize + 1;
set->reserved_tags = NVMF_RESERVED_TAGS;
/*
* Some Apple controllers requires tags to be unique across admin and
* the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
*/
if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
set->reserved_tags = NVME_AQ_DEPTH;
else if (ctrl->ops->flags & NVME_F_FABRICS)
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
set->flags = flags;
set->flags = BLK_MQ_F_SHOULD_MERGE;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size,
set->driver_data = ctrl;
set->nr_hw_queues = ctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
if (ops->map_queues)
set->nr_maps = ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
set->nr_maps = nr_maps;
ret = blk_mq_alloc_tag_set(set);
if (ret)
return ret;
@ -4893,6 +4925,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
ctrl->connect_q);
}
ctrl->tagset = set;
@ -4906,8 +4940,10 @@ EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
{
if (ctrl->ops->flags & NVME_F_FABRICS)
if (ctrl->ops->flags & NVME_F_FABRICS) {
blk_mq_destroy_queue(ctrl->connect_q);
blk_put_queue(ctrl->connect_q);
}
blk_mq_free_tag_set(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
@ -4943,7 +4979,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
nvme_start_queues(ctrl);
nvme_unquiesce_io_queues(ctrl);
nvme_mpath_update(ctrl);
}
@ -4988,6 +5024,7 @@ static void nvme_free_ctrl(struct device *dev)
nvme_auth_stop(ctrl);
nvme_auth_free(ctrl);
__free_page(ctrl->discard_page);
free_opal_dev(ctrl->opal_dev);
if (subsys) {
mutex_lock(&nvme_subsystems_lock);
@ -5053,7 +5090,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->instance);
ctrl->device->class = nvme_class;
ctrl->device->parent = ctrl->dev;
ctrl->device->groups = nvme_dev_attr_groups;
if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups;
else
ctrl->device->groups = nvme_dev_attr_groups;
ctrl->device->release = nvme_free_ctrl;
dev_set_drvdata(ctrl->device, ctrl);
ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
@ -5077,9 +5117,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
nvme_mpath_init_ctrl(ctrl);
nvme_auth_init_ctrl(ctrl);
ret = nvme_auth_init_ctrl(ctrl);
if (ret)
goto out_free_cdev;
return 0;
out_free_cdev:
cdev_device_del(&ctrl->cdev, ctrl->device);
out_free_name:
nvme_put_ctrl(ctrl);
kfree_const(ctrl->device->kobj.name);
@ -5092,62 +5136,17 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
static void nvme_start_ns_queue(struct nvme_ns *ns)
{
if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
blk_mq_unquiesce_queue(ns->queue);
}
static void nvme_stop_ns_queue(struct nvme_ns *ns)
{
if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
blk_mq_quiesce_queue(ns->queue);
else
blk_mq_wait_quiesce_done(ns->queue);
}
/*
* Prepare a queue for teardown.
*
* This must forcibly unquiesce queues to avoid blocking dispatch, and only set
* the capacity to 0 after that to avoid blocking dispatchers that may be
* holding bd_butex. This will end buffered writers dirtying pages that can't
* be synced.
*/
static void nvme_set_queue_dying(struct nvme_ns *ns)
{
if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
blk_mark_disk_dead(ns->disk);
nvme_start_ns_queue(ns);
set_capacity_and_notify(ns->disk, 0);
}
/**
* nvme_kill_queues(): Ends all namespace queues
* @ctrl: the dead controller that needs to end
*
* Call this function when the driver determines it is unable to get the
* controller in a state capable of servicing IO.
*/
void nvme_kill_queues(struct nvme_ctrl *ctrl)
/* let I/O to all namespaces fail in preparation for surprise removal */
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
down_read(&ctrl->namespaces_rwsem);
/* Forcibly unquiesce queues to avoid blocking dispatch */
if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
nvme_start_admin_queue(ctrl);
list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_set_queue_dying(ns);
blk_mark_disk_dead(ns->disk);
up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_kill_queues);
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
@ -5197,43 +5196,41 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
void nvme_stop_queues(struct nvme_ctrl *ctrl)
void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_stop_ns_queue(ns);
up_read(&ctrl->namespaces_rwsem);
if (!ctrl->tagset)
return;
if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
blk_mq_quiesce_tagset(ctrl->tagset);
else
blk_mq_wait_quiesce_done(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);
EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
void nvme_start_queues(struct nvme_ctrl *ctrl)
void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_start_ns_queue(ns);
up_read(&ctrl->namespaces_rwsem);
if (!ctrl->tagset)
return;
if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
blk_mq_unquiesce_tagset(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_start_queues);
EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
{
if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
blk_mq_quiesce_queue(ctrl->admin_q);
else
blk_mq_wait_quiesce_done(ctrl->admin_q);
blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
}
EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);
void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
{
if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
blk_mq_unquiesce_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
@ -5344,8 +5341,13 @@ static int __init nvme_core_init(void)
goto unregister_generic_ns;
}
result = nvme_init_auth();
if (result)
goto destroy_ns_chr;
return 0;
destroy_ns_chr:
class_destroy(nvme_ns_chr_class);
unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
@ -5366,6 +5368,7 @@ static int __init nvme_core_init(void)
static void __exit nvme_core_exit(void)
{
nvme_exit_auth();
class_destroy(nvme_ns_chr_class);
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);

View File

@ -1475,6 +1475,8 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
fc_dma_unmap_single(lport->dev, lsop->rspdma,
sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
kfree(lsop->rspbuf);
kfree(lsop->rqstbuf);
kfree(lsop);
nvme_fc_rport_put(rport);
@ -1699,6 +1701,15 @@ nvme_fc_handle_ls_rqst_work(struct work_struct *work)
spin_unlock_irqrestore(&rport->lock, flags);
}
static
void nvme_fc_rcv_ls_req_err_msg(struct nvme_fc_lport *lport,
struct fcnvme_ls_rqst_w0 *w0)
{
dev_info(lport->dev, "RCV %s LS failed: No memory\n",
(w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
nvmefc_ls_names[w0->ls_cmd] : "");
}
/**
* nvme_fc_rcv_ls_req - transport entry point called by an LLDD
* upon the reception of a NVME LS request.
@ -1751,20 +1762,20 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr,
goto out_put;
}
lsop = kzalloc(sizeof(*lsop) +
sizeof(union nvmefc_ls_requests) +
sizeof(union nvmefc_ls_responses),
GFP_KERNEL);
lsop = kzalloc(sizeof(*lsop), GFP_KERNEL);
if (!lsop) {
dev_info(lport->dev,
"RCV %s LS failed: No memory\n",
(w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
nvmefc_ls_names[w0->ls_cmd] : "");
nvme_fc_rcv_ls_req_err_msg(lport, w0);
ret = -ENOMEM;
goto out_put;
}
lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1];
lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1];
lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL);
lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), GFP_KERNEL);
if (!lsop->rqstbuf || !lsop->rspbuf) {
nvme_fc_rcv_ls_req_err_msg(lport, w0);
ret = -ENOMEM;
goto out_free;
}
lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf,
sizeof(*lsop->rspbuf),
@ -1801,6 +1812,8 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr,
fc_dma_unmap_single(lport->dev, lsop->rspdma,
sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
out_free:
kfree(lsop->rspbuf);
kfree(lsop->rqstbuf);
kfree(lsop);
out_put:
nvme_fc_rport_put(rport);
@ -2391,7 +2404,7 @@ nvme_fc_ctrl_free(struct kref *ref)
list_del(&ctrl->ctrl_list);
spin_unlock_irqrestore(&ctrl->rport->lock, flags);
nvme_start_admin_queue(&ctrl->ctrl);
nvme_unquiesce_admin_queue(&ctrl->ctrl);
nvme_remove_admin_tag_set(&ctrl->ctrl);
kfree(ctrl->queues);
@ -2492,20 +2505,20 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
* (but with error status).
*/
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
if (start_queues)
nvme_start_queues(&ctrl->ctrl);
nvme_unquiesce_io_queues(&ctrl->ctrl);
}
/*
* Other transports, which don't have link-level contexts bound
* to sqe's, would try to gracefully shutdown the controller by
* writing the registers for shutdown and polling (call
* nvme_shutdown_ctrl()). Given a bunch of i/o was potentially
* nvme_disable_ctrl()). Given a bunch of i/o was potentially
* just aborted and we will wait on those contexts, and given
* there was no indication of how live the controlelr is on the
* link, don't send more io to create more contexts for the
@ -2516,13 +2529,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
/*
* clean up the admin queue. Same thing as above.
*/
nvme_stop_admin_queue(&ctrl->ctrl);
nvme_quiesce_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
if (start_queues)
nvme_start_admin_queue(&ctrl->ctrl);
nvme_unquiesce_admin_queue(&ctrl->ctrl);
}
static void
@ -2732,7 +2745,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
atomic_set(&op->state, FCPOP_STATE_ACTIVE);
if (!(op->flags & FCOP_FLAGS_AEN))
blk_mq_start_request(op->rq);
nvme_start_request(op->rq);
cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn));
ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
@ -2903,7 +2916,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
nvme_fc_init_io_queues(ctrl);
ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set,
&nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE,
&nvme_fc_mq_ops, 1,
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz));
if (ret)
@ -3104,9 +3117,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
(ilog2(SZ_4K) - 9);
nvme_start_admin_queue(&ctrl->ctrl);
nvme_unquiesce_admin_queue(&ctrl->ctrl);
ret = nvme_init_ctrl_finish(&ctrl->ctrl);
ret = nvme_init_ctrl_finish(&ctrl->ctrl, false);
if (ret || test_bit(ASSOC_FAILED, &ctrl->flags))
goto out_disconnect_admin_queue;
@ -3250,10 +3263,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
nvme_fc_free_queue(&ctrl->queues[0]);
/* re-enable the admin_q so anything new can fast fail */
nvme_start_admin_queue(&ctrl->ctrl);
nvme_unquiesce_admin_queue(&ctrl->ctrl);
/* resume the io queues so that things will fast fail */
nvme_start_queues(&ctrl->ctrl);
nvme_unquiesce_io_queues(&ctrl->ctrl);
nvme_fc_ctlr_inactive_on_rport(ctrl);
}
@ -3509,7 +3522,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
nvme_fc_init_queue(ctrl, 0);
ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
&nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED,
&nvme_fc_admin_mq_ops,
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz));
if (ret)

View File

@ -8,6 +8,50 @@
#include <linux/io_uring.h>
#include "nvme.h"
static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
fmode_t mode)
{
if (capable(CAP_SYS_ADMIN))
return true;
/*
* Do not allow unprivileged processes to send vendor specific or fabrics
* commands as we can't be sure about their effects.
*/
if (c->common.opcode >= nvme_cmd_vendor_start ||
c->common.opcode == nvme_fabrics_command)
return false;
/*
* Do not allow unprivileged passthrough of admin commands except
* for a subset of identify commands that contain information required
* to form proper I/O commands in userspace and do not expose any
* potentially sensitive information.
*/
if (!ns) {
if (c->common.opcode == nvme_admin_identify) {
switch (c->identify.cns) {
case NVME_ID_CNS_NS:
case NVME_ID_CNS_CS_NS:
case NVME_ID_CNS_NS_CS_INDEP:
case NVME_ID_CNS_CS_CTRL:
case NVME_ID_CNS_CTRL:
return true;
}
}
return false;
}
/*
* Only allow I/O commands that transfer data to the controller if the
* special file is open for writing, but always allow I/O commands that
* transfer data from the controller.
*/
if (nvme_is_write(c))
return mode & FMODE_WRITE;
return true;
}
/*
* Convert integer values from ioctl structures to user pointers, silently
* ignoring the upper bits in the compat case to match behaviour of 32-bit
@ -261,7 +305,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
struct nvme_passthru_cmd __user *ucmd, fmode_t mode)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
@ -269,8 +313,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u64 result;
int status;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
@ -291,6 +333,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
if (!nvme_cmd_allowed(ns, &c, mode))
return -EACCES;
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@ -308,15 +353,14 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
}
static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd64 __user *ucmd, bool vec)
struct nvme_passthru_cmd64 __user *ucmd, bool vec,
fmode_t mode)
{
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
unsigned timeout = 0;
int status;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
@ -337,6 +381,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
if (!nvme_cmd_allowed(ns, &c, mode))
return -EACCES;
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@ -483,9 +530,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
void *meta = NULL;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
c.common.opcode = READ_ONCE(cmd->opcode);
c.common.flags = READ_ONCE(cmd->flags);
if (c.common.flags)
@ -507,6 +551,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));
if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode))
return -EACCES;
d.metadata = READ_ONCE(cmd->metadata);
d.addr = READ_ONCE(cmd->addr);
d.data_len = READ_ONCE(cmd->data_len);
@ -570,13 +617,13 @@ static bool is_ctrl_ioctl(unsigned int cmd)
}
static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
void __user *argp)
void __user *argp, fmode_t mode)
{
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ctrl, NULL, argp);
return nvme_user_cmd(ctrl, NULL, argp, mode);
case NVME_IOCTL_ADMIN64_CMD:
return nvme_user_cmd64(ctrl, NULL, argp, false);
return nvme_user_cmd64(ctrl, NULL, argp, false, mode);
default:
return sed_ioctl(ctrl->opal_dev, cmd, argp);
}
@ -601,14 +648,14 @@ struct nvme_user_io32 {
#endif /* COMPAT_FOR_U64_ALIGNMENT */
static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp)
void __user *argp, fmode_t mode)
{
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
return ns->head->ns_id;
case NVME_IOCTL_IO_CMD:
return nvme_user_cmd(ns->ctrl, ns, argp);
return nvme_user_cmd(ns->ctrl, ns, argp, mode);
/*
* struct nvme_user_io can have different padding on some 32-bit ABIs.
* Just accept the compat version as all fields that are used are the
@ -620,19 +667,20 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
case NVME_IOCTL_SUBMIT_IO:
return nvme_submit_io(ns, argp);
case NVME_IOCTL_IO64_CMD:
return nvme_user_cmd64(ns->ctrl, ns, argp, false);
return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode);
case NVME_IOCTL_IO64_CMD_VEC:
return nvme_user_cmd64(ns->ctrl, ns, argp, true);
return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode);
default:
return -ENOTTY;
}
}
static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg)
static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg,
fmode_t mode)
{
if (is_ctrl_ioctl(cmd))
return nvme_ctrl_ioctl(ns->ctrl, cmd, arg);
return nvme_ns_ioctl(ns, cmd, arg);
if (is_ctrl_ioctl(cmd))
return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode);
return nvme_ns_ioctl(ns, cmd, arg, mode);
}
int nvme_ioctl(struct block_device *bdev, fmode_t mode,
@ -640,7 +688,7 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode,
{
struct nvme_ns *ns = bdev->bd_disk->private_data;
return __nvme_ioctl(ns, cmd, (void __user *)arg);
return __nvme_ioctl(ns, cmd, (void __user *)arg, mode);
}
long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@ -648,7 +696,7 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct nvme_ns *ns =
container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev);
return __nvme_ioctl(ns, cmd, (void __user *)arg);
return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode);
}
static int nvme_uring_cmd_checks(unsigned int issue_flags)
@ -716,7 +764,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
}
#ifdef CONFIG_NVME_MULTIPATH
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp, struct nvme_ns_head *head, int srcu_idx)
void __user *argp, struct nvme_ns_head *head, int srcu_idx,
fmode_t mode)
__releases(&head->srcu)
{
struct nvme_ctrl *ctrl = ns->ctrl;
@ -724,7 +773,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
nvme_get_ctrl(ns->ctrl);
srcu_read_unlock(&head->srcu, srcu_idx);
ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp);
ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode);
nvme_put_ctrl(ctrl);
return ret;
@ -749,9 +798,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
* deadlock when deleting namespaces using the passthrough interface.
*/
if (is_ctrl_ioctl(cmd))
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
mode);
ret = nvme_ns_ioctl(ns, cmd, argp);
ret = nvme_ns_ioctl(ns, cmd, argp, mode);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@ -773,9 +823,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
goto out_unlock;
if (is_ctrl_ioctl(cmd))
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
file->f_mode);
ret = nvme_ns_ioctl(ns, cmd, argp);
ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@ -849,7 +900,8 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
return ret;
}
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
fmode_t mode)
{
struct nvme_ns *ns;
int ret;
@ -873,7 +925,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
kref_get(&ns->kref);
up_read(&ctrl->namespaces_rwsem);
ret = nvme_user_cmd(ctrl, ns, argp);
ret = nvme_user_cmd(ctrl, ns, argp, mode);
nvme_put_ns(ns);
return ret;
@ -890,11 +942,11 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ctrl, NULL, argp);
return nvme_user_cmd(ctrl, NULL, argp, file->f_mode);
case NVME_IOCTL_ADMIN64_CMD:
return nvme_user_cmd64(ctrl, NULL, argp, false);
return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode);
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp);
return nvme_dev_user_cmd(ctrl, argp, file->f_mode);
case NVME_IOCTL_RESET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;

View File

@ -114,6 +114,31 @@ void nvme_failover_req(struct request *req)
kblockd_schedule_work(&ns->head->requeue_work);
}
void nvme_mpath_start_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
struct gendisk *disk = ns->head->disk;
if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
return;
nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0,
blk_rq_bytes(rq) >> SECTOR_SHIFT,
req_op(rq), jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
void nvme_mpath_end_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
return;
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
nvme_req(rq)->start_time);
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
@ -506,6 +531,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
/*
* This assumes all controllers that refer to a namespace either
* support poll queues or not. That is not a strict guarantee,

View File

@ -162,6 +162,9 @@ struct nvme_request {
u8 retries;
u8 flags;
u16 status;
#ifdef CONFIG_NVME_MULTIPATH
unsigned long start_time;
#endif
struct nvme_ctrl *ctrl;
};
@ -173,6 +176,7 @@ struct nvme_request {
enum {
NVME_REQ_CANCELLED = (1 << 0),
NVME_REQ_USERCMD = (1 << 1),
NVME_MPATH_IO_STATS = (1 << 2),
};
static inline struct nvme_request *nvme_req(struct request *req)
@ -237,6 +241,7 @@ enum nvme_ctrl_flags {
NVME_CTRL_FAILFAST_EXPIRED = 0,
NVME_CTRL_ADMIN_Q_STOPPED = 1,
NVME_CTRL_STARTED_ONCE = 2,
NVME_CTRL_STOPPED = 3,
};
struct nvme_ctrl {
@ -336,8 +341,8 @@ struct nvme_ctrl {
#ifdef CONFIG_NVME_AUTH
struct work_struct dhchap_auth_work;
struct list_head dhchap_auth_list;
struct mutex dhchap_auth_mutex;
struct nvme_dhchap_queue_context *dhchap_ctxs;
struct nvme_dhchap_key *host_key;
struct nvme_dhchap_key *ctrl_key;
u16 transaction;
@ -454,6 +459,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head)
enum nvme_ns_features {
NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */
NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
NVME_NS_DEAC, /* DEAC bit in Write Zeores supported */
};
struct nvme_ns {
@ -483,11 +489,9 @@ struct nvme_ns {
unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
#define NVME_NS_DEAD 1
#define NVME_NS_ANA_PENDING 2
#define NVME_NS_FORCE_RO 3
#define NVME_NS_READY 4
#define NVME_NS_STOPPED 5
struct cdev cdev;
struct device cdev_device;
@ -508,6 +512,9 @@ struct nvme_ctrl_ops {
unsigned int flags;
#define NVME_F_FABRICS (1 << 0)
#define NVME_F_METADATA_SUPPORTED (1 << 1)
#define NVME_F_BLOCKING (1 << 2)
const struct attribute_group **dev_attr_groups;
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@ -728,37 +735,32 @@ void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks);
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl);
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
unsigned int cmd_size);
const struct blk_mq_ops *ops, unsigned int cmd_size);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl);
int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
const struct blk_mq_ops *ops, unsigned int nr_maps,
unsigned int cmd_size);
void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl);
void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send);
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
volatile union nvme_result *res);
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_stop_admin_queue(struct nvme_ctrl *ctrl);
void nvme_start_admin_queue(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl);
void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl);
void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl);
void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl);
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl);
void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
void nvme_unfreeze(struct nvme_ctrl *ctrl);
@ -857,6 +859,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_id_attr_groups[];
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
#ifdef CONFIG_NVME_MULTIPATH
@ -883,6 +886,8 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
void nvme_mpath_start_request(struct request *rq);
void nvme_mpath_end_request(struct request *rq);
static inline void nvme_trace_bio_complete(struct request *req)
{
@ -968,6 +973,12 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
}
static inline void nvme_mpath_start_request(struct request *rq)
{
}
static inline void nvme_mpath_end_request(struct request *rq)
{
}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_revalidate_zones(struct nvme_ns *ns);
@ -1013,20 +1024,38 @@ static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
}
#endif
static inline void nvme_start_request(struct request *rq)
{
if (rq->cmd_flags & REQ_NVME_MPATH)
nvme_mpath_start_request(rq);
blk_mq_start_request(rq);
}
static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
{
return ctrl->sgls & ((1 << 0) | (1 << 1));
}
#ifdef CONFIG_NVME_AUTH
void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
int __init nvme_init_auth(void);
void __exit nvme_exit_auth(void);
int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_auth_stop(struct nvme_ctrl *ctrl);
int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
void nvme_auth_reset(struct nvme_ctrl *ctrl);
void nvme_auth_free(struct nvme_ctrl *ctrl);
#else
static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
{
return 0;
}
static inline int __init nvme_init_auth(void)
{
return 0;
}
static inline void __exit nvme_exit_auth(void)
{
}
static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
{

View File

@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kstrtox.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/module.h>
@ -108,7 +109,7 @@ struct nvme_dev;
struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
static void nvme_delete_io_queues(struct nvme_dev *dev);
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
@ -130,7 +131,6 @@ struct nvme_dev {
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
struct work_struct remove_work;
struct mutex shutdown_lock;
bool subsystem;
u64 cmb_size;
@ -158,8 +158,6 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
bool attrs_added;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@ -241,10 +239,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
return dev->nr_allocated_queues * 8 * dev->db_stride;
}
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
unsigned int mem_size = nvme_dbbuf_size(dev);
if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP))
return;
if (dev->dbbuf_dbs) {
/*
* Clear the dbbuf memory so the driver doesn't observe stale
@ -252,25 +253,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
*/
memset(dev->dbbuf_dbs, 0, mem_size);
memset(dev->dbbuf_eis, 0, mem_size);
return 0;
return;
}
dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
&dev->dbbuf_dbs_dma_addr,
GFP_KERNEL);
if (!dev->dbbuf_dbs)
return -ENOMEM;
goto fail;
dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
&dev->dbbuf_eis_dma_addr,
GFP_KERNEL);
if (!dev->dbbuf_eis) {
dma_free_coherent(dev->dev, mem_size,
dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
dev->dbbuf_dbs = NULL;
return -ENOMEM;
}
if (!dev->dbbuf_eis)
goto fail_free_dbbuf_dbs;
return;
return 0;
fail_free_dbbuf_dbs:
dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs,
dev->dbbuf_dbs_dma_addr);
dev->dbbuf_dbs = NULL;
fail:
dev_warn(dev->dev, "unable to allocate dma for dbbuf\n");
}
static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
@ -392,18 +395,10 @@ static int nvme_pci_npages_sgl(void)
PAGE_SIZE);
}
static size_t nvme_pci_iod_alloc_size(void)
{
size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
return sizeof(__le64 *) * npages +
sizeof(struct scatterlist) * NVME_MAX_SEGS;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[0];
WARN_ON(hctx_idx != 0);
@ -416,7 +411,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
@ -428,7 +423,7 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set,
struct request *req, unsigned int hctx_idx,
unsigned int numa_node)
{
struct nvme_dev *dev = set->driver_data;
struct nvme_dev *dev = to_nvme_dev(set->driver_data);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
nvme_req(req)->ctrl = &dev->ctrl;
@ -447,7 +442,7 @@ static int queue_irq_offset(struct nvme_dev *dev)
static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_dev *dev = set->driver_data;
struct nvme_dev *dev = to_nvme_dev(set->driver_data);
int i, qoff, offset;
offset = queue_irq_offset(dev);
@ -914,7 +909,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
goto out_unmap_data;
}
blk_mq_start_request(req);
nvme_start_request(req);
return BLK_STS_OK;
out_unmap_data:
nvme_unmap_data(dev, req);
@ -1474,24 +1469,21 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
}
}
/**
* nvme_suspend_queue - put queue into suspended state
* @nvmeq: queue to suspend
*/
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid)
{
struct nvme_queue *nvmeq = &dev->queues[qid];
if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
return 1;
return;
/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
mb();
nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
nvme_stop_admin_queue(&nvmeq->dev->ctrl);
nvme_quiesce_admin_queue(&nvmeq->dev->ctrl);
if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
return 0;
pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq);
}
static void nvme_suspend_io_queues(struct nvme_dev *dev)
@ -1499,19 +1491,7 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev)
int i;
for (i = dev->ctrl.queue_count - 1; i > 0; i--)
nvme_suspend_queue(&dev->queues[i]);
}
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
struct nvme_queue *nvmeq = &dev->queues[0];
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
nvme_disable_ctrl(&dev->ctrl);
nvme_poll_irqdisable(nvmeq);
nvme_suspend_queue(dev, i);
}
/*
@ -1748,44 +1728,11 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
* user requests may be waiting on a stopped queue. Start the
* queue to flush these to completion.
*/
nvme_start_admin_queue(&dev->ctrl);
blk_mq_destroy_queue(dev->ctrl.admin_q);
blk_mq_free_tag_set(&dev->admin_tagset);
nvme_unquiesce_admin_queue(&dev->ctrl);
nvme_remove_admin_tag_set(&dev->ctrl);
}
}
static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev)
{
struct blk_mq_tag_set *set = &dev->admin_tagset;
set->ops = &nvme_mq_admin_ops;
set->nr_hw_queues = 1;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->timeout = NVME_ADMIN_TIMEOUT;
set->numa_node = dev->ctrl.numa_node;
set->cmd_size = sizeof(struct nvme_iod);
set->flags = BLK_MQ_F_NO_SCHED;
set->driver_data = dev;
if (blk_mq_alloc_tag_set(set))
return -ENOMEM;
dev->ctrl.admin_tagset = set;
dev->ctrl.admin_q = blk_mq_init_queue(set);
if (IS_ERR(dev->ctrl.admin_q)) {
blk_mq_free_tag_set(set);
dev->ctrl.admin_q = NULL;
return -ENOMEM;
}
if (!blk_get_queue(dev->ctrl.admin_q)) {
nvme_dev_remove_admin(dev);
dev->ctrl.admin_q = NULL;
return -ENODEV;
}
return 0;
}
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
@ -1829,7 +1776,14 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
result = nvme_disable_ctrl(&dev->ctrl);
/*
* If the device has been passed off to us in an enabled state, just
* clear the enabled bit. The spec says we should set the 'shutdown
* notification bits', but doing so may cause the device to complete
* commands to the admin queue ... and we don't know what memory that
* might be pointing at!
*/
result = nvme_disable_ctrl(&dev->ctrl, false);
if (result < 0)
return result;
@ -2112,6 +2066,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
u32 enable_bits = NVME_HOST_MEM_ENABLE;
int ret;
if (!dev->ctrl.hmpre)
return 0;
preferred = min(preferred, max);
if (min > max) {
dev_warn(dev->ctrl.device,
@ -2192,7 +2149,7 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
bool new;
int ret;
if (strtobool(buf, &new) < 0)
if (kstrtobool(buf, &new) < 0)
return -EINVAL;
if (new == ndev->hmb)
@ -2240,11 +2197,17 @@ static struct attribute *nvme_pci_attrs[] = {
NULL,
};
static const struct attribute_group nvme_pci_attr_group = {
static const struct attribute_group nvme_pci_dev_attrs_group = {
.attrs = nvme_pci_attrs,
.is_visible = nvme_pci_attrs_are_visible,
};
static const struct attribute_group *nvme_pci_dev_attr_groups[] = {
&nvme_dev_attrs_group,
&nvme_pci_dev_attrs_group,
NULL,
};
/*
* nirqs is the number of interrupts available for write and read
* queues. The core already reserved an interrupt for the admin queue.
@ -2319,12 +2282,6 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
static void nvme_disable_io_queues(struct nvme_dev *dev)
{
if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
/*
@ -2432,7 +2389,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (dev->online_queues - 1 < dev->max_qid) {
nr_io_queues = dev->online_queues - 1;
nvme_disable_io_queues(dev);
nvme_delete_io_queues(dev);
result = nvme_setup_io_queues_trylock(dev);
if (result)
return result;
@ -2495,7 +2452,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
return 0;
}
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode)
{
int nr_queues = dev->online_queues - 1, sent = 0;
unsigned long timeout;
@ -2523,40 +2480,19 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
return true;
}
static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
static void nvme_delete_io_queues(struct nvme_dev *dev)
{
struct blk_mq_tag_set * set = &dev->tagset;
int ret;
if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq))
__nvme_delete_io_queues(dev, nvme_admin_delete_cq);
}
set->ops = &nvme_mq_ops;
set->nr_hw_queues = dev->online_queues - 1;
set->nr_maps = 1;
if (dev->io_queues[HCTX_TYPE_READ])
set->nr_maps = 2;
static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev)
{
if (dev->io_queues[HCTX_TYPE_POLL])
set->nr_maps = 3;
set->timeout = NVME_IO_TIMEOUT;
set->numa_node = dev->ctrl.numa_node;
set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
set->cmd_size = sizeof(struct nvme_iod);
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->driver_data = dev;
/*
* Some Apple controllers requires tags to be unique
* across admin and IO queue, so reserve the first 32
* tags of the IO queue.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
set->reserved_tags = NVME_AQ_DEPTH;
ret = blk_mq_alloc_tag_set(set);
if (ret) {
dev_warn(dev->ctrl.device,
"IO queues tagset allocation failed %d\n", ret);
return;
}
dev->ctrl.tagset = set;
return 3;
if (dev->io_queues[HCTX_TYPE_READ])
return 2;
return 1;
}
static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
@ -2647,7 +2583,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
return 0;
return nvme_pci_configure_admin_queue(dev);
disable:
pci_disable_device(pdev);
@ -2661,57 +2598,53 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
pci_release_mem_regions(to_pci_dev(dev->dev));
}
static void nvme_pci_disable(struct nvme_dev *dev)
static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
u32 csts;
pci_free_irq_vectors(pdev);
if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev))
return true;
if (pdev->error_state != pci_channel_io_normal)
return true;
if (pci_is_enabled(pdev)) {
pci_disable_pcie_error_reporting(pdev);
pci_disable_device(pdev);
}
csts = readl(dev->bar + NVME_REG_CSTS);
return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY);
}
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
bool dead = true, freeze = false;
struct pci_dev *pdev = to_pci_dev(dev->dev);
bool dead;
mutex_lock(&dev->shutdown_lock);
if (pci_is_enabled(pdev)) {
u32 csts;
if (pci_device_is_present(pdev))
csts = readl(dev->bar + NVME_REG_CSTS);
else
csts = ~0;
if (dev->ctrl.state == NVME_CTRL_LIVE ||
dev->ctrl.state == NVME_CTRL_RESETTING) {
freeze = true;
dead = nvme_pci_ctrl_is_dead(dev);
if (dev->ctrl.state == NVME_CTRL_LIVE ||
dev->ctrl.state == NVME_CTRL_RESETTING) {
if (pci_is_enabled(pdev))
nvme_start_freeze(&dev->ctrl);
}
dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
pdev->error_state != pci_channel_io_normal);
/*
* Give the controller a chance to complete all entered requests
* if doing a safe shutdown.
*/
if (!dead && shutdown)
nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
}
/*
* Give the controller a chance to complete all entered requests if
* doing a safe shutdown.
*/
if (!dead && shutdown && freeze)
nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
nvme_stop_queues(&dev->ctrl);
nvme_quiesce_io_queues(&dev->ctrl);
if (!dead && dev->ctrl.queue_count > 0) {
nvme_disable_io_queues(dev);
nvme_disable_admin_queue(dev, shutdown);
nvme_delete_io_queues(dev);
nvme_disable_ctrl(&dev->ctrl, shutdown);
nvme_poll_irqdisable(&dev->queues[0]);
}
nvme_suspend_io_queues(dev);
nvme_suspend_queue(&dev->queues[0]);
nvme_pci_disable(dev);
nvme_suspend_queue(dev, 0);
pci_free_irq_vectors(pdev);
if (pci_is_enabled(pdev)) {
pci_disable_pcie_error_reporting(pdev);
pci_disable_device(pdev);
}
nvme_reap_pending_cqes(dev);
nvme_cancel_tagset(&dev->ctrl);
@ -2723,9 +2656,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
* deadlocking blk-mq hot-cpu notifier.
*/
if (shutdown) {
nvme_start_queues(&dev->ctrl);
nvme_unquiesce_io_queues(&dev->ctrl);
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
nvme_start_admin_queue(&dev->ctrl);
nvme_unquiesce_admin_queue(&dev->ctrl);
}
mutex_unlock(&dev->shutdown_lock);
}
@ -2762,42 +2695,40 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
dma_pool_destroy(dev->prp_small_pool);
}
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
size_t alloc_size = sizeof(__le64 *) * npages +
sizeof(struct scatterlist) * NVME_MAX_SEGS;
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
dev->iod_mempool = mempool_create_node(1,
mempool_kmalloc, mempool_kfree,
(void *)alloc_size, GFP_KERNEL,
dev_to_node(dev->dev));
if (!dev->iod_mempool)
return -ENOMEM;
return 0;
}
static void nvme_free_tagset(struct nvme_dev *dev)
{
if (dev->tagset.tags)
blk_mq_free_tag_set(&dev->tagset);
nvme_remove_io_tag_set(&dev->ctrl);
dev->ctrl.tagset = NULL;
}
/* pairs with nvme_pci_alloc_dev */
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
nvme_dbbuf_dma_free(dev);
nvme_free_tagset(dev);
if (dev->ctrl.admin_q)
blk_put_queue(dev->ctrl.admin_q);
free_opal_dev(dev->ctrl.opal_dev);
mempool_destroy(dev->iod_mempool);
put_device(dev->dev);
kfree(dev->queues);
kfree(dev);
}
static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
/*
* Set state to deleting now to avoid blocking nvme_wait_reset(), which
* may be holding this pci_dev's device lock.
*/
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_get_ctrl(&dev->ctrl);
nvme_dev_disable(dev, false);
nvme_kill_queues(&dev->ctrl);
if (!queue_work(nvme_wq, &dev->remove_work))
nvme_put_ctrl(&dev->ctrl);
}
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev =
@ -2808,8 +2739,7 @@ static void nvme_reset_work(struct work_struct *work)
if (dev->ctrl.state != NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
dev->ctrl.state);
result = -ENODEV;
goto out;
return;
}
/*
@ -2824,34 +2754,7 @@ static void nvme_reset_work(struct work_struct *work)
result = nvme_pci_enable(dev);
if (result)
goto out_unlock;
result = nvme_pci_configure_admin_queue(dev);
if (result)
goto out_unlock;
if (!dev->ctrl.admin_q) {
result = nvme_pci_alloc_admin_tag_set(dev);
if (result)
goto out_unlock;
} else {
nvme_start_admin_queue(&dev->ctrl);
}
dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
/*
* Limit the max command size to prevent iod->sg allocations going
* over a single page.
*/
dev->ctrl.max_hw_sectors = min_t(u32,
NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS;
/*
* Don't limit the IOMMU merged segment size.
*/
dma_set_max_seg_size(dev->dev, 0xffffffff);
nvme_unquiesce_admin_queue(&dev->ctrl);
mutex_unlock(&dev->shutdown_lock);
/*
@ -2865,62 +2768,37 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
/*
* We do not support an SGL for metadata (yet), so we are limited to a
* single integrity segment for the separate metadata pointer.
*/
dev->ctrl.max_integrity_segments = 1;
result = nvme_init_ctrl_finish(&dev->ctrl);
result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
if (result)
goto out;
if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
if (!dev->ctrl.opal_dev)
dev->ctrl.opal_dev =
init_opal_dev(&dev->ctrl, &nvme_sec_submit);
else if (was_suspend)
opal_unlock_from_suspend(dev->ctrl.opal_dev);
} else {
free_opal_dev(dev->ctrl.opal_dev);
dev->ctrl.opal_dev = NULL;
}
nvme_dbbuf_dma_alloc(dev);
if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
result = nvme_dbbuf_dma_alloc(dev);
if (result)
dev_warn(dev->dev,
"unable to allocate dma for dbbuf\n");
}
if (dev->ctrl.hmpre) {
result = nvme_setup_host_mem(dev);
if (result < 0)
goto out;
}
result = nvme_setup_host_mem(dev);
if (result < 0)
goto out;
result = nvme_setup_io_queues(dev);
if (result)
goto out;
/*
* Keep the controller around but remove all namespaces if we don't have
* any working I/O queue.
* Freeze and update the number of I/O queues as thos might have
* changed. If there are no I/O queues left after this reset, keep the
* controller around but remove all namespaces.
*/
if (dev->online_queues < 2) {
dev_warn(dev->ctrl.device, "IO queues not created\n");
nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_free_tagset(dev);
} else {
nvme_start_queues(&dev->ctrl);
if (dev->online_queues > 1) {
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
if (!dev->ctrl.tagset)
nvme_pci_alloc_tag_set(dev);
else
nvme_pci_update_nr_queues(dev);
nvme_pci_update_nr_queues(dev);
nvme_dbbuf_set(dev);
nvme_unfreeze(&dev->ctrl);
} else {
dev_warn(dev->ctrl.device, "IO queues lost\n");
nvme_mark_namespaces_dead(&dev->ctrl);
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_free_tagset(dev);
}
/*
@ -2934,30 +2812,22 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
&nvme_pci_attr_group))
dev->attrs_added = true;
nvme_start_ctrl(&dev->ctrl);
return;
out_unlock:
mutex_unlock(&dev->shutdown_lock);
out:
if (result)
dev_warn(dev->ctrl.device,
"Removing after probe failure status: %d\n", result);
nvme_remove_dead_ctrl(dev);
}
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pci_get_drvdata(pdev))
device_release_driver(&pdev->dev);
nvme_put_ctrl(&dev->ctrl);
/*
* Set state to deleting now to avoid blocking nvme_wait_reset(), which
* may be holding this pci_dev's device lock.
*/
dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
result);
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_dev_disable(dev, true);
nvme_mark_namespaces_dead(&dev->ctrl);
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
}
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
@ -3010,6 +2880,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
.flags = NVME_F_METADATA_SUPPORTED,
.dev_attr_groups = nvme_pci_dev_attr_groups,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
@ -3079,29 +2950,22 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
return 0;
}
static void nvme_async_probe(void *data, async_cookie_t cookie)
static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
const struct pci_device_id *id)
{
struct nvme_dev *dev = data;
flush_work(&dev->ctrl.reset_work);
flush_work(&dev->ctrl.scan_work);
nvme_put_ctrl(&dev->ctrl);
}
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
struct nvme_dev *dev;
unsigned long quirks = id->driver_data;
size_t alloc_size;
int node = dev_to_node(&pdev->dev);
struct nvme_dev *dev;
int ret = -ENOMEM;
node = dev_to_node(&pdev->dev);
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, first_memory_node);
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
return NULL;
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
mutex_init(&dev->shutdown_lock);
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
@ -3109,25 +2973,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev->queues = kcalloc_node(dev->nr_allocated_queues,
sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
goto free;
goto out_free_dev;
dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
result = nvme_dev_map(dev);
if (result)
goto put_pci;
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
mutex_init(&dev->shutdown_lock);
result = nvme_setup_prp_pools(dev);
if (result)
goto unmap;
quirks |= check_vendor_combination_bug(pdev);
if (!noacpi && acpi_storage_d3(&pdev->dev)) {
/*
* Some systems use a bios work around to ask for D3 on
@ -3137,46 +2987,131 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
"platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
quirks);
if (ret)
goto out_put_device;
dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1);
dma_set_max_seg_size(&pdev->dev, 0xffffffff);
/*
* Double check that our mempool alloc size will cover the biggest
* command we support.
* Limit the max command size to prevent iod->sg allocations going
* over a single page.
*/
alloc_size = nvme_pci_iod_alloc_size();
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
dev->ctrl.max_hw_sectors = min_t(u32,
NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS;
dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
mempool_kfree,
(void *) alloc_size,
GFP_KERNEL, node);
if (!dev->iod_mempool) {
result = -ENOMEM;
goto release_pools;
}
/*
* There is no support for SGLs for metadata (yet), so we are limited to
* a single integrity segment for the separate metadata pointer.
*/
dev->ctrl.max_integrity_segments = 1;
return dev;
result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
quirks);
out_put_device:
put_device(dev->dev);
kfree(dev->queues);
out_free_dev:
kfree(dev);
return ERR_PTR(ret);
}
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct nvme_dev *dev;
int result = -ENOMEM;
dev = nvme_pci_alloc_dev(pdev, id);
if (!dev)
return -ENOMEM;
result = nvme_dev_map(dev);
if (result)
goto release_mempool;
goto out_uninit_ctrl;
result = nvme_setup_prp_pools(dev);
if (result)
goto out_dev_unmap;
result = nvme_pci_alloc_iod_mempool(dev);
if (result)
goto out_release_prp_pools;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
nvme_reset_ctrl(&dev->ctrl);
async_schedule(nvme_async_probe, dev);
result = nvme_pci_enable(dev);
if (result)
goto out_release_iod_mempool;
result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
&nvme_mq_admin_ops, sizeof(struct nvme_iod));
if (result)
goto out_disable;
/*
* Mark the controller as connecting before sending admin commands to
* allow the timeout handler to do the right thing.
*/
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(dev->ctrl.device,
"failed to mark controller CONNECTING\n");
result = -EBUSY;
goto out_disable;
}
result = nvme_init_ctrl_finish(&dev->ctrl, false);
if (result)
goto out_disable;
nvme_dbbuf_dma_alloc(dev);
result = nvme_setup_host_mem(dev);
if (result < 0)
goto out_disable;
result = nvme_setup_io_queues(dev);
if (result)
goto out_disable;
if (dev->online_queues > 1) {
nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
nvme_dbbuf_set(dev);
}
if (!dev->ctrl.tagset)
dev_warn(dev->ctrl.device, "IO queues not created\n");
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device,
"failed to mark controller live state\n");
result = -ENODEV;
goto out_disable;
}
pci_set_drvdata(pdev, dev);
nvme_start_ctrl(&dev->ctrl);
nvme_put_ctrl(&dev->ctrl);
return 0;
release_mempool:
out_disable:
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_dev_disable(dev, true);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
release_pools:
out_release_prp_pools:
nvme_release_prp_pools(dev);
unmap:
out_dev_unmap:
nvme_dev_unmap(dev);
put_pci:
put_device(dev->dev);
free:
kfree(dev->queues);
kfree(dev);
out_uninit_ctrl:
nvme_uninit_ctrl(&dev->ctrl);
return result;
}
@ -3208,13 +3143,6 @@ static void nvme_shutdown(struct pci_dev *pdev)
nvme_disable_prepare_reset(dev, true);
}
static void nvme_remove_attrs(struct nvme_dev *dev)
{
if (dev->attrs_added)
sysfs_remove_group(&dev->ctrl.device->kobj,
&nvme_pci_attr_group);
}
/*
* The driver's remove may be called on a device in a partially initialized
* state. This function must not have any dependencies on the device state in
@ -3236,10 +3164,11 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_stop_ctrl(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_dev_disable(dev, true);
nvme_remove_attrs(dev);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
mempool_destroy(dev->iod_mempool);
nvme_release_prp_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
@ -3576,11 +3505,12 @@ static struct pci_driver nvme_driver = {
.probe = nvme_probe,
.remove = nvme_remove,
.shutdown = nvme_shutdown,
#ifdef CONFIG_PM_SLEEP
.driver = {
.pm = &nvme_dev_pm_ops,
},
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
#ifdef CONFIG_PM_SLEEP
.pm = &nvme_dev_pm_ops,
#endif
},
.sriov_configure = pci_sriov_configure_simple,
.err_handler = &nvme_err_handler,
};

Some files were not shown because too many files have changed in this diff Show More